2023-03-16 20:35:54 +00:00
|
|
|
package nu.marginalia.converting;
|
|
|
|
|
|
|
|
|
|
|
|
import com.google.inject.Guice;
|
|
|
|
import com.google.inject.Injector;
|
2023-09-14 09:21:44 +00:00
|
|
|
import nu.marginalia.model.html.HtmlStandard;
|
2023-04-14 14:56:49 +00:00
|
|
|
import nu.marginalia.converting.model.ProcessedDocument;
|
2023-03-16 20:35:54 +00:00
|
|
|
import nu.marginalia.converting.processor.DomainProcessor;
|
2023-07-24 14:28:30 +00:00
|
|
|
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
2023-03-16 20:35:54 +00:00
|
|
|
import nu.marginalia.crawling.model.CrawledDocument;
|
|
|
|
import nu.marginalia.crawling.model.CrawledDomain;
|
2023-07-24 13:25:09 +00:00
|
|
|
import nu.marginalia.crawling.model.SerializableCrawlData;
|
2023-03-16 20:35:54 +00:00
|
|
|
import nu.marginalia.model.EdgeDomain;
|
|
|
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
2023-04-14 14:56:49 +00:00
|
|
|
import nu.marginalia.model.crawl.PubDate;
|
2023-03-16 20:35:54 +00:00
|
|
|
import nu.marginalia.model.crawl.UrlIndexingState;
|
|
|
|
import org.junit.jupiter.api.BeforeEach;
|
|
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
|
2023-08-18 09:54:56 +00:00
|
|
|
import java.io.*;
|
2023-03-16 20:35:54 +00:00
|
|
|
import java.nio.file.Path;
|
|
|
|
import java.time.LocalTime;
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
import static org.junit.jupiter.api.Assertions.*;
|
|
|
|
|
|
|
|
public class ConvertingIntegrationTest {
|
|
|
|
|
2023-03-17 15:03:11 +00:00
|
|
|
private DomainProcessor domainProcessor;
|
2023-03-16 20:35:54 +00:00
|
|
|
|
|
|
|
@BeforeEach
|
|
|
|
public void setUp() {
|
|
|
|
Injector injector = Guice.createInjector(
|
|
|
|
new ConvertingIntegrationTestModule()
|
|
|
|
);
|
|
|
|
|
|
|
|
domainProcessor = injector.getInstance(DomainProcessor.class);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testEmptyDomain() {
|
|
|
|
var docs = new ArrayList<CrawledDocument>();
|
|
|
|
|
2023-09-16 16:14:47 +00:00
|
|
|
var domain = new CrawledDomain("memex.marginalia.nu", null, "OK", "-", "127.0.0.1",
|
2023-07-24 13:25:09 +00:00
|
|
|
docs, Collections.emptyList());
|
2023-12-27 17:20:03 +00:00
|
|
|
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(domain));
|
2023-03-16 20:35:54 +00:00
|
|
|
|
|
|
|
assertEquals(ret.state, DomainIndexingState.ACTIVE);
|
|
|
|
assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
|
|
|
|
assertTrue(ret.documents.isEmpty());
|
|
|
|
}
|
2023-04-14 14:56:49 +00:00
|
|
|
@Test
|
|
|
|
public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
|
2023-12-27 17:20:03 +00:00
|
|
|
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
2023-04-14 14:56:49 +00:00
|
|
|
ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> {
|
|
|
|
int year = PubDate.fromYearByte(doc.details.metadata.year());
|
|
|
|
Integer yearMeta = doc.details.pubYear;
|
|
|
|
if (yearMeta != null) {
|
|
|
|
assertEquals(year, (int) yearMeta, doc.url.toString());
|
|
|
|
}
|
|
|
|
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
2023-03-16 20:35:54 +00:00
|
|
|
@Test
|
2023-12-27 17:33:16 +00:00
|
|
|
public void testMemexMarginaliaNuFullProcessing() throws IOException {
|
2023-12-27 17:20:03 +00:00
|
|
|
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
2023-12-15 17:09:53 +00:00
|
|
|
assertNotNull(ret);
|
2023-03-16 20:35:54 +00:00
|
|
|
assertEquals(ret.state, DomainIndexingState.ACTIVE);
|
|
|
|
assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
|
|
|
|
|
|
|
|
assertFalse(ret.documents.isEmpty());
|
|
|
|
|
|
|
|
Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
|
|
|
|
|
|
|
|
ret.documents.forEach(doc -> {
|
|
|
|
resultsByStatusCount.merge(doc.state, 1, Integer::sum);
|
|
|
|
});
|
2023-03-17 15:03:11 +00:00
|
|
|
|
|
|
|
assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);
|
|
|
|
|
|
|
|
for (var doc : ret.documents) {
|
|
|
|
|
2023-12-27 17:33:16 +00:00
|
|
|
if (!doc.isProcessedFully()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
var details = doc.details;
|
|
|
|
|
|
|
|
assertTrue(details.title.length() > 4);
|
|
|
|
assertTrue(details.description.length() > 4);
|
|
|
|
assertEquals(HtmlStandard.HTML5, details.standard);
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
|
2023-12-30 12:05:10 +00:00
|
|
|
var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100);
|
2023-12-27 17:33:16 +00:00
|
|
|
assertNotNull(ret);
|
|
|
|
assertEquals("memex.marginalia.nu", ret.id());
|
|
|
|
|
|
|
|
var domain = ret.getDomain();
|
|
|
|
assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu"));
|
|
|
|
|
|
|
|
List<ProcessedDocument> docsAll = new ArrayList<>();
|
|
|
|
Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
|
|
|
|
ret.getDocumentsStream().forEachRemaining(docsAll::add);
|
|
|
|
assertTrue(docsAll.size() > 25);
|
|
|
|
|
|
|
|
docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum));
|
|
|
|
|
|
|
|
assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);
|
|
|
|
|
|
|
|
for (var doc : docsAll) {
|
|
|
|
|
2023-03-17 15:03:11 +00:00
|
|
|
if (!doc.isProcessedFully()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
var details = doc.details;
|
|
|
|
|
2023-12-27 18:29:26 +00:00
|
|
|
assertTrue(details.metadata.size() > 0);
|
2023-03-17 15:03:11 +00:00
|
|
|
assertTrue(details.title.length() > 4);
|
|
|
|
assertTrue(details.description.length() > 4);
|
|
|
|
assertEquals(HtmlStandard.HTML5, details.standard);
|
|
|
|
}
|
2023-03-16 20:35:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
private CrawledDomain readMarginaliaWorkingSet() throws IOException {
|
|
|
|
String index = readClassPathFile("memex-marginalia/index");
|
|
|
|
String[] files = index.split("\n");
|
|
|
|
|
|
|
|
var docs = new ArrayList<CrawledDocument>();
|
|
|
|
|
|
|
|
for (String file : files) {
|
|
|
|
Path p = Path.of("memex-marginalia/").resolve(file);
|
|
|
|
|
|
|
|
var doc = new CrawledDocument("1",
|
|
|
|
"https://memex.marginalia.nu/" + file,
|
|
|
|
"text/html",
|
|
|
|
LocalTime.now().toString(),
|
|
|
|
200,
|
|
|
|
"OK",
|
|
|
|
"",
|
|
|
|
"",
|
2023-07-24 13:25:09 +00:00
|
|
|
readClassPathFile(p.toString()),
|
2023-03-16 20:35:54 +00:00
|
|
|
Double.toString(Math.random()),
|
|
|
|
"https://memex.marginalia.nu/" + file,
|
2023-07-20 19:05:16 +00:00
|
|
|
null,
|
2023-12-15 17:09:53 +00:00
|
|
|
"",
|
2023-12-18 16:45:54 +00:00
|
|
|
false,
|
|
|
|
null,
|
|
|
|
null
|
2023-03-16 20:35:54 +00:00
|
|
|
);
|
|
|
|
docs.add(doc);
|
|
|
|
}
|
|
|
|
|
|
|
|
return new CrawledDomain(
|
|
|
|
"memex.marginalia.nu",
|
|
|
|
null,
|
|
|
|
"OK",
|
|
|
|
"",
|
|
|
|
"127.0.0.1",
|
|
|
|
docs, Collections.emptyList());
|
|
|
|
}
|
|
|
|
|
|
|
|
private String readClassPathFile(String s) throws IOException {
|
|
|
|
return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes());
|
|
|
|
}
|
|
|
|
|
2023-07-24 13:25:09 +00:00
|
|
|
|
2023-07-24 14:28:30 +00:00
|
|
|
private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) {
|
2023-07-24 13:25:09 +00:00
|
|
|
List<SerializableCrawlData> data = new ArrayList<>();
|
2023-12-27 12:57:59 +00:00
|
|
|
|
|
|
|
data.add(domain);
|
|
|
|
|
2023-07-24 13:25:09 +00:00
|
|
|
if (domain.doc != null) {
|
|
|
|
data.addAll(domain.doc);
|
|
|
|
}
|
2023-12-27 12:57:59 +00:00
|
|
|
|
2023-07-24 14:28:30 +00:00
|
|
|
|
|
|
|
return SerializableCrawlDataStream.fromIterator(data.iterator());
|
2023-07-24 13:25:09 +00:00
|
|
|
}
|
2023-08-18 09:54:56 +00:00
|
|
|
|
2023-03-16 20:35:54 +00:00
|
|
|
}
|