MarginaliaSearch/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
Viktor Lofgren d84a2c183f Remove the crawl spec abstraction
The crawl spec abstraction was used to upload lists of domains into the system for future crawling.  This was fairly clunky, and it was difficult to understand what was going to be crawled.

For some time now, the control view has had a domains listing view that allows direct access to the domains table.  This is much preferred, and means the operator can manage domains directly without specs.

This commit removes the crawl spec abstraction from the code and points the GUI at the domains list instead.
2024-10-03 13:41:17 +02:00

package nu.marginalia.converting;

import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.model.html.HtmlStandard;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Path;
import java.time.LocalTime;
import java.util.*;

import static org.junit.jupiter.api.Assertions.*;
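
/** Integration tests for {@link DomainProcessor}, exercising the converter against
 *  an empty domain and a crawl snapshot of memex.marginalia.nu bundled as
 *  classpath test resources. */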
public class ConvertingIntegrationTest {

    private DomainProcessor domainProcessor;

    @BeforeEach
    public void setUp() {
        Injector injector = Guice.createInjector(
                new ConvertingIntegrationTestModule()
        );
        domainProcessor = injector.getInstance(DomainProcessor.class);
    }
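
    /** A domain with no documents should still convert cleanly: ACTIVE state, empty document list. */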
    @Test
    public void testEmptyDomain() {
        var docs = new ArrayList<CrawledDocument>();
        var domain = new CrawledDomain("memex.marginalia.nu", null, "OK", "-", "127.0.0.1",
                docs, Collections.emptyList());
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(domain));

        assertEquals(ret.state, DomainIndexingState.ACTIVE);
        assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
        assertTrue(ret.documents.isEmpty());
    }
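
    /** The publication year packed into the document metadata must agree with the
     *  separately stored pubYear field for every fully processed document. */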
    @Test
    public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));

        ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> {
            int year = PubDate.fromYearByte(doc.details.metadata.year());
            Integer yearMeta = doc.details.pubYear;
            if (yearMeta != null) {
                assertEquals(year, (int) yearMeta, doc.url.toString());
            }
        });
    }
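
    /** Full processing of the memex working set: the domain should come out ACTIVE,
     *  with more than 25 OK documents carrying non-trivial titles and descriptions. */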
    @Test
    public void testMemexMarginaliaNuFullProcessing() throws IOException {
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
        assertNotNull(ret);

        assertEquals(ret.state, DomainIndexingState.ACTIVE);
        assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
        assertFalse(ret.documents.isEmpty());

        Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
        ret.documents.forEach(doc -> {
            resultsByStatusCount.merge(doc.state, 1, Integer::sum);
        });

        assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);

        for (var doc : ret.documents) {
            if (!doc.isProcessedFully()) {
                continue;
            }

            var details = doc.details;
            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
            assertEquals(HtmlStandard.HTML5, details.standard);
        }
    }
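
    /** Sideload processing exposes the converted documents as a stream rather than a
     *  fully materialized list; the assertions mirror the full-processing test. */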
    @Test
    public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
        var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100);
        assertNotNull(ret);

        assertEquals("memex.marginalia.nu", ret.id());

        var domain = ret.getDomain();
        assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu"));

        List<ProcessedDocument> docsAll = new ArrayList<>();
        Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
        ret.getDocumentsStream().forEachRemaining(docsAll::add);

        assertTrue(docsAll.size() > 25);

        docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum));
        assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);

        for (var doc : docsAll) {
            if (!doc.isProcessedFully()) {
                continue;
            }

            var details = doc.details;
            assertTrue(details.metadata.size() > 0);
            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
            assertEquals(HtmlStandard.HTML5, details.standard);
        }
    }
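
    /** Builds a CrawledDomain from the memex-marginalia test fixture: an index file on
     *  the classpath lists the HTML documents that make up the crawl snapshot. */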
    private CrawledDomain readMarginaliaWorkingSet() throws IOException {
        String index = readClassPathFile("memex-marginalia/index");
        String[] files = index.split("\n");

        var docs = new ArrayList<CrawledDocument>();

        for (String file : files) {
            Path p = Path.of("memex-marginalia/").resolve(file);

            var doc = new CrawledDocument("1",
                    "https://memex.marginalia.nu/" + file,
                    "text/html",
                    LocalTime.now().toString(),
                    200,
                    "OK",
                    "",
                    "",
                    readClassPathFile(p.toString()),
                    Double.toString(Math.random()),
                    "https://memex.marginalia.nu/" + file,
                    null,
                    "",
                    false,
                    null,
                    null
            );
            docs.add(doc);
        }

        return new CrawledDomain(
                "memex.marginalia.nu",
                null,
                "OK",
                "",
                "127.0.0.1",
                docs, Collections.emptyList());
    }
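
    /** Reads a classpath resource into a String, failing fast if the resource is missing. */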
    private String readClassPathFile(String s) throws IOException {
        return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes());
    }
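
    /** Wraps a CrawledDomain and its documents in the stream shape the processor consumes. */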
    private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) {
        List<SerializableCrawlData> data = new ArrayList<>();
        data.add(domain);

        if (domain.doc != null) {
            data.addAll(domain.doc);
        }

        return SerializableCrawlDataStream.fromIterator(data.iterator());
    }
}