mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Basic test coverage for sideloading-style processing
This commit is contained in:
parent
24051fec03
commit
b37223c053
@ -75,7 +75,7 @@ public class DomainProcessor {
|
|||||||
return fullProcessing(domain);
|
return fullProcessing(domain);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ConverterBatchWritableIf sideloadProcessing(SerializableCrawlDataStream dataStream) {
|
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream) {
|
||||||
try {
|
try {
|
||||||
return new SideloadProcessing(dataStream);
|
return new SideloadProcessing(dataStream);
|
||||||
}
|
}
|
||||||
@ -86,7 +86,7 @@ public class DomainProcessor {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
|
public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
|
||||||
private final SerializableCrawlDataStream dataStream;
|
private final SerializableCrawlDataStream dataStream;
|
||||||
private final ProcessedDomain domain;
|
private final ProcessedDomain domain;
|
||||||
private final DocumentDecorator documentDecorator;
|
private final DocumentDecorator documentDecorator;
|
||||||
@ -97,10 +97,9 @@ public class DomainProcessor {
|
|||||||
SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException {
|
SideloadProcessing(SerializableCrawlDataStream dataStream) throws IOException {
|
||||||
this.dataStream = dataStream;
|
this.dataStream = dataStream;
|
||||||
|
|
||||||
if (!dataStream.hasNext()) {
|
if (!dataStream.hasNext()
|
||||||
throw new IllegalStateException("No data in stream");
|
|| !(dataStream.next() instanceof CrawledDomain crawledDomain))
|
||||||
}
|
{
|
||||||
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
|
|
||||||
throw new IllegalStateException("First record must be a domain");
|
throw new IllegalStateException("First record must be a domain");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -126,10 +125,11 @@ public class DomainProcessor {
|
|||||||
@Override
|
@Override
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
try {
|
try {
|
||||||
while (next != null
|
while (next == null
|
||||||
&& dataStream.hasNext()
|
&& dataStream.hasNext())
|
||||||
&& dataStream.next() instanceof CrawledDocument doc)
|
|
||||||
{
|
{
|
||||||
|
if (!(dataStream.next() instanceof CrawledDocument doc))
|
||||||
|
continue;
|
||||||
if (doc.url == null || !processedUrls.add(doc.url))
|
if (doc.url == null || !processedUrls.add(doc.url))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
@ -63,7 +63,7 @@ public class ConvertingIntegrationTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMemexMarginaliaNu() throws IOException {
|
public void testMemexMarginaliaNuFullProcessing() throws IOException {
|
||||||
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
||||||
assertNotNull(ret);
|
assertNotNull(ret);
|
||||||
assertEquals(ret.state, DomainIndexingState.ACTIVE);
|
assertEquals(ret.state, DomainIndexingState.ACTIVE);
|
||||||
@ -94,6 +94,39 @@ public class ConvertingIntegrationTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
|
||||||
|
var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
|
||||||
|
assertNotNull(ret);
|
||||||
|
assertEquals("memex.marginalia.nu", ret.id());
|
||||||
|
|
||||||
|
var domain = ret.getDomain();
|
||||||
|
assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu"));
|
||||||
|
|
||||||
|
List<ProcessedDocument> docsAll = new ArrayList<>();
|
||||||
|
Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
|
||||||
|
ret.getDocumentsStream().forEachRemaining(docsAll::add);
|
||||||
|
assertTrue(docsAll.size() > 25);
|
||||||
|
|
||||||
|
docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum));
|
||||||
|
|
||||||
|
assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);
|
||||||
|
|
||||||
|
for (var doc : docsAll) {
|
||||||
|
|
||||||
|
if (!doc.isProcessedFully()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
var details = doc.details;
|
||||||
|
|
||||||
|
assertTrue(details.title.length() > 4);
|
||||||
|
assertTrue(details.description.length() > 4);
|
||||||
|
assertEquals(HtmlStandard.HTML5, details.standard);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private CrawledDomain readMarginaliaWorkingSet() throws IOException {
|
private CrawledDomain readMarginaliaWorkingSet() throws IOException {
|
||||||
String index = readClassPathFile("memex-marginalia/index");
|
String index = readClassPathFile("memex-marginalia/index");
|
||||||
String[] files = index.split("\n");
|
String[] files = index.split("\n");
|
||||||
|
Loading…
Reference in New Issue
Block a user