2024-11-19 18:35:01 +00:00
|
|
|
package nu.marginalia.livecrawler;
|
|
|
|
|
|
|
|
import nu.marginalia.model.EdgeUrl;
|
2024-11-20 14:36:25 +00:00
|
|
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
|
|
|
import nu.marginalia.model.crawldata.CrawledDomain;
|
|
|
|
import nu.marginalia.model.crawldata.SerializableCrawlData;
|
|
|
|
import org.apache.commons.io.FileUtils;
|
2024-11-19 18:35:01 +00:00
|
|
|
import org.junit.jupiter.api.Assertions;
|
|
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
|
|
|
|
import java.nio.file.Files;
|
|
|
|
import java.nio.file.Path;
|
2024-11-20 14:36:25 +00:00
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.List;
|
2024-11-19 18:35:01 +00:00
|
|
|
|
|
|
|
public class LiveCrawlDataSetTest {
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testGetDataSet() throws Exception {
|
2024-11-20 14:36:25 +00:00
|
|
|
Path tempDir = Files.createTempDirectory("live-crawl-data-set-test");
|
2024-11-21 23:55:46 +00:00
|
|
|
try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) {
|
2024-11-19 18:35:01 +00:00
|
|
|
|
|
|
|
Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/"));
|
|
|
|
dataSet.saveDocument(
|
|
|
|
1,
|
|
|
|
new EdgeUrl("https://www.example.com/"),
|
|
|
|
"test",
|
|
|
|
"test",
|
|
|
|
"test"
|
|
|
|
);
|
|
|
|
Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/"));
|
2024-11-20 14:36:25 +00:00
|
|
|
|
|
|
|
var streams = dataSet.getDataStreams();
|
|
|
|
Assertions.assertEquals(1, streams.size());
|
|
|
|
var stream = streams.iterator().next();
|
|
|
|
|
|
|
|
List<SerializableCrawlData> data = new ArrayList<>();
|
|
|
|
while (stream.hasNext()) {
|
|
|
|
data.add(stream.next());
|
|
|
|
}
|
|
|
|
|
|
|
|
int dataCount = 0;
|
|
|
|
int domainCount = 0;
|
|
|
|
|
|
|
|
for (var item : data) {
|
|
|
|
switch (item) {
|
|
|
|
case CrawledDomain domain -> {
|
|
|
|
domainCount++;
|
|
|
|
Assertions.assertEquals("www.example.com", domain.getDomain());
|
|
|
|
}
|
|
|
|
case CrawledDocument document -> {
|
|
|
|
dataCount++;
|
|
|
|
Assertions.assertEquals("https://www.example.com/", document.url);
|
|
|
|
Assertions.assertEquals("test", document.documentBody);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Assertions.assertEquals(1, dataCount);
|
|
|
|
Assertions.assertEquals(1, domainCount);
|
2024-11-19 18:35:01 +00:00
|
|
|
}
|
|
|
|
finally {
|
2024-11-20 14:36:25 +00:00
|
|
|
FileUtils.deleteDirectory(tempDir.toFile());
|
2024-11-19 18:35:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-11-21 23:55:46 +00:00
|
|
|
@Test
|
|
|
|
public void testHasUrl() throws Exception {
|
|
|
|
Path tempDir = Files.createTempDirectory("live-crawl-data-set-test");
|
|
|
|
try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) {
|
|
|
|
Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/"));
|
|
|
|
dataSet.saveDocument(
|
|
|
|
1,
|
|
|
|
new EdgeUrl("https://www.example.com/saved"),
|
|
|
|
"test",
|
|
|
|
"test",
|
|
|
|
"test"
|
|
|
|
);
|
|
|
|
Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/saved"));
|
|
|
|
|
|
|
|
dataSet.flagAsBad(new EdgeUrl("https://www.example.com/bad"));
|
|
|
|
|
|
|
|
Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/bad"));
|
|
|
|
|
|
|
|
Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/notPresent"));
|
|
|
|
}
|
|
|
|
finally {
|
|
|
|
FileUtils.deleteDirectory(tempDir.toFile());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-11-19 18:35:01 +00:00
|
|
|
}
|