2024-12-27 02:59:34 +00:00
|
|
|
package nu.marginalia.livecrawler;
|
|
|
|
|
|
|
|
import nu.marginalia.db.DomainBlacklistImpl;
|
|
|
|
import nu.marginalia.io.SerializableCrawlDataStream;
|
|
|
|
import nu.marginalia.model.EdgeDomain;
|
2024-12-27 19:54:42 +00:00
|
|
|
import nu.marginalia.model.EdgeUrl;
|
2024-12-27 02:59:34 +00:00
|
|
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
|
|
|
import org.apache.commons.io.FileUtils;
|
|
|
|
import org.junit.jupiter.api.AfterEach;
|
|
|
|
import org.junit.jupiter.api.Assertions;
|
|
|
|
import org.junit.jupiter.api.BeforeEach;
|
|
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
import org.mockito.Mockito;
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.nio.file.Files;
|
|
|
|
import java.nio.file.Path;
|
|
|
|
import java.sql.SQLException;
|
|
|
|
import java.util.List;
|
|
|
|
|
|
|
|
class SimpleLinkScraperTest {
|
|
|
|
private Path tempDir;
|
|
|
|
private LiveCrawlDataSet dataSet;
|
|
|
|
|
|
|
|
@BeforeEach
|
|
|
|
public void setUp() throws IOException, SQLException {
|
|
|
|
tempDir = Files.createTempDirectory(getClass().getSimpleName());
|
|
|
|
dataSet = new LiveCrawlDataSet(tempDir);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
@AfterEach
|
|
|
|
public void tearDown() throws Exception {
|
|
|
|
dataSet.close();
|
|
|
|
FileUtils.deleteDirectory(tempDir.toFile());
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testRetrieveNow() throws Exception {
|
|
|
|
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
2024-12-27 19:54:42 +00:00
|
|
|
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
|
|
|
Assertions.assertEquals(1, fetched);
|
2024-12-27 02:59:34 +00:00
|
|
|
|
|
|
|
var streams = dataSet.getDataStreams();
|
|
|
|
Assertions.assertEquals(1, streams.size());
|
|
|
|
|
|
|
|
SerializableCrawlDataStream firstStream = streams.iterator().next();
|
|
|
|
Assertions.assertTrue(firstStream.hasNext());
|
|
|
|
|
2024-12-27 19:54:42 +00:00
|
|
|
List<CrawledDocument> documents = firstStream.docsAsList();
|
|
|
|
Assertions.assertEquals(1, documents.size());
|
2025-01-21 11:50:12 +00:00
|
|
|
Assertions.assertTrue(documents.getFirst().documentBody().startsWith("<!doctype"));
|
2024-12-27 19:54:42 +00:00
|
|
|
}
|
|
|
|
|
2024-12-27 02:59:34 +00:00
|
|
|
|
|
|
|
|
2024-12-27 19:54:42 +00:00
|
|
|
@Test
|
|
|
|
public void testRetrieveNow_Redundant() throws Exception {
|
|
|
|
dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
|
|
|
|
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
|
2024-12-27 02:59:34 +00:00
|
|
|
|
2024-12-27 19:54:42 +00:00
|
|
|
// If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything
|
|
|
|
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
|
|
|
|
Assertions.assertEquals(0, fetched);
|
2024-12-27 02:59:34 +00:00
|
|
|
}
|
|
|
|
}
|