MarginaliaSearch/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java
Viktor Lofgren 0ca43f0c9c (live-crawler) Improve live crawler short-circuit logic
We should not wait until we've fetched robots.txt to decide whether we have any data to fetch!  This makes the live crawler very slow and leads to unnecessary requests.
2024-12-27 20:54:42 +01:00

66 lines
2.4 KiB
Java

package nu.marginalia.livecrawler;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.List;
class SimpleLinkScraperTest {
private Path tempDir;
private LiveCrawlDataSet dataSet;
@BeforeEach
public void setUp() throws IOException, SQLException {
tempDir = Files.createTempDirectory(getClass().getSimpleName());
dataSet = new LiveCrawlDataSet(tempDir);
}
@AfterEach
public void tearDown() throws Exception {
dataSet.close();
FileUtils.deleteDirectory(tempDir.toFile());
}
@Test
public void testRetrieveNow() throws Exception {
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
Assertions.assertEquals(1, fetched);
var streams = dataSet.getDataStreams();
Assertions.assertEquals(1, streams.size());
SerializableCrawlDataStream firstStream = streams.iterator().next();
Assertions.assertTrue(firstStream.hasNext());
List<CrawledDocument> documents = firstStream.docsAsList();
Assertions.assertEquals(1, documents.size());
Assertions.assertTrue(documents.getFirst().documentBody.startsWith("<!doctype"));
}
@Test
public void testRetrieveNow_Redundant() throws Exception {
dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
// If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything
int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
Assertions.assertEquals(0, fetched);
}
}