MarginaliaSearch/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java

package nu.marginalia.livecrawler;

import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawldata.CrawledDocument;
import org.apache.commons.io.FileUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.List;

/**
 * Tests for {@link SimpleLinkScraper#retrieveNow}, run against a
 * {@link LiveCrawlDataSet} stored in a per-test temporary directory.
 *
 * <p>NOTE(review): {@link #testRetrieveNow()} appears to perform a live fetch of
 * {@code https://www.marginalia.nu/} — confirm whether network access is expected
 * where this suite runs.
 */
class SimpleLinkScraperTest {
    private Path tempDir;
    private LiveCrawlDataSet dataSet;

    /**
     * Creates a fresh temporary directory and opens a data set inside it.
     *
     * @throws IOException  if the temporary directory cannot be created
     * @throws SQLException if the data set cannot be opened
     */
    @BeforeEach
    public void setUp() throws IOException, SQLException {
        tempDir = Files.createTempDirectory(getClass().getSimpleName());
        dataSet = new LiveCrawlDataSet(tempDir);
    }


    /**
     * Closes the data set and removes the temporary directory.
     *
     * <p>The directory is deleted in a {@code finally} block so that it is cleaned up
     * even when closing the data set throws. Both fields are null-checked so that a
     * partially completed {@link #setUp()} does not surface here as a
     * {@link NullPointerException} that hides the original failure.
     */
    @AfterEach
    public void tearDown() throws Exception {
        try {
            if (dataSet != null) {
                dataSet.close();
            }
        } finally {
            if (tempDir != null) {
                FileUtils.deleteDirectory(tempDir.toFile());
            }
        }
    }

    /**
     * Retrieving a URL that is not yet in the data set should report one fetched
     * document, and that document should then be readable back from the data set.
     */
    @Test
    public void testRetrieveNow() throws Exception {
        var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
        int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
        Assertions.assertEquals(1, fetched, "expected exactly one document to be fetched");

        var streams = dataSet.getDataStreams();
        Assertions.assertEquals(1, streams.size(), "expected exactly one data stream");

        SerializableCrawlDataStream firstStream = streams.iterator().next();
        Assertions.assertTrue(firstStream.hasNext(), "expected the data stream to be non-empty");

        List<CrawledDocument> documents = firstStream.docsAsList();
        Assertions.assertEquals(1, documents.size(), "expected exactly one stored document");
        Assertions.assertTrue(documents.getFirst().documentBody().startsWith("<!doctype"),
                "expected the stored body to start with a doctype declaration");
    }


    /**
     * Retrieving a URL that has already been saved to the data set should report
     * zero fetched documents.
     */
    @Test
    public void testRetrieveNow_Redundant() throws Exception {
        dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");
        var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));

        // If the requested URL is already in the dataSet, retrieveNow should short-circuit and not fetch anything
        int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
        Assertions.assertEquals(0, fetched, "expected no fetch for a URL already in the data set");
    }
}
(live-crawler) Add Accept-Encoding: gzip to outbound requests This change adds `Accept-Encoding: gzip` to all outbound requests from the live crawler and feed fetcher, and the corresponding decoding logic for the compressed response data. The change addresses issue #136, save for making the fetcher's requests conditional. 2024-12-27 02:59:34 +00:00			`package nu.marginalia.livecrawler;`

			`import nu.marginalia.db.DomainBlacklistImpl;`
			`import nu.marginalia.io.SerializableCrawlDataStream;`
			`import nu.marginalia.model.EdgeDomain;`
(live-crawler) Improve live crawler short-circuit logic We should not wait until we've fetched robots.txt to decide whether we have any data to fetch! This makes the live crawler very slow and leads to unnecessary requests. 2024-12-27 19:54:42 +00:00			`import nu.marginalia.model.EdgeUrl;`
(live-crawler) Add Accept-Encoding: gzip to outbound requests This change adds `Accept-Encoding: gzip` to all outbound requests from the live crawler and feed fetcher, and the corresponding decoding logic for the compressed response data. The change addresses issue #136, save for making the fetcher's requests conditional. 2024-12-27 02:59:34 +00:00			`import nu.marginalia.model.crawldata.CrawledDocument;`
			`import org.apache.commons.io.FileUtils;`
			`import org.junit.jupiter.api.AfterEach;`
			`import org.junit.jupiter.api.Assertions;`
			`import org.junit.jupiter.api.BeforeEach;`
			`import org.junit.jupiter.api.Test;`
			`import org.mockito.Mockito;`

			`import java.io.IOException;`
			`import java.nio.file.Files;`
			`import java.nio.file.Path;`
			`import java.sql.SQLException;`
			`import java.util.List;`

			`class SimpleLinkScraperTest {`
			`private Path tempDir;`
			`private LiveCrawlDataSet dataSet;`

			`@BeforeEach`
			`public void setUp() throws IOException, SQLException {`
			`tempDir = Files.createTempDirectory(getClass().getSimpleName());`
			`dataSet = new LiveCrawlDataSet(tempDir);`
			`}`


			`@AfterEach`
			`public void tearDown() throws Exception {`
			`dataSet.close();`
			`FileUtils.deleteDirectory(tempDir.toFile());`
			`}`

			`@Test`
			`public void testRetrieveNow() throws Exception {`
			`var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));`
(live-crawler) Improve live crawler short-circuit logic We should not wait until we've fetched robots.txt to decide whether we have any data to fetch! This makes the live crawler very slow and leads to unnecessary requests. 2024-12-27 19:54:42 +00:00			`int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));`
			`Assertions.assertEquals(1, fetched);`
(live-crawler) Add Accept-Encoding: gzip to outbound requests This change adds `Accept-Encoding: gzip` to all outbound requests from the live crawler and feed fetcher, and the corresponding decoding logic for the compressed response data. The change addresses issue #136, save for making the fetcher's requests conditional. 2024-12-27 02:59:34 +00:00
			`var streams = dataSet.getDataStreams();`
			`Assertions.assertEquals(1, streams.size());`

			`SerializableCrawlDataStream firstStream = streams.iterator().next();`
			`Assertions.assertTrue(firstStream.hasNext());`

(live-crawler) Improve live crawler short-circuit logic We should not wait until we've fetched robots.txt to decide whether we have any data to fetch! This makes the live crawler very slow and leads to unnecessary requests. 2024-12-27 19:54:42 +00:00			`List<CrawledDocument> documents = firstStream.docsAsList();`
			`Assertions.assertEquals(1, documents.size());`
Merge branch 'master' into slop-crawl-data-spike 2025-01-21 11:50:12 +00:00			`Assertions.assertTrue(documents.getFirst().documentBody().startsWith("<!doctype"));`
(live-crawler) Improve live crawler short-circuit logic We should not wait until we've fetched robots.txt to decide whether we have any data to fetch! This makes the live crawler very slow and leads to unnecessary requests. 2024-12-27 19:54:42 +00:00			`}`

(live-crawler) Add Accept-Encoding: gzip to outbound requests This change adds `Accept-Encoding: gzip` to all outbound requests from the live crawler and feed fetcher, and the corresponding decoding logic for the compressed response data. The change addresses issue #136, save for making the fetcher's requests conditional. 2024-12-27 02:59:34 +00:00

(live-crawler) Improve live crawler short-circuit logic We should not wait until we've fetched robots.txt to decide whether we have any data to fetch! This makes the live crawler very slow and leads to unnecessary requests. 2024-12-27 19:54:42 +00:00			`@Test`
			`public void testRetrieveNow_Redundant() throws Exception {`
			`dataSet.saveDocument(1, new EdgeUrl("https://www.marginalia.nu/"), "<html>", "", "127.0.0.1");`
			`var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));`
(live-crawler) Add Accept-Encoding: gzip to outbound requests This change adds `Accept-Encoding: gzip` to all outbound requests from the live crawler and feed fetcher, and the corresponding decoding logic for the compressed response data. The change addresses issue #136, save for making the fetcher's requests conditional. 2024-12-27 02:59:34 +00:00
(live-crawler) Improve live crawler short-circuit logic We should not wait until we've fetched robots.txt to decide whether we have any data to fetch! This makes the live crawler very slow and leads to unnecessary requests. 2024-12-27 19:54:42 +00:00			`// If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything`
			`int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));`
			`Assertions.assertEquals(0, fetched);`
(live-crawler) Add Accept-Encoding: gzip to outbound requests This change adds `Accept-Encoding: gzip` to all outbound requests from the live crawler and feed fetcher, and the corresponding decoding logic for the compressed response data. The change addresses issue #136, save for making the fetcher's requests conditional. 2024-12-27 02:59:34 +00:00			`}`
			`}`