Fix bug in CrawlerRetreiver where the root URL wasn't always added properly to the front of the crawl queue.

Viktor Lofgren 2023-06-27 15:50:38 +02:00
parent a6a66c6d8a
commit fbdedf53de
4 changed files with 74 additions and 21 deletions
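
In other words, the domain's index page should be fetched first on every run, whether or not the crawl specification already lists URLs for the domain. A minimal, self-contained sketch of that intended queue behaviour, using plain JDK types rather than the project's own frontier class:

    import java.util.ArrayDeque;

    // Sketch only: the root URL is forced to the front of the queue, ahead of
    // whatever URLs the crawl specification contributed.
    class RootFirstSketch {
        public static void main(String[] args) {
            var queue = new ArrayDeque<String>();
            queue.addLast("https://www.marginalia.nu/misc/debian-laptop-install-log/"); // from the spec
            queue.addFirst("https://www.marginalia.nu/");                               // the root
            System.out.println(queue.peekFirst()); // https://www.marginalia.nu/ is crawled first
        }
    }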

CrawlingSpecification.java

@@ -1,11 +1,12 @@
 package nu.marginalia.crawling.model.spec;

 import lombok.AllArgsConstructor;
+import lombok.Builder;
 import lombok.NoArgsConstructor;

 import java.util.List;

-@AllArgsConstructor @NoArgsConstructor
+@AllArgsConstructor @NoArgsConstructor @Builder
 public class CrawlingSpecification {
     public String id;
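
The added @Builder annotation lets callers construct a CrawlingSpecification fluently instead of going through the positional all-args constructor. A minimal sketch of the generated builder API, with field names taken from the test changes further down (not an exhaustive list of the class's fields):

    // Builder-style construction enabled by Lombok's @Builder
    CrawlingSpecification spec = CrawlingSpecification.builder()
            .id("whatever")
            .crawlDepth(5)
            .domain("www.marginalia.nu")
            .urls(List.of("https://www.marginalia.nu/"))
            .build();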

CrawlerRetreiver.java

@@ -77,7 +77,7 @@ public class CrawlerRetreiver {
             // Ensure the index page is always crawled
             var root = fst.withPathAndParam("/", null);
-            if (crawlFrontier.addKnown(root))
             crawlFrontier.addFirst(root);
         }
         else {
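
Here `fst` appears to be a URL already associated with the domain, and withPathAndParam("/", null) derives the site root from it by swapping in the root path and dropping any query string; that root is then handed to the crawl frontier unconditionally. A rough, hypothetical illustration of the derivation (the real EdgeUrl API may well differ):

    // Hypothetical, simplified stand-in for EdgeUrl; only meant to illustrate
    // what withPathAndParam("/", null) plausibly does.
    record UrlSketch(String proto, String host, String path, String param) {
        UrlSketch withPathAndParam(String newPath, String newParam) {
            return new UrlSketch(proto, host, newPath, newParam);
        }
        @Override
        public String toString() {
            return proto + "://" + host + path + (param == null ? "" : "?" + param);
        }
        public static void main(String[] args) {
            var fst = new UrlSketch("https", "www.marginalia.nu", "/misc/debian-laptop-install-log/", null);
            System.out.println(fst.withPathAndParam("/", null)); // https://www.marginalia.nu/
        }
    }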

DomainCrawlFrontier.java

@@ -43,12 +43,12 @@ public class DomainCrawlFrontier {
     public boolean isEmpty() {
         return queue.isEmpty();
     }

-    public boolean addKnown(EdgeUrl url) {
-        return known.contains(url.toString());
-    }
-
     public void addFirst(EdgeUrl url) {
+        if (known.add(url.toString())) {
             queue.addFirst(url);
+        }
     }

     public EdgeUrl takeNextUrl() {
         return queue.removeFirst();
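
With this change the frontier itself decides whether a URL actually enters the queue: addFirst marks the URL as known and queues it at most once, so callers can hand it the root unconditionally. A small sketch of that add-once behaviour with simplified types (not the real class, which works on EdgeUrl):

    import java.util.ArrayDeque;
    import java.util.HashSet;

    // Simplified stand-in for DomainCrawlFrontier, illustrating the add-once semantics
    // implied by the `if (known.add(url.toString()))` guard in the diff above.
    class FrontierSketch {
        private final HashSet<String> known = new HashSet<>();
        private final ArrayDeque<String> queue = new ArrayDeque<>();

        public void addFirst(String url) {
            if (known.add(url)) {   // true only the first time this URL is seen
                queue.addFirst(url);
            }
        }

        public static void main(String[] args) {
            var frontier = new FrontierSketch();
            frontier.addFirst("https://www.marginalia.nu/");
            frontier.addFirst("https://www.marginalia.nu/"); // second call is a no-op
            System.out.println(frontier.queue); // [https://www.marginalia.nu/]
        }
    }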

CrawlerRetreiverTest.java

@@ -1,5 +1,6 @@
 package nu.marginalia.crawling.retreival;

+import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
@@ -7,42 +8,93 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.crawling.model.SerializableCrawlData;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;

-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;

 @Tag("slow")
 class CrawlerRetreiverTest {
+    private HttpFetcher httpFetcher;
+
+    @BeforeEach
+    public void setUp() {
+        httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
+    }
+
+    @SneakyThrows
+    public static void setUpAll() {
+        // this must be done to avoid java inserting its own user agent for the sitemap requests
+        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
+    }
+
     @Test
-    public void testEmptySet() throws IOException {
-        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
-
-        // Tests the case when there are no URLs provided in the crawl set and the
-        // crawler needs to guess the protocol
-
-        var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());
-
-        HttpFetcher fetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
+    public void testWithKnownDomains() {
+        var specs = CrawlingSpecification
+                .builder()
+                .id("whatever")
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
+                .build();

         List<SerializableCrawlData> data = new ArrayList<>();

-        new CrawlerRetreiver(fetcher, specs, data::add).fetch();
+        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+
+        var fetchedUrls =
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));

         data.stream().filter(CrawledDocument.class::isInstance)
                 .map(CrawledDocument.class::cast)
                 .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
-        /*
+    }
+
+    @Test
+    public void testEmptySet() {
+        var specs = CrawlingSpecification
+                .builder()
+                .id("whatever")
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of())
+                .build();
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+
+        data.stream().filter(CrawledDocument.class::isInstance)
+                .map(CrawledDocument.class::cast)
+                .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
+
+        var fetchedUrls =
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
+
         Assertions.assertTrue(
                 data.stream().filter(CrawledDocument.class::isInstance)
                         .map(CrawledDocument.class::cast)
-                        .filter(doc -> "OK".equals(doc.crawlerStatus))
-                        .count() > 1
+                        .anyMatch(doc -> "OK".equals(doc.crawlerStatus))
         );
-         */
     }
 }