mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Fix bug in CrawlerRetreiver
... where the root URL wasn't always added properly to the front of the crawl queue.
This commit is contained in:
parent a6a66c6d8a
commit fbdedf53de
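The gist of the fix, as the hunks below show: the old CrawlerRetreiver guarded the root insertion with crawlFrontier.addKnown(root), but addKnown() only returned known.contains(url) without recording anything, so the index page was pushed to the front of the queue only when it was already known. The new code drops the guard and makes addFirst() itself mark the URL as known and enqueue it at most once. A minimal, self-contained sketch of the new behaviour, using plain String URLs as a stand-in for the real DomainCrawlFrontier and EdgeUrl classes:

    import java.util.ArrayDeque;
    import java.util.Deque;
    import java.util.HashSet;
    import java.util.Set;

    class FrontierSketch {
        private final Set<String> known = new HashSet<>();
        private final Deque<String> queue = new ArrayDeque<>();

        // After the fix: marking the URL as known and enqueueing it happen together,
        // so the same URL can only ever be added to the front of the queue once.
        public void addFirst(String url) {
            if (known.add(url)) {
                queue.addFirst(url);
            }
        }

        public static void main(String[] args) {
            var frontier = new FrontierSketch();
            frontier.addFirst("https://www.marginalia.nu/");
            frontier.addFirst("https://www.marginalia.nu/"); // no-op, already known
            System.out.println(frontier.queue.size());       // prints 1
        }
    }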
CrawlingSpecification.java
@@ -1,11 +1,12 @@
 package nu.marginalia.crawling.model.spec;
 
 import lombok.AllArgsConstructor;
+import lombok.Builder;
 import lombok.NoArgsConstructor;
 
 import java.util.List;
 
-@AllArgsConstructor @NoArgsConstructor
+@AllArgsConstructor @NoArgsConstructor @Builder
 public class CrawlingSpecification {
     public String id;
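The Lombok @Builder added above generates a CrawlingSpecification.builder() entry point from the class's fields; the rewritten CrawlerRetreiverTest further down uses it instead of the positional constructor. A rough usage sketch, assuming the field names the test exercises (id, crawlDepth, domain, urls):

    CrawlingSpecification specs = CrawlingSpecification.builder()
            .id("example")
            .crawlDepth(5)
            .domain("www.marginalia.nu")
            .urls(List.of())   // fields left unset keep their default values
            .build();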
CrawlerRetreiver.java
@@ -77,7 +77,7 @@ public class CrawlerRetreiver {
 
         // Ensure the index page is always crawled
         var root = fst.withPathAndParam("/", null);
-        if (crawlFrontier.addKnown(root))
-            crawlFrontier.addFirst(root);
+        crawlFrontier.addFirst(root);
     }
     else {
DomainCrawlFrontier.java
@@ -43,12 +43,12 @@ public class DomainCrawlFrontier {
     public boolean isEmpty() {
         return queue.isEmpty();
     }
-    public boolean addKnown(EdgeUrl url) {
-        return known.contains(url.toString());
-    }
     public void addFirst(EdgeUrl url) {
-        queue.addFirst(url);
+        if (known.add(url.toString())) {
+            queue.addFirst(url);
+        }
     }
 
     public EdgeUrl takeNextUrl() {
         return queue.removeFirst();
CrawlerRetreiverTest.java
@@ -1,5 +1,6 @@
 package nu.marginalia.crawling.retreival;
 
+import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
@@ -7,42 +8,93 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.crawling.model.SerializableCrawlData;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 @Tag("slow")
 class CrawlerRetreiverTest {
+    private HttpFetcher httpFetcher;
+
+    @BeforeEach
+    public void setUp() {
+        httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
+    }
+
+    @SneakyThrows
+    public static void setUpAll() {
+        // this must be done to avoid java inserting its own user agent for the sitemap requests
+        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
+    }
 
     @Test
-    public void testEmptySet() throws IOException {
-        System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
-        // Tests the case when there are no URLs provided in the crawl set and the
-        // crawler needs to guess the protocol
-        var specs = new CrawlingSpecification("1", 5, "www.marginalia.nu", new ArrayList<>());
-        HttpFetcher fetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
+    public void testWithKnownDomains() {
+        var specs = CrawlingSpecification
+                .builder()
+                .id("whatever")
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/"))
+                .build();
 
         List<SerializableCrawlData> data = new ArrayList<>();
 
-        new CrawlerRetreiver(fetcher, specs, data::add).fetch();
+        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+
+        var fetchedUrls =
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/"));
 
         data.stream().filter(CrawledDocument.class::isInstance)
                 .map(CrawledDocument.class::cast)
                 .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
 
-        /*
+    }
+
+    @Test
+    public void testEmptySet() {
+        var specs = CrawlingSpecification
+                .builder()
+                .id("whatever")
+                .crawlDepth(5)
+                .domain("www.marginalia.nu")
+                .urls(List.of())
+                .build();
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        new CrawlerRetreiver(httpFetcher, specs, data::add).fetch();
+
+        data.stream().filter(CrawledDocument.class::isInstance)
+                .map(CrawledDocument.class::cast)
+                .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus));
+
+        var fetchedUrls =
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .map(doc -> doc.url)
+                        .collect(Collectors.toSet());
+
+        assertTrue(fetchedUrls.contains("https://www.marginalia.nu/"));
+
         Assertions.assertTrue(
                 data.stream().filter(CrawledDocument.class::isInstance)
                         .map(CrawledDocument.class::cast)
-                        .filter(doc -> "OK".equals(doc.crawlerStatus))
-                        .count() > 1
+                        .anyMatch(doc -> "OK".equals(doc.crawlerStatus))
         );
-        */
     }
 
 }
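Taken together, the rewritten tests cover both sides of the fix: testWithKnownDomains seeds the crawl with one known URL and asserts that both the root page (https://www.marginalia.nu/) and the seeded URL end up in the fetched set, while testEmptySet starts from an empty URL list and asserts that the root page is still fetched, which is exactly the case the old addKnown() guard broke.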