Fix a crawler bug that caused sites to fail to be indexed when no paths were provided in the crawl specification.

Viktor Lofgren 2023-02-13 20:26:08 +01:00
parent b348dbb00e
commit ff30de7352
6 changed files with 60 additions and 6 deletions
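In short: when the crawl specification supplies no URLs, the retriever now seeds its queue with both the http:// and https:// roots of the domain instead of crawling nothing, and HttpFetcher falls back to an https probe when the plain-http root probe fails. Below is a minimal, self-contained sketch of the seeding idea using plain strings and an ArrayDeque rather than the project's EdgeUrl/EdgeDomain types; the class and helper names are hypothetical, not the repository's API.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Simplified illustration only: seed the crawl frontier with protocol guesses
// when the specification provides no starting URLs. The real code works on
// EdgeUrl/EdgeDomain inside CrawlerRetreiver; these names are hypothetical.
class FrontierSeeder {
    private final Deque<String> queue = new ArrayDeque<>();
    private final Set<String> known = new HashSet<>();

    void seed(String domain, List<String> specUrls) {
        specUrls.forEach(this::addToQueue);

        if (queue.peek() != null) {
            // Ensure the document root is visited first
            String root = queue.peek().replaceAll("(https?://[^/]+).*", "$1/");
            if (known.add(root))
                queue.addFirst(root);
        }
        else {
            // The bug: with no paths, the queue stayed empty and the domain was
            // never fetched. Guess both protocols for the root page instead.
            addToQueue("http://" + domain + "/");
            addToQueue("https://" + domain + "/");
        }
    }

    private void addToQueue(String url) {
        if (known.add(url))
            queue.addLast(url);
    }

    Deque<String> frontier() {
        return queue;
    }
}

The HttpFetcher half of the change (see the diff below) covers the case where the http guess is wrong by retrying the root probe over https.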

CrawledDomainWriter.java

@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
+import lombok.SneakyThrows;
 import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
 import org.slf4j.Logger;
@@ -36,7 +37,8 @@ public class CrawledDomainWriter implements AutoCloseable {
         return outputFile;
     }
 
-    public void accept(SerializableCrawlData data) throws IOException {
+    @SneakyThrows
+    public void accept(SerializableCrawlData data) {
         writer.write(data.getSerialIdentifier());
         writer.write('\n');
         gson.toJson(data, writer);

CrawlerMain.java

@@ -73,7 +73,7 @@ public class CrawlerMain implements AutoCloseable {
         HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
 
         try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
-            var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
+            var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
 
             int size = retreiver.fetch();

CrawledDocument.java

@@ -1,10 +1,12 @@
 package nu.marginalia.wmsa.edge.crawling.model;
 
 import lombok.Builder;
+import lombok.ToString;
 import nu.marginalia.util.bigstring.BigString;
 import nu.marginalia.util.bigstring.CompressedBigString;
 
 @Builder
+@ToString
 public class CrawledDocument implements SerializableCrawlData {
     public String crawlId;

CrawlerRetreiver.java

@@ -9,6 +9,7 @@ import nu.marginalia.wmsa.edge.crawling.blocklist.GeoIpBlocklist;
 import nu.marginalia.wmsa.edge.crawling.blocklist.IpBlockList;
 import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist;
 import nu.marginalia.wmsa.edge.crawling.model.*;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -23,6 +24,7 @@ import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Optional;
+import java.util.function.Consumer;
 
 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -43,7 +45,7 @@ public class CrawlerRetreiver {
     private final int depth;
     private final String id;
     private final String domain;
-    private final CrawledDomainWriter crawledDomainWriter;
+    private final Consumer<SerializableCrawlData> crawledDomainWriter;
 
     private static final LinkParser linkParser = new LinkParser();
     private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
@@ -62,7 +64,7 @@ public class CrawlerRetreiver {
         }
     }
 
-    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter writer) {
+    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer<SerializableCrawlData> writer) {
         this.fetcher = fetcher;
 
         visited = new HashSet<>((int)(specs.urls.size() * 1.5));
         known = new HashSet<>(specs.urls.size() * 10);
@@ -79,10 +81,15 @@ public class CrawlerRetreiver {
         if (queue.peek() != null) {
             var fst = queue.peek();
             var root = fst.withPathAndParam("/", null);
             if (known.add(root.toString()))
                 queue.addFirst(root);
         }
+        else {
+            addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
+            addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null));
+        }
     }
 
     public int fetch() throws IOException {
@@ -255,7 +262,7 @@ public class CrawlerRetreiver {
     }
 
     public boolean isSameDomain(EdgeUrl url) {
-        return domain.equals(url.domain.toString().toLowerCase());
+        return domain.equalsIgnoreCase(url.domain.toString());
     }
 
     private void findLinks(EdgeUrl baseUrl, Document parsed) {

HttpFetcher.java

@@ -127,7 +127,11 @@ public class HttpFetcher {
             return new FetchResult(FetchResultState.OK, requestDomain);
         }
         catch (Exception ex) {
-            logger.debug("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
+            if (url.proto.equalsIgnoreCase("http") && "/".equals(url.path)) {
+                return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param));
+            }
+
+            logger.info("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
             return new FetchResult(FetchResultState.ERROR, url.domain);
         }
     }

CrawlerRetreiverTest.java

@@ -0,0 +1,39 @@
+package nu.marginalia.wmsa.edge.crawling.retreival;
+
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
+import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+@Tag("slow")
+class CrawlerRetreiverTest {
+
+    @Test
+    public void testEmptySet() throws IOException {
+        // Tests the case when there are no URLs provided in the crawl set and the
+        // crawler needs to guess the protocol
+        var specs = new CrawlingSpecification("1", 5, "memex.marginalia.nu", new ArrayList<>());
+
+        HttpFetcher fetcher = new HttpFetcher("test.marginalia.nu");
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        new CrawlerRetreiver(fetcher, specs, data::add).fetch();
+
+        Assertions.assertTrue(
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .filter(doc -> "OK".equals(doc.crawlerStatus))
+                        .count() > 1
+        );
+    }
+}