Fix crawler bug that caused sites to fail to index when no paths were provided.

Viktor Lofgren 2023-02-13 20:26:08 +01:00
parent b348dbb00e
commit ff30de7352
6 changed files with 60 additions and 6 deletions
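In short: CrawlerRetreiver now takes a Consumer<SerializableCrawlData> sink instead of a concrete CrawledDomainWriter, seeds its queue with the domain's http and https root pages when the crawl specification contains no URLs, and HttpFetcher's domain probe retries over https when the plain-http probe fails. A minimal sketch of the resulting call pattern, mirroring the test added in this commit (the specification values are the test's own; any Consumer<SerializableCrawlData> can serve as the sink):

    // A crawl specification with an empty URL list; the retriever now seeds "/" on its own.
    var specs = new CrawlingSpecification("1", 5, "memex.marginalia.nu", new ArrayList<>());
    HttpFetcher fetcher = new HttpFetcher("test.marginalia.nu");

    List<SerializableCrawlData> data = new ArrayList<>();
    new CrawlerRetreiver(fetcher, specs, data::add).fetch();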

CrawledDomainWriter.java

@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
+import lombok.SneakyThrows;
 import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
 import org.slf4j.Logger;
@@ -36,7 +37,8 @@ public class CrawledDomainWriter implements AutoCloseable {
         return outputFile;
     }
 
-    public void accept(SerializableCrawlData data) throws IOException {
+    @SneakyThrows
+    public void accept(SerializableCrawlData data) {
         writer.write(data.getSerialIdentifier());
         writer.write('\n');
         gson.toJson(data, writer);
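The signature change above is what allows CrawlerMain (next file) to pass writer::accept as a Consumer<SerializableCrawlData>: Consumer.accept cannot declare a checked IOException, so Lombok's @SneakyThrows rethrows it undeclared instead. A rough hand-written equivalent, for illustration only (note that @SneakyThrows rethrows the original exception rather than wrapping it):

    public void accept(SerializableCrawlData data) {
        try {
            writer.write(data.getSerialIdentifier());
            writer.write('\n');
            gson.toJson(data, writer);
        }
        catch (IOException e) {
            // closest plain-Java analogue; Lombok avoids the wrapper entirely
            throw new UncheckedIOException(e);
        }
    }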

CrawlerMain.java

@@ -73,7 +73,7 @@ public class CrawlerMain implements AutoCloseable {
         HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
 
         try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
-            var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
+            var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
 
             int size = retreiver.fetch();

CrawledDocument.java

@@ -1,10 +1,12 @@
 package nu.marginalia.wmsa.edge.crawling.model;
 
 import lombok.Builder;
+import lombok.ToString;
 import nu.marginalia.util.bigstring.BigString;
 import nu.marginalia.util.bigstring.CompressedBigString;
 
 @Builder
+@ToString
 public class CrawledDocument implements SerializableCrawlData {
     public String crawlId;

CrawlerRetreiver.java

@@ -9,6 +9,7 @@ import nu.marginalia.wmsa.edge.crawling.blocklist.GeoIpBlocklist;
 import nu.marginalia.wmsa.edge.crawling.blocklist.IpBlockList;
 import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist;
 import nu.marginalia.wmsa.edge.crawling.model.*;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -23,6 +24,7 @@ import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Optional;
+import java.util.function.Consumer;
 
 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -43,7 +45,7 @@ public class CrawlerRetreiver {
     private final int depth;
     private final String id;
     private final String domain;
-    private final CrawledDomainWriter crawledDomainWriter;
+    private final Consumer<SerializableCrawlData> crawledDomainWriter;
 
     private static final LinkParser linkParser = new LinkParser();
     private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
@@ -62,7 +64,7 @@ public class CrawlerRetreiver {
         }
     }
 
-    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter writer) {
+    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer<SerializableCrawlData> writer) {
         this.fetcher = fetcher;
         visited = new HashSet<>((int)(specs.urls.size() * 1.5));
         known = new HashSet<>(specs.urls.size() * 10);
@@ -79,10 +81,15 @@ public class CrawlerRetreiver {
         if (queue.peek() != null) {
             var fst = queue.peek();
 
             var root = fst.withPathAndParam("/", null);
             if (known.add(root.toString()))
                 queue.addFirst(root);
         }
+        else {
+            addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
+            addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null));
+        }
     }
 
     public int fetch() throws IOException {
@@ -255,7 +262,7 @@ public class CrawlerRetreiver {
     }
 
     public boolean isSameDomain(EdgeUrl url) {
-        return domain.equals(url.domain.toString().toLowerCase());
+        return domain.equalsIgnoreCase(url.domain.toString());
     }
 
     private void findLinks(EdgeUrl baseUrl, Document parsed) {

HttpFetcher.java

@@ -127,7 +127,11 @@ public class HttpFetcher {
             return new FetchResult(FetchResultState.OK, requestDomain);
         }
         catch (Exception ex) {
-            logger.debug("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
+            if (url.proto.equalsIgnoreCase("http") && "/".equals(url.path)) {
+                return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param));
+            }
+
+            logger.info("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
             return new FetchResult(FetchResultState.ERROR, url.domain);
         }
     }
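The retry in the catch block is deliberately narrow: it only fires when the failed probe targeted the root path over plain http, which keeps the recursion bounded (the https retry can no longer match the condition) and leaves other probe failures reported as before. The condition in isolation, as a hypothetical helper that is not part of the commit (url.proto and url.path are the EdgeUrl fields used above):

    // Retry the probe over https only when the failed attempt was the plain-http root page.
    private static boolean shouldRetryOverHttps(EdgeUrl url) {
        return url.proto.equalsIgnoreCase("http") && "/".equals(url.path);
    }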

CrawlerRetreiverTest.java (new file)

@@ -0,0 +1,39 @@
+package nu.marginalia.wmsa.edge.crawling.retreival;
+
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
+import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+@Tag("slow")
+class CrawlerRetreiverTest {
+
+    @Test
+    public void testEmptySet() throws IOException {
+        // Tests the case when there are no URLs provided in the crawl set and the
+        // crawler needs to guess the protocol
+        var specs = new CrawlingSpecification("1", 5, "memex.marginalia.nu", new ArrayList<>());
+
+        HttpFetcher fetcher = new HttpFetcher("test.marginalia.nu");
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        new CrawlerRetreiver(fetcher, specs, data::add).fetch();
+
+        Assertions.assertTrue(
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .filter(doc -> "OK".equals(doc.crawlerStatus))
+                        .count() > 1
+        );
+    }
+}
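Note that the test is tagged "slow" and crawls memex.marginalia.nu over the network, so it needs outbound connectivity and is presumably only run when slow tests are explicitly enabled.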