Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
Fix crawler bug that caused sites to fail to index when no paths were provided.
commit ff30de7352
parent b348dbb00e
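In short: CrawlerRetreiver no longer depends on the concrete CrawledDomainWriter but accepts any Consumer<SerializableCrawlData> as its output sink; when a crawl specification supplies no URLs, the retriever now seeds its queue with http:// and https:// roots for the domain; HttpFetcher retries a failed plain-HTTP root probe over HTTPS; and a new test tagged "slow" covers the empty-specification case. The sketch below only restates the new call shape taken from the hunks that follow; it is not a complete program, and crawlDataDir, specification and fetcher stand in for values CrawlerMain already has in scope.

    // Sketch of the new sink-based call shape (crawlDataDir, specification, fetcher
    // are assumed to be in scope, as in the CrawlerMain hunk below).
    try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
        // Production sink: the writer is passed as a method reference.
        new CrawlerRetreiver(fetcher, specification, writer::accept).fetch();
    }

    // Any other Consumer<SerializableCrawlData> works as well; the new test collects into a list.
    List<SerializableCrawlData> data = new ArrayList<>();
    new CrawlerRetreiver(fetcher, specification, data::add).fetch();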
CrawledDomainWriter.java

@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.crawling

 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
+import lombok.SneakyThrows;
 import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
 import org.slf4j.Logger;
@@ -36,7 +37,8 @@ public class CrawledDomainWriter implements AutoCloseable {
         return outputFile;
     }

-    public void accept(SerializableCrawlData data) throws IOException {
+    @SneakyThrows
+    public void accept(SerializableCrawlData data) {
         writer.write(data.getSerialIdentifier());
         writer.write('\n');
         gson.toJson(data, writer);
CrawlerMain.java

@@ -73,7 +73,7 @@ public class CrawlerMain implements AutoCloseable {

         HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
         try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
-            var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
+            var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);

             int size = retreiver.fetch();

CrawledDocument.java

@@ -1,10 +1,12 @@
 package nu.marginalia.wmsa.edge.crawling.model;

 import lombok.Builder;
 import lombok.ToString;
+import nu.marginalia.util.bigstring.BigString;
+import nu.marginalia.util.bigstring.CompressedBigString;

 @Builder
 @ToString
 public class CrawledDocument implements SerializableCrawlData {
     public String crawlId;

CrawlerRetreiver.java

@@ -9,6 +9,7 @@ import nu.marginalia.wmsa.edge.crawling.blocklist.GeoIpBlocklist;
 import nu.marginalia.wmsa.edge.crawling.blocklist.IpBlockList;
 import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist;
 import nu.marginalia.wmsa.edge.crawling.model.*;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -23,6 +24,7 @@ import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Optional;
+import java.util.function.Consumer;

 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -43,7 +45,7 @@ public class CrawlerRetreiver {
     private final int depth;
     private final String id;
     private final String domain;
-    private final CrawledDomainWriter crawledDomainWriter;
+    private final Consumer<SerializableCrawlData> crawledDomainWriter;

     private static final LinkParser linkParser = new LinkParser();
     private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
@@ -62,7 +64,7 @@ public class CrawlerRetreiver {
         }
     }

-    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter writer) {
+    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer<SerializableCrawlData> writer) {
         this.fetcher = fetcher;
         visited = new HashSet<>((int)(specs.urls.size() * 1.5));
         known = new HashSet<>(specs.urls.size() * 10);
@@ -79,10 +81,15 @@

         if (queue.peek() != null) {
             var fst = queue.peek();

             var root = fst.withPathAndParam("/", null);
             if (known.add(root.toString()))
                 queue.addFirst(root);
         }
+        else {
+            addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
+            addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null));
+        }
     }

     public int fetch() throws IOException {
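This hunk is the core of the fix: a crawl specification without any URLs leaves the queue empty, so the old code had nothing to fetch and the site never got indexed. The retriever now falls back to seeding the queue with plain-HTTP and HTTPS roots for the domain.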
@@ -255,7 +262,7 @@ public class CrawlerRetreiver {
     }

     public boolean isSameDomain(EdgeUrl url) {
-        return domain.equals(url.domain.toString().toLowerCase());
+        return domain.equalsIgnoreCase(url.domain.toString());
     }

     private void findLinks(EdgeUrl baseUrl, Document parsed) {
HttpFetcher.java

@@ -127,7 +127,11 @@
             return new FetchResult(FetchResultState.OK, requestDomain);
         }
         catch (Exception ex) {
-            logger.debug("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
+            if (url.proto.equalsIgnoreCase("http") && "/".equals(url.path)) {
+                return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param));
+            }
+
+            logger.info("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
             return new FetchResult(FetchResultState.ERROR, url.domain);
         }
     }
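Together with the queue seeding above, this means a failed probe of a plain-HTTP root ("/") is retried over HTTPS before an error is reported, so domains that only answer on HTTPS can still be crawled; remaining fetch errors are now logged at info rather than debug level.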
CrawlerRetreiverTest.java (new file)

@@ -0,0 +1,39 @@
+package nu.marginalia.wmsa.edge.crawling.retreival;
+
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
+import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+@Tag("slow")
+class CrawlerRetreiverTest {
+
+    @Test
+    public void testEmptySet() throws IOException {
+        // Tests the case when there are no URLs provided in the crawl set and the
+        // crawler needs to guess the protocol
+
+        var specs = new CrawlingSpecification("1", 5, "memex.marginalia.nu", new ArrayList<>());
+
+        HttpFetcher fetcher = new HttpFetcher("test.marginalia.nu");
+
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        new CrawlerRetreiver(fetcher, specs, data::add).fetch();
+
+        Assertions.assertTrue(
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .filter(doc -> "OK".equals(doc.crawlerStatus))
+                        .count() > 1
+        );
+    }
+
+}