Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 05:18:58 +00:00)
Fix crawler bug that caused sites to fail to index when no paths were provided.
parent b348dbb00e
commit ff30de7352
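
In short: when a crawl specification arrived with an empty URL list, the crawler never seeded its work queue, so the whole site was silently skipped. This change seeds the queue with the domain's root path over both http and https when no paths are given, lets HttpFetcher retry the root probe over https when plain http fails, and loosens CrawlerRetreiver to accept any Consumer<SerializableCrawlData>, which is what allows the new test to capture crawl output in memory.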
CrawledDomainWriter.java

@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
 
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
+import lombok.SneakyThrows;
 import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
 import org.slf4j.Logger;
@@ -36,7 +37,8 @@ public class CrawledDomainWriter implements AutoCloseable {
         return outputFile;
     }
 
-    public void accept(SerializableCrawlData data) throws IOException {
+    @SneakyThrows
+    public void accept(SerializableCrawlData data) {
         writer.write(data.getSerialIdentifier());
         writer.write('\n');
         gson.toJson(data, writer);
CrawlerMain.java

@@ -73,7 +73,7 @@ public class CrawlerMain implements AutoCloseable {
 
         HttpFetcher fetcher = new HttpFetcher(userAgent.uaString(), dispatcher, connectionPool);
         try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
-            var retreiver = new CrawlerRetreiver(fetcher, specification, writer);
+            var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
 
             int size = retreiver.fetch();
 
CrawledDocument.java

@@ -1,10 +1,12 @@
 package nu.marginalia.wmsa.edge.crawling.model;
 
 import lombok.Builder;
+import lombok.ToString;
 import nu.marginalia.util.bigstring.BigString;
 import nu.marginalia.util.bigstring.CompressedBigString;
 
 @Builder
+@ToString
 public class CrawledDocument implements SerializableCrawlData {
     public String crawlId;
 
CrawlerRetreiver.java

@@ -9,6 +9,7 @@ import nu.marginalia.wmsa.edge.crawling.blocklist.GeoIpBlocklist;
 import nu.marginalia.wmsa.edge.crawling.blocklist.IpBlockList;
 import nu.marginalia.wmsa.edge.crawling.blocklist.UrlBlocklist;
 import nu.marginalia.wmsa.edge.crawling.model.*;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -23,6 +24,7 @@ import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.Optional;
+import java.util.function.Consumer;
 
 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -43,7 +45,7 @@ public class CrawlerRetreiver {
     private final int depth;
     private final String id;
     private final String domain;
-    private final CrawledDomainWriter crawledDomainWriter;
+    private final Consumer<SerializableCrawlData> crawledDomainWriter;
 
     private static final LinkParser linkParser = new LinkParser();
     private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class);
@@ -62,7 +64,7 @@ public class CrawlerRetreiver {
         }
     }
 
-    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, CrawledDomainWriter writer) {
+    public CrawlerRetreiver(HttpFetcher fetcher, CrawlingSpecification specs, Consumer<SerializableCrawlData> writer) {
         this.fetcher = fetcher;
         visited = new HashSet<>((int)(specs.urls.size() * 1.5));
         known = new HashSet<>(specs.urls.size() * 10);
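
Accepting a Consumer<SerializableCrawlData> here, instead of the concrete CrawledDomainWriter, is what lets CrawlerMain keep passing writer::accept while the new test below passes data::add. As a side note (not part of this commit; the counter below is purely hypothetical), java.util.function.Consumer instances compose, so a caller could tee the crawl output to the writer and to an in-memory metric:

```java
import nu.marginalia.wmsa.edge.crawling.CrawledDomainWriter;
import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;

import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;

// Hypothetical sketch only: assumes the project's CrawledDomainWriter and
// SerializableCrawlData types are on the classpath; the counter is illustrative.
class CrawlSinkSketch {
    static Consumer<SerializableCrawlData> teeToWriterAndCounter(CrawledDomainWriter writer,
                                                                 AtomicInteger documentsSeen) {
        Consumer<SerializableCrawlData> sink = writer::accept;        // same reference CrawlerMain passes
        return sink.andThen(data -> documentsSeen.incrementAndGet()); // compose a second, in-memory consumer
    }
}
```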
@@ -79,10 +81,15 @@ public class CrawlerRetreiver {
 
         if (queue.peek() != null) {
             var fst = queue.peek();
 
             var root = fst.withPathAndParam("/", null);
             if (known.add(root.toString()))
                 queue.addFirst(root);
         }
+        else {
+            addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
+            addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null));
+        }
     }
 
     public int fetch() throws IOException {
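
This hunk is the core of the fix: with an empty URL list, queue.peek() returns null, so the old code never enqueued anything and fetch() had nothing to do. The new else branch falls back to the domain's root path over both http and https. A minimal standalone sketch of the same seeding idea, using plain strings instead of the project's EdgeUrl/EdgeDomain types (all names here are illustrative only):

```java
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;

// Simplified sketch of the seeding fallback; the real code uses EdgeUrl/EdgeDomain,
// addToQueue() and a 'known' set to deduplicate URLs.
class FrontierSeedingSketch {
    static Deque<String> seedFrontier(String domain, List<String> specifiedUrls) {
        Deque<String> frontier = new ArrayDeque<>(specifiedUrls);
        if (frontier.peek() != null) {
            // Paths were provided: make sure the site root is crawled first.
            frontier.addFirst("https://" + domain + "/");
        }
        else {
            // No paths were provided: guess the protocol by seeding both roots.
            frontier.add("http://" + domain + "/");
            frontier.add("https://" + domain + "/");
        }
        return frontier;
    }
}
```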
@@ -255,7 +262,7 @@ public class CrawlerRetreiver {
     }
 
     public boolean isSameDomain(EdgeUrl url) {
-        return domain.equals(url.domain.toString().toLowerCase());
+        return domain.equalsIgnoreCase(url.domain.toString());
     }
 
     private void findLinks(EdgeUrl baseUrl, Document parsed) {
HttpFetcher.java

@@ -127,7 +127,11 @@ public class HttpFetcher {
            return new FetchResult(FetchResultState.OK, requestDomain);
        }
        catch (Exception ex) {
-           logger.debug("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
+           if (url.proto.equalsIgnoreCase("http") && "/".equals(url.path)) {
+               return probeDomain(new EdgeUrl("https", url.domain, url.port, url.path, url.param));
+           }
+
+           logger.info("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
            return new FetchResult(FetchResultState.ERROR, url.domain);
        }
    }
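
The probe no longer gives up on the first exception: if the failing request was for the site root over plain http, it is retried over https, and genuine failures are now logged at info rather than debug level. Below is a sketch of the same fallback pattern with deliberately simplified, hypothetical signatures (Prober, probeWithHttpsFallback), not the project's HttpFetcher API:

```java
import java.io.IOException;
import java.net.URI;

// Hypothetical sketch of the http -> https probe fallback; names and signatures are illustrative.
class ProbeFallbackSketch {
    interface Prober { void probe(URI url) throws IOException; }

    static boolean probeWithHttpsFallback(Prober prober, URI url) {
        try {
            prober.probe(url);
            return true;
        }
        catch (Exception ex) {
            // Only fall back when we were probing the site root over plain http.
            if ("http".equalsIgnoreCase(url.getScheme()) && "/".equals(url.getPath())) {
                try {
                    prober.probe(URI.create("https://" + url.getRawAuthority() + url.getRawPath()));
                    return true;
                }
                catch (Exception retryFailed) {
                    // fall through and report the failure
                }
            }
            return false;
        }
    }
}
```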
CrawlerRetreiverTest.java (new file)

@@ -0,0 +1,39 @@
+package nu.marginalia.wmsa.edge.crawling.retreival;
+
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
+import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+@Tag("slow")
+class CrawlerRetreiverTest {
+
+    @Test
+    public void testEmptySet() throws IOException {
+        // Tests the case when there are no URLs provided in the crawl set and the
+        // crawler needs to guess the protocol
+
+        var specs = new CrawlingSpecification("1", 5, "memex.marginalia.nu", new ArrayList<>());
+
+        HttpFetcher fetcher = new HttpFetcher("test.marginalia.nu");
+
+
+        List<SerializableCrawlData> data = new ArrayList<>();
+
+        new CrawlerRetreiver(fetcher, specs, data::add).fetch();
+
+        Assertions.assertTrue(
+                data.stream().filter(CrawledDocument.class::isInstance)
+                        .map(CrawledDocument.class::cast)
+                        .filter(doc -> "OK".equals(doc.crawlerStatus))
+                        .count() > 1
+        );
+    }
+
+}