mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Some small crawler tweaks, plus a test for examining crawler behavior through a simulated server.
This commit is contained in:
parent
debcd3857a
commit
5e67391829
@ -0,0 +1,105 @@
|
||||
package nu.marginalia.wmsa.edge;
|
||||
|
||||
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.testcontainers.containers.BindMode;
|
||||
import org.testcontainers.containers.GenericContainer;
|
||||
import org.testcontainers.containers.output.Slf4jLogConsumer;
|
||||
import org.testcontainers.containers.wait.strategy.Wait;
|
||||
import org.testcontainers.junit.jupiter.Container;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
import org.testcontainers.utility.MountableFile;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Tag("e2e")
|
||||
@Testcontainers
|
||||
public class EdgeCrawlBehaviorE2ETest extends E2ETestBase {
|
||||
@Container
|
||||
public static GenericContainer<?> mockContainer = new GenericContainer<>("openjdk:17-alpine")
|
||||
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
|
||||
.withNetwork(network)
|
||||
.withNetworkAliases("mock", "mock2")
|
||||
.withExposedPorts(8080)
|
||||
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("mock")))
|
||||
.withCommand("java","-cp","WMSA.jar","nu.marginalia.wmsa.edge.crawling.CrawlerTestMain")
|
||||
;
|
||||
|
||||
|
||||
@Container
|
||||
public static GenericContainer<?> crawlerContainer = new GenericContainer<>("openjdk:17-alpine")
|
||||
.dependsOn(mockContainer)
|
||||
.withNetwork(network)
|
||||
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler")))
|
||||
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
|
||||
.withCopyFileToContainer(ipDatabasePath(), "/var/lib/wmsa/data/IP2LOCATION-LITE-DB1.CSV")
|
||||
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
|
||||
.withCopyFileToContainer(MountableFile.forClasspathResource("crawl-mock.sh"), "/crawl-mock.sh")
|
||||
.withFileSystemBind(getMockCrawlPath(), "/crawl/", BindMode.READ_WRITE)
|
||||
.withCommand("sh", "crawl-mock.sh")
|
||||
.waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));
|
||||
|
||||
|
||||
private static String getMockCrawlPath() {
|
||||
Path crawlFiles = getCrawlPath();
|
||||
|
||||
|
||||
List<String> urls = new ArrayList<>();
|
||||
try {
|
||||
Files.createDirectories(crawlFiles);
|
||||
|
||||
Files.writeString(crawlFiles.resolve("crawl.plan"), """
|
||||
jobSpec: "/crawl/crawl.spec"
|
||||
crawl:
|
||||
dir: "/crawl/crawl"
|
||||
logName: "crawl.log"
|
||||
process:
|
||||
dir: "/crawl/process"
|
||||
logName: "process.log"
|
||||
""");
|
||||
|
||||
Files.createDirectories(crawlFiles.resolve("crawl"));
|
||||
Files.createDirectories(crawlFiles.resolve("process"));
|
||||
Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log"));
|
||||
Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log"));
|
||||
|
||||
CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"),
|
||||
new CrawlingSpecification("111111", 20, "mock", List.of("http://mock:8080/rate-limit/")),
|
||||
new CrawlingSpecification("222222", 20, "mock2", List.of("http://mock2:8080/intermittent-error/")));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
ex.printStackTrace();
|
||||
}
|
||||
return crawlFiles.toString();
|
||||
}
|
||||
|
||||
|
||||
public static MountableFile ipDatabasePath() {
|
||||
Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV");
|
||||
if (!Files.isRegularFile(modelsPath)) {
|
||||
System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath());
|
||||
throw new RuntimeException();
|
||||
}
|
||||
return MountableFile.forHostPath(modelsPath.toString());
|
||||
}
|
||||
|
||||
private static Path getCrawlPath() {
|
||||
return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRunTheThing() throws IOException {
|
||||
// This is a test for examining the interaction between the crawler and various
|
||||
// set-ups
|
||||
}
|
||||
|
||||
}
|
19
marginalia_nu/src/e2e/resources/crawl-mock.sh
Normal file
19
marginalia_nu/src/e2e/resources/crawl-mock.sh
Normal file
@ -0,0 +1,19 @@
|
||||
#!/bin/sh
# Entry point for the crawler container in the crawl-behavior e2e test.
# Uses only POSIX sh features; the test launches it as `sh crawl-mock.sh`
# inside an Alpine-based image, so the shebang targets sh rather than bash.

# Directories the WMSA jar expects for configuration and data.
mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/

# User agent the crawler identifies itself with.
echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent

# Echo the plan into the container log for easier debugging.
cat crawl/crawl.plan

cat << EOF
#### ##### ## # # #
# # # # # # # # #
# # # # # # # #
# ##### ###### # ## # #
# # # # # # ## ## #
#### # # # # # # ######
EOF

java -jar WMSA.jar crawl crawl/crawl.plan

# The e2e test waits for this line in the container log, so it is printed
# regardless of how the crawl exited.
echo "ALL DONE"
|
@ -9,7 +9,10 @@ import javax.inject.Inject;
|
||||
import javax.inject.Singleton;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.*;
|
||||
import java.util.HashSet;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
@Singleton
|
||||
public class LanguageFilter {
|
||||
@ -78,7 +81,10 @@ public class LanguageFilter {
|
||||
}
|
||||
|
||||
public boolean isBlockedUnicodeRange(String data) {
|
||||
return Arrays.stream(UnicodeRanges.values())
|
||||
.parallel().anyMatch(range -> range.test(data));
|
||||
for (var range: UnicodeRanges.values()) {
|
||||
if (range.test(data))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -68,10 +68,19 @@ public enum UnicodeRanges {
|
||||
this.max = max;
|
||||
}
|
||||
|
||||
boolean test(String text) {
|
||||
return text.chars().limit(1000).parallel()
|
||||
.filter(i -> i >= min && i < max)
|
||||
.count() >= (sensitive ? 15 : 100);
|
||||
public boolean test(String text) {
|
||||
int count = 0;
|
||||
int max = sensitive ? 15 : 100;
|
||||
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
if (c >= min && c <= max) {
|
||||
if (count++ > max) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -80,6 +80,7 @@ public enum ServiceDescriptor {
|
||||
Map<String, Command> functions = Stream.of(new ListCommand(),
|
||||
new StartCommand(),
|
||||
new ConvertCommand(),
|
||||
new CrawlCommand(),
|
||||
new LoadCommand(),
|
||||
new ReindexCommand(),
|
||||
new VersionCommand(),
|
||||
|
@ -0,0 +1,24 @@
|
||||
package nu.marginalia.wmsa.configuration.command;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlerMain;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class CrawlCommand extends Command {
|
||||
public CrawlCommand() {
|
||||
super("crawl");
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public void execute(String... args) {
|
||||
if (args.length < 2) {
|
||||
System.err.println("Usage: crawl plan.yaml");
|
||||
System.exit(255);
|
||||
}
|
||||
|
||||
String[] args2 = Arrays.copyOfRange(args, 1, args.length);
|
||||
CrawlerMain.main(args2);
|
||||
}
|
||||
}
|
@ -162,16 +162,23 @@ public class DocumentProcessor {
|
||||
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
|
||||
throws DisqualifiedException, URISyntaxException {
|
||||
|
||||
if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
Document doc = Jsoup.parse(crawledDocument.documentBody);
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
if (doc.select("meta[name=robots]").attr("content").contains("noindex")) {
|
||||
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
|
||||
}
|
||||
|
||||
Document prunedDoc = doc.clone();
|
||||
|
||||
prunedDoc.getElementsByTag("svg").remove();
|
||||
prunedDoc.body().filter(new DomPruningFilter(0.5));
|
||||
|
||||
var dld = sentenceExtractor.extractSentences(prunedDoc);
|
||||
|
@ -115,6 +115,16 @@ public class CrawlJobExtractorMain {
|
||||
}
|
||||
}
|
||||
|
||||
public static void writeSpec(Path outFile, CrawlingSpecification... specs) throws IOException {
|
||||
Gson gson = GsonFactory.get();
|
||||
|
||||
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
|
||||
for (var spec : specs) {
|
||||
out.println(gson.toJson(spec));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private record DomainWithId(String domainName, int id) {
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,116 @@
|
||||
package nu.marginalia.wmsa.edge.crawling;
|
||||
|
||||
import io.github.bucket4j.Bandwidth;
|
||||
import io.github.bucket4j.Bucket;
|
||||
import io.github.bucket4j.Refill;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class CrawlerTestMain {
|
||||
|
||||
static Bucket rateLimiter60RPM;
|
||||
static List<String> successfullyFetched = new ArrayList<>();
|
||||
|
||||
public static void main(String... args) {
|
||||
var refill = Refill.greedy(1, Duration.ofSeconds(1));
|
||||
|
||||
var bw = Bandwidth.classic(10, refill);
|
||||
rateLimiter60RPM = Bucket.builder().addLimit(bw).build();
|
||||
|
||||
Spark.port(8080);
|
||||
Spark.before(CrawlerTestMain::before);
|
||||
Spark.after(CrawlerTestMain::after);
|
||||
Spark.get("/rate-limit/", CrawlerTestMain::index);
|
||||
Spark.get("/rate-limit/:n", CrawlerTestMain::n);
|
||||
|
||||
Spark.before("/rate-limit/:n", CrawlerTestMain::rateLimitRequest);
|
||||
Spark.before("/intermittent-error/:n", CrawlerTestMain::simulateRandomTimeouts);
|
||||
|
||||
Spark.get("/intermittent-error/", CrawlerTestMain::index);
|
||||
Spark.get("/intermittent-error/:n", CrawlerTestMain::n);
|
||||
|
||||
}
|
||||
|
||||
private static void rateLimitRequest(Request request, Response response) {
|
||||
if (!rateLimiter60RPM.tryConsume(1)) {
|
||||
Spark.halt(429);
|
||||
}
|
||||
}
|
||||
|
||||
private static void simulateRandomTimeouts(Request request, Response response) {
|
||||
if (Math.random() < 0.25) {
|
||||
System.out.println("Simulating error");
|
||||
Spark.halt(503);
|
||||
}
|
||||
}
|
||||
|
||||
public static void before(Request request, Response response) {
|
||||
System.out.println(request.pathInfo());
|
||||
successfullyFetched.add(request.pathInfo());
|
||||
}
|
||||
public static void after(Request request, Response response) {
|
||||
if (response.status() < 300) {
|
||||
successfullyFetched.add(request.pathInfo());
|
||||
}
|
||||
}
|
||||
|
||||
private static Object n(Request request, Response response) {
|
||||
|
||||
int num = Integer.parseInt(request.params("n"));
|
||||
return """
|
||||
<html>
|
||||
<head>
|
||||
<title>Index</title>
|
||||
<body>
|
||||
<h1>Index</h1>
|
||||
""" +
|
||||
String.format("<a href=\"%d\">Next</a>, <a href=\"%d\">Next 2</a>", num+1, num+2)
|
||||
|
||||
+
|
||||
"""
|
||||
|
||||
<p>
|
||||
Goddess, sing me the anger, of Achilles, Peleus’ son, that fatal anger that brought countless
|
||||
sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their
|
||||
bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment.
|
||||
|
||||
Sing of it from the moment when Agamemnon, Atreus’ son, that king of men, parted in wrath from noble Achilles.
|
||||
Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an
|
||||
evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest.
|
||||
He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a
|
||||
golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the
|
||||
two leaders of armies, those sons of Atreus: ‘Atreides, and all you bronze-greaved Achaeans, may the gods who
|
||||
live on Olympus grant you to sack Priam’s city, and sail back home in safety; but take this ransom, and free
|
||||
my darling child; show reverence for Zeus’s son, far-striking Apollo.’
|
||||
""";
|
||||
}
|
||||
|
||||
private static Object index(Request request, Response response) {
|
||||
return """
|
||||
<html>
|
||||
<head>
|
||||
<title>Index</title>
|
||||
<body>
|
||||
<h1>Index</h1>
|
||||
<a href="0">Next</a>
|
||||
<p>
|
||||
Goddess, sing me the anger, of Achilles, Peleus’ son, that fatal anger that brought countless
|
||||
sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their
|
||||
bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment.
|
||||
|
||||
Sing of it from the moment when Agamemnon, Atreus’ son, that king of men, parted in wrath from noble Achilles.
|
||||
Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an
|
||||
evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest.
|
||||
He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a
|
||||
golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the
|
||||
two leaders of armies, those sons of Atreus: ‘Atreides, and all you bronze-greaved Achaeans, may the gods who
|
||||
live on Olympus grant you to sack Priam’s city, and sail back home in safety; but take this ransom, and free
|
||||
my darling child; show reverence for Zeus’s son, far-striking Apollo.’
|
||||
""";
|
||||
}
|
||||
}
|
@ -1,7 +1,11 @@
|
||||
package nu.marginalia.wmsa.edge.crawling.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor @NoArgsConstructor
|
||||
public class CrawlingSpecification {
|
||||
public String id;
|
||||
|
||||
|
@ -28,7 +28,7 @@ import static java.lang.Math.max;
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class CrawlerRetreiver {
|
||||
private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 250);
|
||||
private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 500);
|
||||
private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
|
||||
|
||||
private final LinkedList<EdgeUrl> queue = new LinkedList<>();
|
||||
@ -75,7 +75,7 @@ public class CrawlerRetreiver {
|
||||
|
||||
if (queue.peek() != null) {
|
||||
var fst = queue.peek();
|
||||
var root = fst.domain.toRootUrl();
|
||||
var root = fst.withPathAndParam("/", null);
|
||||
if (known.add(root.toString()))
|
||||
queue.addFirst(root);
|
||||
}
|
||||
@ -117,7 +117,7 @@ public class CrawlerRetreiver {
|
||||
.build());
|
||||
}
|
||||
|
||||
var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl());
|
||||
var fetchResult = fetcher.probeDomain(fst.withPathAndParam("/", null));
|
||||
if (!fetchResult.ok()) {
|
||||
logger.debug("Bad status on {}", domain);
|
||||
return Optional.of(createErrorPostFromStatus(fetchResult));
|
||||
|
@ -111,7 +111,7 @@ public class HttpFetcher {
|
||||
@SneakyThrows
|
||||
public FetchResult probeDomain(EdgeUrl url) {
|
||||
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
|
||||
.url(url.domain.toRootUrl().toString())
|
||||
.url(url.toString())
|
||||
.build();
|
||||
|
||||
var call = client.newCall(head);
|
||||
|
Loading…
Reference in New Issue
Block a user