Some small crawler tweaks, plus a test for examining crawler behavior through a simulated server.

This commit is contained in:
vlofgren 2022-09-16 16:52:30 +02:00
parent debcd3857a
commit 5e67391829
12 changed files with 312 additions and 11 deletions

View File

@@ -0,0 +1,105 @@
package nu.marginalia.wmsa.edge;
import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.slf4j.LoggerFactory;
import org.testcontainers.containers.BindMode;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.output.Slf4jLogConsumer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.MountableFile;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
@Tag("e2e")
@Testcontainers
public class EdgeCrawlBehaviorE2ETest extends E2ETestBase {
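    // Two containers on a shared network: "mock" runs CrawlerTestMain, a small Spark
    // server that simulates rate limiting and intermittent errors, and "crawler" runs
    // an actual crawl against it via crawl-mock.sh.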
@Container
public static GenericContainer<?> mockContainer = new GenericContainer<>("openjdk:17-alpine")
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withNetwork(network)
.withNetworkAliases("mock", "mock2")
.withExposedPorts(8080)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("mock")))
.withCommand("java","-cp","WMSA.jar","nu.marginalia.wmsa.edge.crawling.CrawlerTestMain")
;
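    // The crawler container mounts the crawl plan produced by getMockCrawlPath() and
    // runs crawl-mock.sh; the wait strategy below turns the script's final "ALL DONE"
    // marker into the test's pass condition.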
@Container
public static GenericContainer<?> crawlerContainer = new GenericContainer<>("openjdk:17-alpine")
.dependsOn(mockContainer)
.withNetwork(network)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler")))
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
.withCopyFileToContainer(ipDatabasePath(), "/var/lib/wmsa/data/IP2LOCATION-LITE-DB1.CSV")
.withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withCopyFileToContainer(MountableFile.forClasspathResource("crawl-mock.sh"), "/crawl-mock.sh")
.withFileSystemBind(getMockCrawlPath(), "/crawl/", BindMode.READ_WRITE)
.withCommand("sh", "crawl-mock.sh")
.waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));
private static String getMockCrawlPath() {
Path crawlFiles = getCrawlPath();
List<String> urls = new ArrayList<>();
try {
Files.createDirectories(crawlFiles);
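            // The plan uses container-side paths; /crawl is bind-mounted from
            // build/tmp/crawl on the host (see getCrawlPath() below)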
Files.writeString(crawlFiles.resolve("crawl.plan"), """
jobSpec: "/crawl/crawl.spec"
crawl:
dir: "/crawl/crawl"
logName: "crawl.log"
process:
dir: "/crawl/process"
logName: "process.log"
""");
Files.createDirectories(crawlFiles.resolve("crawl"));
Files.createDirectories(crawlFiles.resolve("process"));
Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log"));
Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log"));
CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"),
new CrawlingSpecification("111111", 20, "mock", List.of("http://mock:8080/rate-limit/")),
new CrawlingSpecification("222222", 20, "mock2", List.of("http://mock2:8080/intermittent-error/")));
}
catch (IOException ex) {
ex.printStackTrace();
}
return crawlFiles.toString();
}
public static MountableFile ipDatabasePath() {
Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV");
if (!Files.isRegularFile(modelsPath)) {
System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath());
throw new RuntimeException();
}
return MountableFile.forHostPath(modelsPath.toString());
}
private static Path getCrawlPath() {
return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl");
}
@Test
public void testRunTheThing() throws IOException {
        // Exercises the interaction between the crawler and the simulated server
        // set-ups. The real pass/fail condition is the crawler container's wait
        // strategy: the test fails unless "ALL DONE" appears in the logs within
        // the startup timeout.
}
}

View File

@@ -0,0 +1,19 @@
#!/bin/bash
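# Set up the minimal runtime configuration the crawler expects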
mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/
echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent
cat crawl/crawl.plan
cat << EOF
#### ##### ## # # #
# # # # # # # # #
# # # # # # # #
# ##### ###### # ## # #
# # # # # # ## ## #
#### # # # # # # ######
EOF
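# Run the crawl; the e2e test's wait strategy watches for the ALL DONE marker below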
java -jar WMSA.jar crawl crawl/crawl.plan
echo "ALL DONE"

View File

@@ -9,7 +9,10 @@ import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.BufferedReader;
import java.io.InputStreamReader;
-import java.util.*;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
@Singleton
public class LanguageFilter {
@@ -78,7 +81,10 @@ public class LanguageFilter
}
public boolean isBlockedUnicodeRange(String data) {
-        return Arrays.stream(UnicodeRanges.values())
-                .parallel().anyMatch(range -> range.test(data));
+        for (var range : UnicodeRanges.values()) {
+            if (range.test(data))
+                return true;
+        }
+        return false;
}
}

View File

@@ -68,10 +68,19 @@ public enum UnicodeRanges {
this.max = max;
}
-    boolean test(String text) {
-        return text.chars().limit(1000).parallel()
-                .filter(i -> i >= min && i < max)
-                .count() >= (sensitive ? 15 : 100);
+    public boolean test(String text) {
+        int count = 0;
+        // number of characters in the range before the text is considered blocked;
+        // named 'limit' so it does not shadow the range's 'max' field
+        int limit = sensitive ? 15 : 100;
+        for (int i = 0; i < text.length(); i++) {
+            char c = text.charAt(i);
+            if (c >= min && c <= max) {
+                if (count++ > limit) {
+                    return true;
+                }
+            }
+        }
+        return false;
}
}

View File

@@ -80,6 +80,7 @@ public enum ServiceDescriptor {
Map<String, Command> functions = Stream.of(new ListCommand(),
new StartCommand(),
new ConvertCommand(),
new CrawlCommand(),
new LoadCommand(),
new ReindexCommand(),
new VersionCommand(),

View File

@@ -0,0 +1,24 @@
package nu.marginalia.wmsa.configuration.command;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.crawling.CrawlerMain;
import java.util.Arrays;
public class CrawlCommand extends Command {
public CrawlCommand() {
super("crawl");
}
@Override
@SneakyThrows
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: crawl plan.yaml");
System.exit(255);
}
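        // args[0] is the command name itself ("crawl"); forward the rest to CrawlerMain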
String[] args2 = Arrays.copyOfRange(args, 1, args.length);
CrawlerMain.main(args2);
}
}

View File

@@ -162,16 +162,23 @@ public class DocumentProcessor {
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
throws DisqualifiedException, URISyntaxException {
if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) {
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
}
Document doc = Jsoup.parse(crawledDocument.documentBody);
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}
if (doc.select("meta[name=robots]").attr("content").contains("noindex")) {
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
}
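        // Work on a pruned clone so the full DOM stays intact for later stages;
        // SVG markup is stripped and the body pruned (DomPruningFilter) before
        // sentence extraction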
Document prunedDoc = doc.clone();
prunedDoc.getElementsByTag("svg").remove();
prunedDoc.body().filter(new DomPruningFilter(0.5));
var dld = sentenceExtractor.extractSentences(prunedDoc);

View File

@@ -115,6 +115,16 @@ public class CrawlJobExtractorMain {
}
}
public static void writeSpec(Path outFile, CrawlingSpecification... specs) throws IOException {
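        // Each spec is written as a single line of JSON inside a zstd-compressed stream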
Gson gson = GsonFactory.get();
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
for (var spec : specs) {
out.println(gson.toJson(spec));
}
}
}
private record DomainWithId(String domainName, int id) {
}

View File

@@ -0,0 +1,116 @@
package nu.marginalia.wmsa.edge.crawling;
import io.github.bucket4j.Bandwidth;
import io.github.bucket4j.Bucket;
import io.github.bucket4j.Refill;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
public class CrawlerTestMain {
static Bucket rateLimiter60RPM;
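    // paths served with a status below 300; useful when eyeballing crawler behavior in the logs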
static List<String> successfullyFetched = new ArrayList<>();
public static void main(String... args) {
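        // Token bucket: capacity 10, refilled at 1 token per second, i.e. 60 requests
        // per minute sustained with small bursts allowed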
var refill = Refill.greedy(1, Duration.ofSeconds(1));
var bw = Bandwidth.classic(10, refill);
rateLimiter60RPM = Bucket.builder().addLimit(bw).build();
Spark.port(8080);
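        // The numbered pages are where the simulation happens: /rate-limit/:n returns 429
        // once the token bucket is drained, /intermittent-error/:n fails ~25% of requests
        // with a 503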
Spark.before(CrawlerTestMain::before);
Spark.after(CrawlerTestMain::after);
Spark.get("/rate-limit/", CrawlerTestMain::index);
Spark.get("/rate-limit/:n", CrawlerTestMain::n);
Spark.before("/rate-limit/:n", CrawlerTestMain::rateLimitRequest);
Spark.before("/intermittent-error/:n", CrawlerTestMain::simulateRandomTimeouts);
Spark.get("/intermittent-error/", CrawlerTestMain::index);
Spark.get("/intermittent-error/:n", CrawlerTestMain::n);
}
private static void rateLimitRequest(Request request, Response response) {
if (!rateLimiter60RPM.tryConsume(1)) {
Spark.halt(429);
}
}
private static void simulateRandomTimeouts(Request request, Response response) {
if (Math.random() < 0.25) {
System.out.println("Simulating error");
Spark.halt(503);
}
}
public static void before(Request request, Response response) {
        // log every incoming request; successful fetches are recorded in after()
        System.out.println(request.pathInfo());
}
public static void after(Request request, Response response) {
if (response.status() < 300) {
successfullyFetched.add(request.pathInfo());
}
}
private static Object n(Request request, Response response) {
int num = Integer.parseInt(request.params("n"));
return """
<html>
<head>
                <title>Index</title>
                </head>
                <body>
<h1>Index</h1>
""" +
String.format("<a href=\"%d\">Next</a>, <a href=\"%d\">Next 2</a>", num+1, num+2)
+
"""
<p>
Goddess, sing me the anger, of Achilles, Peleus' son, that fatal anger that brought countless
sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their
bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment.
Sing of it from the moment when Agamemnon, Atreus' son, that king of men, parted in wrath from noble Achilles.
Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an
evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest.
He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a
golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the
two leaders of armies, those sons of Atreus: Atreides, and all you bronze-greaved Achaeans, may the gods who
live on Olympus grant you to sack Priam's city, and sail back home in safety; but take this ransom, and free
my darling child; show reverence for Zeus's son, far-striking Apollo.
""";
}
private static Object index(Request request, Response response) {
return """
<html>
<head>
                <title>Index</title>
                </head>
                <body>
<h1>Index</h1>
<a href="0">Next</a>
<p>
Goddess, sing me the anger, of Achilles, Peleus' son, that fatal anger that brought countless
sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their
bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment.
Sing of it from the moment when Agamemnon, Atreus' son, that king of men, parted in wrath from noble Achilles.
Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an
evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest.
He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a
golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the
two leaders of armies, those sons of Atreus: Atreides, and all you bronze-greaved Achaeans, may the gods who
live on Olympus grant you to sack Priam's city, and sail back home in safety; but take this ransom, and free
my darling child; show reverence for Zeus's son, far-striking Apollo.
""";
}
}

View File

@@ -1,7 +1,11 @@
package nu.marginalia.wmsa.edge.crawling.model;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import java.util.List;
@AllArgsConstructor @NoArgsConstructor
public class CrawlingSpecification {
public String id;

View File

@@ -28,7 +28,7 @@ import static java.lang.Math.max;
import static java.lang.Math.min;
public class CrawlerRetreiver {
private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 250);
private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 500);
private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
private final LinkedList<EdgeUrl> queue = new LinkedList<>();
@@ -75,7 +75,7 @@ public class CrawlerRetreiver {
if (queue.peek() != null) {
var fst = queue.peek();
-            var root = fst.domain.toRootUrl();
+            var root = fst.withPathAndParam("/", null);
if (known.add(root.toString()))
queue.addFirst(root);
}
@@ -117,7 +117,7 @@ public class CrawlerRetreiver {
.build());
}
-        var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl());
+        var fetchResult = fetcher.probeDomain(fst.withPathAndParam("/", null));
if (!fetchResult.ok()) {
logger.debug("Bad status on {}", domain);
return Optional.of(createErrorPostFromStatus(fetchResult));

View File

@@ -111,7 +111,7 @@ public class HttpFetcher {
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgent)
-                .url(url.domain.toRootUrl().toString())
+                .url(url.toString())
.build();
var call = client.newCall(head);