From 5e67391829e4ffc7b5283aaf76645a1c3e90c6f8 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 16 Sep 2022 16:52:30 +0200 Subject: [PATCH] Some small crawler tweaks, plus a test for examining crawler behavior through a simulated server. --- .../wmsa/edge/EdgeCrawlBehaviorE2ETest.java | 105 ++++++++++++++++ marginalia_nu/src/e2e/resources/crawl-mock.sh | 19 +++ .../util/language/LanguageFilter.java | 12 +- .../util/language/UnicodeRanges.java | 17 ++- .../wmsa/configuration/ServiceDescriptor.java | 1 + .../configuration/command/CrawlCommand.java | 24 ++++ .../processor/DocumentProcessor.java | 7 ++ .../edge/crawling/CrawlJobExtractorMain.java | 10 ++ .../wmsa/edge/crawling/CrawlerTestMain.java | 116 ++++++++++++++++++ .../crawling/model/CrawlingSpecification.java | 4 + .../crawling/retreival/CrawlerRetreiver.java | 6 +- .../edge/crawling/retreival/HttpFetcher.java | 2 +- 12 files changed, 312 insertions(+), 11 deletions(-) create mode 100644 marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeCrawlBehaviorE2ETest.java create mode 100644 marginalia_nu/src/e2e/resources/crawl-mock.sh create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeCrawlBehaviorE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeCrawlBehaviorE2ETest.java new file mode 100644 index 00000000..f8325d9d --- /dev/null +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeCrawlBehaviorE2ETest.java @@ -0,0 +1,105 @@ +package nu.marginalia.wmsa.edge; + + +import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain; +import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.BindMode; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.utility.MountableFile; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +@Tag("e2e") +@Testcontainers +public class EdgeCrawlBehaviorE2ETest extends E2ETestBase { + @Container + public static GenericContainer mockContainer = new GenericContainer<>("openjdk:17-alpine") + .withCopyFileToContainer(jarFile(), "/WMSA.jar") + .withNetwork(network) + .withNetworkAliases("mock", "mock2") + .withExposedPorts(8080) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("mock"))) + .withCommand("java","-cp","WMSA.jar","nu.marginalia.wmsa.edge.crawling.CrawlerTestMain") + ; + + + @Container + public static GenericContainer crawlerContainer = new GenericContainer<>("openjdk:17-alpine") + .dependsOn(mockContainer) + .withNetwork(network) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler"))) + .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) + .withCopyFileToContainer(ipDatabasePath(), "/var/lib/wmsa/data/IP2LOCATION-LITE-DB1.CSV") + .withCopyFileToContainer(jarFile(), "/WMSA.jar") + .withCopyFileToContainer(MountableFile.forClasspathResource("crawl-mock.sh"), "/crawl-mock.sh") + .withFileSystemBind(getMockCrawlPath(), "/crawl/", BindMode.READ_WRITE) + .withCommand("sh", "crawl-mock.sh") + .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10))); + + + private static String getMockCrawlPath() { + Path crawlFiles = getCrawlPath(); + + + List urls = new ArrayList<>(); + try { + Files.createDirectories(crawlFiles); + + Files.writeString(crawlFiles.resolve("crawl.plan"), """ + jobSpec: "/crawl/crawl.spec" + crawl: + dir: "/crawl/crawl" + logName: "crawl.log" + process: + dir: "/crawl/process" + logName: "process.log" + """); + + Files.createDirectories(crawlFiles.resolve("crawl")); + Files.createDirectories(crawlFiles.resolve("process")); + Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log")); + Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log")); + + CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"), + new CrawlingSpecification("111111", 20, "mock", List.of("http://mock:8080/rate-limit/")), + new CrawlingSpecification("222222", 20, "mock2", List.of("http://mock2:8080/intermittent-error/"))); + } + catch (IOException ex) { + ex.printStackTrace(); + } + return crawlFiles.toString(); + } + + + public static MountableFile ipDatabasePath() { + Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV"); + if (!Files.isRegularFile(modelsPath)) { + System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); + throw new RuntimeException(); + } + return MountableFile.forHostPath(modelsPath.toString()); + } + + private static Path getCrawlPath() { + return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl"); + } + + @Test + public void testRunTheThing() throws IOException { + // This is a test for examining the interaction between the crawler and various + // set-ups + } + +} diff --git a/marginalia_nu/src/e2e/resources/crawl-mock.sh b/marginalia_nu/src/e2e/resources/crawl-mock.sh new file mode 100644 index 00000000..4270929e --- /dev/null +++ b/marginalia_nu/src/e2e/resources/crawl-mock.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +mkdir -p /var/lib/wmsa/conf/ +mkdir -p /var/lib/wmsa/data/ + +echo "search.marginalia.nu" > /var/lib/wmsa/conf/user-agent + +cat crawl/crawl.plan +cat << EOF + #### ##### ## # # # + # # # # # # # # # + # # # # # # # # + # ##### ###### # ## # # + # # # # # # ## ## # + #### # # # # # # ###### +EOF +java -jar WMSA.jar crawl crawl/crawl.plan + +echo "ALL DONE" \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java index c3653bc9..7649201b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/LanguageFilter.java @@ -9,7 +9,10 @@ import javax.inject.Inject; import javax.inject.Singleton; import java.io.BufferedReader; import java.io.InputStreamReader; -import java.util.*; +import java.util.HashSet; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; @Singleton public class LanguageFilter { @@ -78,7 +81,10 @@ public class LanguageFilter { } public boolean isBlockedUnicodeRange(String data) { - return Arrays.stream(UnicodeRanges.values()) - .parallel().anyMatch(range -> range.test(data)); + for (var range: UnicodeRanges.values()) { + if (range.test(data)) + return true; + } + return false; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java index ef46ee0b..e5c22406 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/UnicodeRanges.java @@ -68,10 +68,19 @@ public enum UnicodeRanges { this.max = max; } - boolean test(String text) { - return text.chars().limit(1000).parallel() - .filter(i -> i >= min && i < max) - .count() >= (sensitive ? 15 : 100); + public boolean test(String text) { + int count = 0; + int max = sensitive ? 15 : 100; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (c >= min && c <= max) { + if (count++ > max) { + return true; + } + } + } + + return false; } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java index 4fc59afe..b06a05a2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java @@ -80,6 +80,7 @@ public enum ServiceDescriptor { Map functions = Stream.of(new ListCommand(), new StartCommand(), new ConvertCommand(), + new CrawlCommand(), new LoadCommand(), new ReindexCommand(), new VersionCommand(), diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java new file mode 100644 index 00000000..07c291bb --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/CrawlCommand.java @@ -0,0 +1,24 @@ +package nu.marginalia.wmsa.configuration.command; + +import lombok.SneakyThrows; +import nu.marginalia.wmsa.edge.crawling.CrawlerMain; + +import java.util.Arrays; + +public class CrawlCommand extends Command { + public CrawlCommand() { + super("crawl"); + } + + @Override + @SneakyThrows + public void execute(String... args) { + if (args.length < 2) { + System.err.println("Usage: crawl plan.yaml"); + System.exit(255); + } + + String[] args2 = Arrays.copyOfRange(args, 1, args.length); + CrawlerMain.main(args2); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 5037c791..99c344cd 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -162,16 +162,23 @@ public class DocumentProcessor { private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException { + if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) { + throw new DisqualifiedException(DisqualificationReason.LANGUAGE); + } + Document doc = Jsoup.parse(crawledDocument.documentBody); if (AcceptableAds.hasAcceptableAdsTag(doc)) { throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); } + if (doc.select("meta[name=robots]").attr("content").contains("noindex")) { throw new DisqualifiedException(DisqualificationReason.FORBIDDEN); } Document prunedDoc = doc.clone(); + + prunedDoc.getElementsByTag("svg").remove(); prunedDoc.body().filter(new DomPruningFilter(0.5)); var dld = sentenceExtractor.extractSentences(prunedDoc); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java index 3281de8c..95205f63 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java @@ -115,6 +115,16 @@ public class CrawlJobExtractorMain { } } + public static void writeSpec(Path outFile, CrawlingSpecification... specs) throws IOException { + Gson gson = GsonFactory.get(); + + try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { + for (var spec : specs) { + out.println(gson.toJson(spec)); + } + } + } + private record DomainWithId(String domainName, int id) { } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java new file mode 100644 index 00000000..b26f501a --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerTestMain.java @@ -0,0 +1,116 @@ +package nu.marginalia.wmsa.edge.crawling; + +import io.github.bucket4j.Bandwidth; +import io.github.bucket4j.Bucket; +import io.github.bucket4j.Refill; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +public class CrawlerTestMain { + + static Bucket rateLimiter60RPM; + static List successfullyFetched = new ArrayList<>(); + + public static void main(String... args) { + var refill = Refill.greedy(1, Duration.ofSeconds(1)); + + var bw = Bandwidth.classic(10, refill); + rateLimiter60RPM = Bucket.builder().addLimit(bw).build(); + + Spark.port(8080); + Spark.before(CrawlerTestMain::before); + Spark.after(CrawlerTestMain::after); + Spark.get("/rate-limit/", CrawlerTestMain::index); + Spark.get("/rate-limit/:n", CrawlerTestMain::n); + + Spark.before("/rate-limit/:n", CrawlerTestMain::rateLimitRequest); + Spark.before("/intermittent-error/:n", CrawlerTestMain::simulateRandomTimeouts); + + Spark.get("/intermittent-error/", CrawlerTestMain::index); + Spark.get("/intermittent-error/:n", CrawlerTestMain::n); + + } + + private static void rateLimitRequest(Request request, Response response) { + if (!rateLimiter60RPM.tryConsume(1)) { + Spark.halt(429); + } + } + + private static void simulateRandomTimeouts(Request request, Response response) { + if (Math.random() < 0.25) { + System.out.println("Simulating error"); + Spark.halt(503); + } + } + + public static void before(Request request, Response response) { + System.out.println(request.pathInfo()); + successfullyFetched.add(request.pathInfo()); + } + public static void after(Request request, Response response) { + if (response.status() < 300) { + successfullyFetched.add(request.pathInfo()); + } + } + + private static Object n(Request request, Response response) { + + int num = Integer.parseInt(request.params("n")); + return """ + + + Index + +

Index

+ """ + + String.format("Next, Next 2", num+1, num+2) + + + + """ + +

+ Goddess, sing me the anger, of Achilles, Peleus’ son, that fatal anger that brought countless + sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their + bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment. + + Sing of it from the moment when Agamemnon, Atreus’ son, that king of men, parted in wrath from noble Achilles. + Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an + evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest. + He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a + golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the + two leaders of armies, those sons of Atreus: ‘Atreides, and all you bronze-greaved Achaeans, may the gods who + live on Olympus grant you to sack Priam’s city, and sail back home in safety; but take this ransom, and free + my darling child; show reverence for Zeus’s son, far-striking Apollo.’ + """; + } + + private static Object index(Request request, Response response) { + return """ + + + Index + +

Index

+ Next +

+ Goddess, sing me the anger, of Achilles, Peleus’ son, that fatal anger that brought countless + sorrows on the Greeks, and sent many valiant souls of warriors down to Hades, leaving their + bodies as spoil for dogs and carrion birds: for thus was the will of Zeus brought to fulfilment. + + Sing of it from the moment when Agamemnon, Atreus’ son, that king of men, parted in wrath from noble Achilles. + Which of the gods set these two to quarrel? Apollo, the son of Leto and Zeus, angered by the king, brought an + evil plague on the army, so that the men were dying, for the son of Atreus had dishonoured Chryses the priest. + He it was who came to the swift Achaean ships, to free his daughter, bringing a wealth of ransom, carrying a + golden staff adorned with the ribbons of far-striking Apollo, and called out to the Achaeans, above all to the + two leaders of armies, those sons of Atreus: ‘Atreides, and all you bronze-greaved Achaeans, may the gods who + live on Olympus grant you to sack Priam’s city, and sail back home in safety; but take this ransom, and free + my darling child; show reverence for Zeus’s son, far-striking Apollo.’ + """; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java index d55cd2bb..57298c84 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawlingSpecification.java @@ -1,7 +1,11 @@ package nu.marginalia.wmsa.edge.crawling.model; +import lombok.AllArgsConstructor; +import lombok.NoArgsConstructor; + import java.util.List; +@AllArgsConstructor @NoArgsConstructor public class CrawlingSpecification { public String id; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 14716fbf..ecd85756 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -28,7 +28,7 @@ import static java.lang.Math.max; import static java.lang.Math.min; public class CrawlerRetreiver { - private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 250); + private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 500); private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500); private final LinkedList queue = new LinkedList<>(); @@ -75,7 +75,7 @@ public class CrawlerRetreiver { if (queue.peek() != null) { var fst = queue.peek(); - var root = fst.domain.toRootUrl(); + var root = fst.withPathAndParam("/", null); if (known.add(root.toString())) queue.addFirst(root); } @@ -117,7 +117,7 @@ public class CrawlerRetreiver { .build()); } - var fetchResult = fetcher.probeDomain(fst.domain.toRootUrl()); + var fetchResult = fetcher.probeDomain(fst.withPathAndParam("/", null)); if (!fetchResult.ok()) { logger.debug("Bad status on {}", domain); return Optional.of(createErrorPostFromStatus(fetchResult)); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index 3b7239c0..d215d66e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -111,7 +111,7 @@ public class HttpFetcher { @SneakyThrows public FetchResult probeDomain(EdgeUrl url) { var head = new Request.Builder().head().addHeader("User-agent", userAgent) - .url(url.domain.toRootUrl().toString()) + .url(url.toString()) .build(); var call = client.newCall(head);