From 7611b7900d8595321735f54c6b118f0e8b8785d8 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sat, 29 Jul 2023 19:16:31 +0200
Subject: [PATCH] (crawler) Reduce long term memory allocation in DomainCrawlFrontier

(crawler) Reduce long term memory allocation in DomainCrawlFrontier
---
 code/processes/crawling-process/build.gradle  |  2 +
 .../crawl/retreival/DomainCrawlFrontier.java  | 62 ++++++++++++++-----
 .../retreival/DomainCrawlFrontierTest.java    | 32 ++++++++++
 3 files changed, 82 insertions(+), 14 deletions(-)
 create mode 100644 code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java

diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle
index 48068620..fcc7862d 100644
--- a/code/processes/crawling-process/build.gradle
+++ b/code/processes/crawling-process/build.gradle
@@ -52,6 +52,8 @@ dependencies {
     implementation libs.jsoup
     implementation libs.opencsv
     implementation libs.rxjava
+    implementation libs.fastutil
+    implementation libs.bundles.mariadb
 
     testImplementation libs.bundles.slf4j.test
 
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
index 4b9cc265..4b1b9ad1 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java
@@ -1,16 +1,28 @@
 package nu.marginalia.crawl.retreival;
 
+import com.google.common.hash.HashFunction;
+import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 
+import java.net.URISyntaxException;
 import java.util.*;
 import java.util.function.Predicate;
 
 public class DomainCrawlFrontier {
-    private final LinkedList<EdgeUrl> queue = new LinkedList<>();
-    private final HashSet<String> visited;
-    private final HashSet<String> known;
+    private final ArrayDeque<String> queue;
+
+    // To save the number of strings kept in memory,
+    // do an approximate check using 64 bit hashes instead
+    // ..
+    // This isn't perfect, and may lead to false positives,
+    // but this is relatively unlikely, since the cardinality of these
+    // need to be in the billions to approach Birthday Paradox
+    // territory
+    private final LongOpenHashSet visited;
+    private final LongOpenHashSet known;
+    private final HashFunction hasher = com.google.common.hash.Hashing.murmur3_128();
 
     private final EdgeDomain thisDomain;
     private final UrlBlocklist urlBlocklist;
@@ -24,8 +36,9 @@ public class DomainCrawlFrontier {
         this.urlBlocklist = new UrlBlocklist();
         this.depth = depth;
 
-        visited = new HashSet<>((int)(urls.size() * 1.5));
-        known = new HashSet<>(urls.size() * 10);
+        queue = new ArrayDeque<>(10 + (int) (urls.size()*1.2));
+        visited = new LongOpenHashSet(10 + (int)(urls.size() * 1.5));
+        known = new LongOpenHashSet(10 + urls.size() * 2);
 
         for (String urlStr : urls) {
             EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
@@ -48,21 +61,42 @@ public class DomainCrawlFrontier {
     }
 
     public void addFirst(EdgeUrl url) {
-        if (known.add(url.toString())) {
-            queue.addFirst(url);
+        if (addKnown(url)) {
+            queue.addFirst(url.toString());
         }
     }
 
     public EdgeUrl takeNextUrl() {
-        return queue.removeFirst();
+        try {
+            return new EdgeUrl(queue.removeFirst());
+        } catch (URISyntaxException e) {
+            // This should never happen since we only add urls via EdgeUrl.toString()
+            throw new RuntimeException(e);
+        }
     }
 
     public EdgeUrl peek() {
-        return queue.peek();
+        try {
+            return new EdgeUrl(queue.peek());
+        } catch (URISyntaxException e) {
+            // This should never happen since we only add urls via EdgeUrl.toString()
+            throw new RuntimeException(e);
+        }
     }
 
     public boolean addVisited(EdgeUrl url) {
-        return visited.add(url.toString());
+        long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
+
+        return visited.add(hashCode);
+    }
+    public boolean addKnown(EdgeUrl url) {
+        long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
+        return known.add(hashCode);
+    }
+
+    boolean isVisited(EdgeUrl url) {
+        long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
+        return visited.contains(hashCode);
     }
 
     public boolean filterLink(EdgeUrl url) {
@@ -80,14 +114,14 @@
             return;
 
        // reduce memory usage by not growing queue huge when crawling large sites
-        if (queue.size() + visited.size() >= depth + 1000)
+        if (queue.size() + visited.size() >= depth + 200)
             return;
 
-        if (visited.contains(url.toString()))
+        if (isVisited(url))
             return;
 
-        if (known.add(url.toString())) {
-            queue.addLast(url);
+        if (addKnown(url)) {
+            queue.addLast(url.toString());
         }
     }
 
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java
new file mode 100644
index 00000000..1396444b
--- /dev/null
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/DomainCrawlFrontierTest.java
@@ -0,0 +1,32 @@
+package nu.marginalia.crawl.retreival;
+
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.Test;
+
+import java.net.URISyntaxException;
+import java.util.Set;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class DomainCrawlFrontierTest {
+
+    @Test
+    public void testVisited() throws URISyntaxException {
+        var dcf = new DomainCrawlFrontier(new EdgeDomain("example.com"), Set.of(), 100);
+
+        assertTrue(dcf.addVisited(new EdgeUrl("https://example.com")));
+        assertTrue(dcf.isVisited(new EdgeUrl("https://example.com")));
+        assertFalse(dcf.addVisited(new EdgeUrl("https://example.com")));
+    }
+
+    @Test
+    public void testKnown() throws URISyntaxException {
+        var dcf = new DomainCrawlFrontier(new EdgeDomain("example.com"), Set.of(), 100);
+
+        assertTrue(dcf.addKnown(new EdgeUrl("https://example.com")));
+        assertFalse(dcf.addKnown(new EdgeUrl("https://example.com/")));
+        assertTrue(dcf.addKnown(new EdgeUrl("https://example.com/index.html")));
+        assertFalse(dcf.addKnown(new EdgeUrl("https://example.com")));
+    }
+}
\ No newline at end of file
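
Note on the core technique: the patch tracks visited/known URLs as 64-bit murmur3 hashes stored in primitive fastutil sets instead of retaining the URL strings. The sketch below is a minimal standalone illustration of that membership trick under the same Guava + fastutil dependencies the patch adds; it is not part of the patch, and the HashedUrlSet class and its method names are purely illustrative.

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;

// Illustrative only: remembers 64-bit murmur3 hashes of URLs rather than the
// URL strings themselves, mirroring how the patch keeps its visited/known sets.
// False positives (hash collisions) are possible but extremely unlikely.
class HashedUrlSet {
    private final HashFunction hasher = Hashing.murmur3_128();
    private final LongOpenHashSet seen = new LongOpenHashSet();

    /** Returns true if the url had not been added before (barring a collision). */
    boolean add(String url) {
        return seen.add(hasher.hashUnencodedChars(url).padToLong());
    }

    boolean contains(String url) {
        return seen.contains(hasher.hashUnencodedChars(url).padToLong());
    }
}

With 64-bit hashes, the birthday-bound collision probability only becomes appreciable once the number of distinct URLs approaches the billions, far beyond any per-domain crawl depth, which is the false-positive risk the source comment alludes to.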