mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(crawler) Reduce long term memory allocation in DomainCrawlFrontier
(crawler) Reduce long term memory allocation in DomainCrawlFrontier
This commit is contained in:
parent
9ad32ee9c7
commit
7611b7900d
@ -52,6 +52,8 @@ dependencies {
|
|||||||
implementation libs.jsoup
|
implementation libs.jsoup
|
||||||
implementation libs.opencsv
|
implementation libs.opencsv
|
||||||
implementation libs.rxjava
|
implementation libs.rxjava
|
||||||
|
implementation libs.fastutil
|
||||||
|
|
||||||
implementation libs.bundles.mariadb
|
implementation libs.bundles.mariadb
|
||||||
|
|
||||||
testImplementation libs.bundles.slf4j.test
|
testImplementation libs.bundles.slf4j.test
|
||||||
|
@ -1,16 +1,28 @@
|
|||||||
package nu.marginalia.crawl.retreival;
|
package nu.marginalia.crawl.retreival;
|
||||||
|
|
||||||
|
import com.google.common.hash.HashFunction;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||||
import nu.marginalia.ip_blocklist.UrlBlocklist;
|
import nu.marginalia.ip_blocklist.UrlBlocklist;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
|
import java.net.URISyntaxException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
public class DomainCrawlFrontier {
|
public class DomainCrawlFrontier {
|
||||||
private final LinkedList<EdgeUrl> queue = new LinkedList<>();
|
private final ArrayDeque<String> queue;
|
||||||
private final HashSet<String> visited;
|
|
||||||
private final HashSet<String> known;
|
// To reduce the number of strings kept in memory,
|
||||||
|
// do an approximate check using 64 bit hashes instead
|
||||||
|
// ..
|
||||||
|
// This isn't perfect, and may lead to false positives,
|
||||||
|
// but this is relatively unlikely, since the cardinality of these sets
|
||||||
|
// would need to be in the billions to approach Birthday Paradox
|
||||||
|
// territory
|
||||||
|
private final LongOpenHashSet visited;
|
||||||
|
private final LongOpenHashSet known;
|
||||||
|
private final HashFunction hasher = com.google.common.hash.Hashing.murmur3_128();
|
||||||
|
|
||||||
private final EdgeDomain thisDomain;
|
private final EdgeDomain thisDomain;
|
||||||
private final UrlBlocklist urlBlocklist;
|
private final UrlBlocklist urlBlocklist;
|
||||||
@ -24,8 +36,9 @@ public class DomainCrawlFrontier {
|
|||||||
this.urlBlocklist = new UrlBlocklist();
|
this.urlBlocklist = new UrlBlocklist();
|
||||||
this.depth = depth;
|
this.depth = depth;
|
||||||
|
|
||||||
visited = new HashSet<>((int)(urls.size() * 1.5));
|
queue = new ArrayDeque<>(10 + (int) (urls.size()*1.2));
|
||||||
known = new HashSet<>(urls.size() * 10);
|
visited = new LongOpenHashSet(10 + (int)(urls.size() * 1.5));
|
||||||
|
known = new LongOpenHashSet(10 + urls.size() * 2);
|
||||||
|
|
||||||
for (String urlStr : urls) {
|
for (String urlStr : urls) {
|
||||||
EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
|
EdgeUrl.parse(urlStr).ifPresent(this::addToQueue);
|
||||||
@ -48,21 +61,42 @@ public class DomainCrawlFrontier {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void addFirst(EdgeUrl url) {
|
public void addFirst(EdgeUrl url) {
|
||||||
if (known.add(url.toString())) {
|
if (addKnown(url)) {
|
||||||
queue.addFirst(url);
|
queue.addFirst(url.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgeUrl takeNextUrl() {
|
public EdgeUrl takeNextUrl() {
|
||||||
return queue.removeFirst();
|
try {
|
||||||
|
return new EdgeUrl(queue.removeFirst());
|
||||||
|
} catch (URISyntaxException e) {
|
||||||
|
// This should never happen since we only add URLs via EdgeUrl.toString()
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgeUrl peek() {
|
public EdgeUrl peek() {
|
||||||
return queue.peek();
|
try {
|
||||||
|
return new EdgeUrl(queue.peek());
|
||||||
|
} catch (URISyntaxException e) {
|
||||||
|
// This should never happen since we only add URLs via EdgeUrl.toString()
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean addVisited(EdgeUrl url) {
|
public boolean addVisited(EdgeUrl url) {
|
||||||
return visited.add(url.toString());
|
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
|
||||||
|
|
||||||
|
return visited.add(hashCode);
|
||||||
|
}
|
||||||
|
public boolean addKnown(EdgeUrl url) {
|
||||||
|
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
|
||||||
|
return known.add(hashCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean isVisited(EdgeUrl url) {
|
||||||
|
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
|
||||||
|
return visited.contains(hashCode);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean filterLink(EdgeUrl url) {
|
public boolean filterLink(EdgeUrl url) {
|
||||||
@ -80,14 +114,14 @@ public class DomainCrawlFrontier {
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
// reduce memory usage by not growing queue huge when crawling large sites
|
// reduce memory usage by not growing queue huge when crawling large sites
|
||||||
if (queue.size() + visited.size() >= depth + 1000)
|
if (queue.size() + visited.size() >= depth + 200)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (visited.contains(url.toString()))
|
if (isVisited(url))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (known.add(url.toString())) {
|
if (addKnown(url)) {
|
||||||
queue.addLast(url);
|
queue.addLast(url.toString());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,32 @@
|
|||||||
|
package nu.marginalia.crawl.retreival;
|
||||||
|
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class DomainCrawlFrontierTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testVisited() throws URISyntaxException {
|
||||||
|
var dcf = new DomainCrawlFrontier(new EdgeDomain("example.com"), Set.of(), 100);
|
||||||
|
|
||||||
|
assertTrue(dcf.addVisited(new EdgeUrl("https://example.com")));
|
||||||
|
assertTrue(dcf.isVisited(new EdgeUrl("https://example.com")));
|
||||||
|
assertFalse(dcf.addVisited(new EdgeUrl("https://example.com")));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testKnown() throws URISyntaxException {
|
||||||
|
var dcf = new DomainCrawlFrontier(new EdgeDomain("example.com"), Set.of(), 100);
|
||||||
|
|
||||||
|
assertTrue(dcf.addKnown(new EdgeUrl("https://example.com")));
|
||||||
|
assertFalse(dcf.addKnown(new EdgeUrl("https://example.com/")));
|
||||||
|
assertTrue(dcf.addKnown(new EdgeUrl("https://example.com/index.html")));
|
||||||
|
assertFalse(dcf.addKnown(new EdgeUrl("https://example.com")));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user