mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawler) Switch hash function in crawler
Guava's hashers are a bit allocation-hungry and a big driver of GC churn in the crawler. This commit switches to the modified Murmur hash function used throughout Marginalia.
This commit is contained in:
parent
3ea1ddae22
commit
9e5fe71f5b
@ -42,6 +42,7 @@ dependencies {
|
|||||||
implementation project(':code:features-crawl:crawl-blocklist')
|
implementation project(':code:features-crawl:crawl-blocklist')
|
||||||
implementation project(':code:features-crawl:link-parser')
|
implementation project(':code:features-crawl:link-parser')
|
||||||
implementation project(':code:features-crawl:content-type')
|
implementation project(':code:features-crawl:content-type')
|
||||||
|
implementation project(':third-party:commons-codec')
|
||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
|
|
||||||
|
@ -1,7 +1,5 @@
|
|||||||
package nu.marginalia.crawl.retreival;
|
package nu.marginalia.crawl.retreival;
|
||||||
|
|
||||||
import com.google.common.hash.HashFunction;
|
|
||||||
import com.google.common.hash.Hashing;
|
|
||||||
import crawlercommons.robots.SimpleRobotRules;
|
import crawlercommons.robots.SimpleRobotRules;
|
||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.contenttype.ContentType;
|
import nu.marginalia.contenttype.ContentType;
|
||||||
@ -19,7 +17,6 @@ import nu.marginalia.model.EdgeDomain;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
|
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.crawl.retreival;
|
package nu.marginalia.crawl.retreival;
|
||||||
|
|
||||||
import com.google.common.hash.HashFunction;
|
|
||||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||||
|
import nu.marginalia.hash.MurmurHash3_128;
|
||||||
import nu.marginalia.ip_blocklist.UrlBlocklist;
|
import nu.marginalia.ip_blocklist.UrlBlocklist;
|
||||||
import nu.marginalia.link_parser.LinkParser;
|
import nu.marginalia.link_parser.LinkParser;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
@ -16,6 +16,7 @@ public class DomainCrawlFrontier {
|
|||||||
|
|
||||||
private static final LinkParser linkParser = new LinkParser();
|
private static final LinkParser linkParser = new LinkParser();
|
||||||
|
|
||||||
|
private static final MurmurHash3_128 hasher = new MurmurHash3_128();
|
||||||
private final ArrayDeque<String> queue;
|
private final ArrayDeque<String> queue;
|
||||||
|
|
||||||
// To save the number of strings kept in memory,
|
// To save the number of strings kept in memory,
|
||||||
@ -27,7 +28,6 @@ public class DomainCrawlFrontier {
|
|||||||
// territory
|
// territory
|
||||||
private final LongOpenHashSet visited;
|
private final LongOpenHashSet visited;
|
||||||
private final LongOpenHashSet known;
|
private final LongOpenHashSet known;
|
||||||
private final HashFunction hasher = com.google.common.hash.Hashing.murmur3_128();
|
|
||||||
|
|
||||||
private final EdgeDomain thisDomain;
|
private final EdgeDomain thisDomain;
|
||||||
private final UrlBlocklist urlBlocklist;
|
private final UrlBlocklist urlBlocklist;
|
||||||
@ -98,17 +98,17 @@ public class DomainCrawlFrontier {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean addVisited(EdgeUrl url) {
|
public boolean addVisited(EdgeUrl url) {
|
||||||
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
|
long hashCode = hasher.hashNearlyASCII(url.toString());
|
||||||
|
|
||||||
return visited.add(hashCode);
|
return visited.add(hashCode);
|
||||||
}
|
}
|
||||||
public boolean addKnown(EdgeUrl url) {
|
public boolean addKnown(EdgeUrl url) {
|
||||||
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
|
long hashCode = hasher.hashNearlyASCII(url.toString());
|
||||||
return known.add(hashCode);
|
return known.add(hashCode);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean isVisited(EdgeUrl url) {
|
public boolean isVisited(EdgeUrl url) {
|
||||||
long hashCode = hasher.hashUnencodedChars(url.toString()).padToLong();
|
long hashCode = hasher.hashNearlyASCII(url.toString());
|
||||||
return visited.contains(hashCode);
|
return visited.contains(hashCode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user