From a1eb8375a24712ab646fad4ffe8d093eed4ebb51 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 18 Aug 2022 19:05:03 +0200 Subject: [PATCH 1/2] Exclude wp-content/uploads from crawling --- .../nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java | 1 + .../java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java | 1 + 2 files changed, 2 insertions(+) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java index b8064952..40dbaa0d 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/blocklist/UrlBlocklist.java @@ -19,6 +19,7 @@ public class UrlBlocklist { public UrlBlocklist() { // Don't deep-crawl git repos patterns.add(Pattern.compile("\\.git/.+").asPredicate()); + patterns.add(Pattern.compile("wp-content/upload").asPredicate()); // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate()); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java index c93e1ffb..2987fde0 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/UrlBlocklistTest.java @@ -16,6 +16,7 @@ class UrlBlocklistTest { UrlBlocklist blocklist = new UrlBlocklist(); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://memex.marginalia.nu/ghc/ghc/blob/1b1067d14b656bbbfa7c47f156ec2700c9751549/compiler/main/UpdateCafInfos.hs"))); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("https://memex.marginalia.nu//gn/+/d62642c920e6a0d1756316d225a90fd6faa9e21e"))); + assertTrue(blocklist.isUrlBlocked(new 
EdgeUrl("http://www.marginalia.nu/wp-content/uploads/test.jpg"))); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/pdf/download-a-course-in-algebra.html"))); assertFalse(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/nope/x-a-course-in-algebra.html"))); assertTrue(blocklist.isUrlBlocked(new EdgeUrl("http://yelenasimone.com/_module/slide/pqPan/library/american-sour-beer-innovative-techniques-for-mixed-fermentations/"))); From ede62f2515a8d5937e17599c516ce63abd6c5f47 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Thu, 18 Aug 2022 20:44:44 +0200 Subject: [PATCH 2/2] Retain cookies for domain. --- .../marginalia/wmsa/edge/crawling/retreival/Cookies.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/Cookies.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/Cookies.java index 2fde3091..b19478ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/Cookies.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/Cookies.java @@ -9,21 +9,22 @@ import java.util.List; import java.util.concurrent.ConcurrentHashMap; public class Cookies { - final ThreadLocal<ConcurrentHashMap<HttpUrl, List<Cookie>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new); + final ThreadLocal<ConcurrentHashMap<String, List<Cookie>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new); public CookieJar getJar() { return new CookieJar() { @Override public void saveFromResponse(HttpUrl url, List<Cookie> cookies) { + if (!cookies.isEmpty()) { - cookieJar.get().put(url, cookies); + cookieJar.get().put(url.host(), cookies); } } @Override public List<Cookie> loadForRequest(HttpUrl url) { - return cookieJar.get().getOrDefault(url, Collections.emptyList()); + return cookieJar.get().getOrDefault(url.host(), Collections.emptyList()); } }; }