From 89dd201a7b507a627320dedcf4f3aeb172b2aeb7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 15 Oct 2024 13:48:32 +0200 Subject: [PATCH] (link-parser) Make mailing list blocking optional --- .../java/nu/marginalia/ip_blocklist/UrlBlocklist.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/UrlBlocklist.java b/code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/UrlBlocklist.java index dbd95d61..6150915f 100644 --- a/code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/UrlBlocklist.java +++ b/code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/UrlBlocklist.java @@ -1,7 +1,7 @@ package nu.marginalia.ip_blocklist; -import nu.marginalia.model.EdgeUrl; import nu.marginalia.gregex.GuardedRegexFactory; +import nu.marginalia.model.EdgeUrl; import java.util.ArrayList; import java.util.List; @@ -16,6 +16,8 @@ public class UrlBlocklist { "instagram.com", "youtube.com", "youtu.be", "amzn.to"); + private static final boolean BLOCK_MAILING_LISTS = Boolean.getBoolean("links.block_mailing_lists"); + public UrlBlocklist() { // Don't deep-crawl git repos patterns.add(s -> s.contains(".git/")); @@ -92,6 +94,10 @@ public class UrlBlocklist { } public boolean isMailingListLink(EdgeUrl linkUrl) { + if (!BLOCK_MAILING_LISTS) { + return false; + } + var path = linkUrl.path; if (path.startsWith("/lists/")) { return true;