From f655ec5a5c996abe77bdad6e7e043571db361792 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 10 Dec 2023 17:30:43 +0100 Subject: [PATCH] (*) Refactor GeoIP-related code In this commit, GeoIP-related classes are refactored and relocated to a common library as they are shared across multiple services. The crawler is refactored to enable the GeoIpBlocklist to use the new GeoIpDictionary as the base of its decisions. The converter is modified to query this data to add a geoip:-keyword to documents to permit limiting a search to the country of the hosting server. The commit also adds due BY-SA attribution in the search engine footer for the source of the IP geolocation data. --- .../crawl-blocklist/build.gradle | 1 + .../ip_blocklist/GeoIpBlocklist.java | 66 ++++--------------- .../ip_blocklist/InetAddressCache.java | 2 +- code/libraries/geo-ip/build.gradle | 24 +++++++ code/libraries/geo-ip/readme.md | 6 ++ .../nu/marginalia/geoip}/GeoIpDictionary.java | 28 +++++--- .../model/processed/DomainRecord.java | 1 - .../model/processed/DomainWithIp.java | 9 +++ .../processes/converting-process/build.gradle | 1 + .../converting/processor/DomainProcessor.java | 15 ++++- ...CrawlingThenConvertingIntegrationTest.java | 3 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 7 +- .../crawl/retreival/CrawlerRetreiver.java | 4 +- .../crawl/retreival/DomainProber.java | 24 ++++--- .../retreival/fetcher/HttpFetcherImpl.java | 6 +- .../retreival/CrawlerMockFetcherTest.java | 7 +- .../retreival/CrawlerRetreiverTest.java | 9 +-- .../templates/search/parts/search-footer.hdb | 8 ++- .../assistant-service/build.gradle | 1 + .../domains/DomainInformationService.java | 3 +- settings.gradle | 1 + 21 files changed, 135 insertions(+), 91 deletions(-) create mode 100644 code/libraries/geo-ip/build.gradle create mode 100644 code/libraries/geo-ip/readme.md rename code/{services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains => 
libraries/geo-ip/src/main/java/nu/marginalia/geoip}/GeoIpDictionary.java (80%) diff --git a/code/features-crawl/crawl-blocklist/build.gradle b/code/features-crawl/crawl-blocklist/build.gradle index c131e97b..8288aa0c 100644 --- a/code/features-crawl/crawl-blocklist/build.gradle +++ b/code/features-crawl/crawl-blocklist/build.gradle @@ -15,6 +15,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:libraries:guarded-regex') + implementation project(':code:libraries:geo-ip') implementation libs.notnull diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java index ba896317..79ca6847 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java @@ -1,73 +1,31 @@ package nu.marginalia.ip_blocklist; +import com.google.inject.Inject; import com.google.inject.Singleton; -import com.opencsv.CSVReader; -import com.opencsv.exceptions.CsvValidationException; -import lombok.AllArgsConstructor; -import nu.marginalia.WmsaHome; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileReader; -import java.io.IOException; -import java.net.InetAddress; import java.util.Set; -import java.util.TreeMap; @Singleton public class GeoIpBlocklist { - private final TreeMap ranges = new TreeMap<>(); - + /** These countries are extremely overrepresented among the problematic and spammy domains, + * and blocking them is by far the most effective spam mitigation technique. Sucks we throw + * babies out with the bathwater, but it's undeniably effective. 
+ */ private final Set blacklist = Set.of("CN", "HK"); private final Set graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA"); private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class); - @AllArgsConstructor - static class IpRange { - public final long from; - public final long to; - public final String country; - } + private final GeoIpDictionary ipDictionary; - public GeoIpBlocklist() throws IOException, CsvValidationException { - var resource = WmsaHome.getIPLocationDatabse(); - - try (var reader = new CSVReader(new FileReader(resource.toFile()))) { - for (;;) { - String[] vals = reader.readNext(); - if (vals == null) { - break; - } - if (!(blacklist.contains(vals[2]) || graylist.contains(vals[2]))) { - continue; - } - var range = new GeoIpBlocklist.IpRange(Long.parseLong(vals[0]), - Long.parseLong(vals[1]), - vals[2]); - ranges.put(range.from, range); - } - } - - logger.info("Loaded {} IP ranges", ranges.size()); - } - - public String getCountry(InetAddress address) { - byte[] bytes = address.getAddress(); - long ival = ((long)bytes[0]&0xFF) << 24 | ((long)bytes[1]&0xFF) << 16 | ((long)bytes[2]&0xFF)<< 8 | ((long)bytes[3]&0xFF); - - Long key = ranges.floorKey(ival); - if (null == key) { - return "-"; - } - - var range = ranges.get(key); - if (ival >= key && ival < range.to) { - return range.country; - } - - return "-"; + @Inject + public GeoIpBlocklist(GeoIpDictionary ipDictionary) { + this.ipDictionary = ipDictionary; + ipDictionary.waitReady(); } public boolean isAllowed(EdgeDomain domain) { @@ -85,7 +43,7 @@ public class GeoIpBlocklist { public String getCountry(EdgeDomain domain) { try { - return getCountry(InetAddressCache.getAddress(domain)); + return ipDictionary.getCountry(InetAddressCache.getAddress(domain)); } catch (Throwable ex) { logger.debug("Failed to resolve {}", domain); diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java 
b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java index 728a1f65..ba9a7948 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/InetAddressCache.java @@ -11,7 +11,7 @@ import java.util.concurrent.TimeUnit; // We don't want to torture the DNS by resolving the same links over and over and over again public class InetAddressCache { - private static final Cache cache = CacheBuilder.newBuilder().maximumSize(10_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); + private static final Cache cache = CacheBuilder.newBuilder().maximumSize(1_000_000).expireAfterAccess(1, TimeUnit.HOURS).build(); public static InetAddress getAddress(EdgeDomain domain) throws Throwable { try { return cache.get(domain, ()-> InetAddress.getByName(domain.getAddress())); diff --git a/code/libraries/geo-ip/build.gradle b/code/libraries/geo-ip/build.gradle new file mode 100644 index 00000000..b0180ef8 --- /dev/null +++ b/code/libraries/geo-ip/build.gradle @@ -0,0 +1,24 @@ +plugins { + id 'java' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation project(':code:common:config') + + implementation libs.bundles.slf4j + implementation libs.opencsv + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + +test { + useJUnitPlatform() +} diff --git a/code/libraries/geo-ip/readme.md b/code/libraries/geo-ip/readme.md new file mode 100644 index 00000000..2a81b04b --- /dev/null +++ b/code/libraries/geo-ip/readme.md @@ -0,0 +1,6 @@ +This micro library handles the GeoIP lookups, mappings from IP addresses +to country codes. 
+ +It uses the free ip2location lite database, which is +available from [https://lite.ip2location.com/database/ip-country](https://lite.ip2location.com/database/ip-country) +under a CC-BY-SA 4.0 license. \ No newline at end of file diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java similarity index 80% rename from code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java rename to code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java index e250761e..83789905 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/GeoIpDictionary.java +++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java @@ -1,7 +1,6 @@ -package nu.marginalia.assistant.domains; +package nu.marginalia.geoip; import com.opencsv.CSVReader; -import lombok.AllArgsConstructor; import nu.marginalia.WmsaHome; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -14,12 +13,7 @@ public class GeoIpDictionary { private volatile TreeMap ranges = null; private static final Logger logger = LoggerFactory.getLogger(GeoIpDictionary.class); - @AllArgsConstructor - static class IpRange { - public final long from; - public final long to; - public final String country; - } + record IpRange(long from, long to, String country) {} public GeoIpDictionary() { Thread.ofPlatform().start(() -> { @@ -39,10 +33,28 @@ public class GeoIpDictionary { ranges = dict; logger.info("Loaded {} IP ranges", ranges.size()); } catch (Exception e) { + ranges = new TreeMap<>(); throw new RuntimeException(e); } + finally { + this.notifyAll(); + } }); + } + public boolean isReady() { + return null != ranges; + } + + public boolean waitReady() { + while (null == ranges) { + try { + this.wait(); + } catch (InterruptedException e) { + return false; + } + } + return 
true; } public String getCountry(String ip) { diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java index 6b3491bf..b696829f 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainRecord.java @@ -8,7 +8,6 @@ import org.apache.parquet.schema.*; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; -import java.sql.Array; import java.util.ArrayList; import java.util.List; diff --git a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java index bedae4d5..3782b1b2 100644 --- a/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java +++ b/code/process-models/processed-data/src/main/java/nu/marginalia/model/processed/DomainWithIp.java @@ -1,5 +1,14 @@ package nu.marginalia.model.processed; +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.ToString; + +@AllArgsConstructor +@NoArgsConstructor +@EqualsAndHashCode +@ToString public class DomainWithIp { public String domain; public String ip; diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index faa952fb..979260df 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -41,6 +41,7 @@ dependencies { implementation project(':code:libraries:guarded-regex') implementation project(':code:libraries:easy-lsh') + implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:big-string') implementation 
project(':code:libraries:language-processing') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 00a05257..df682d77 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.logic.links.LinkGraph; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.*; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; @@ -30,6 +31,7 @@ public class DomainProcessor { private final AnchorTagsSource anchorTagsSource; private final AnchorTextKeywords anchorTextKeywords; private final LshDocumentDeduplicator documentDeduplicator; + private final GeoIpDictionary geoIpDictionary; private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -38,17 +40,21 @@ public class DomainProcessor { SiteWords siteWords, AnchorTagsSourceFactory anchorTagsSourceFactory, AnchorTextKeywords anchorTextKeywords, - LshDocumentDeduplicator documentDeduplicator) throws SQLException + LshDocumentDeduplicator documentDeduplicator, GeoIpDictionary geoIpDictionary) throws SQLException { this.documentProcessor = documentProcessor; this.siteWords = siteWords; this.anchorTextKeywords = anchorTextKeywords; this.documentDeduplicator = documentDeduplicator; this.anchorTagsSource = anchorTagsSourceFactory.create(); + this.geoIpDictionary = geoIpDictionary; + } @SneakyThrows public ProcessedDomain process(SerializableCrawlDataStream dataStream) { + 
geoIpDictionary.waitReady(); + var ret = new ProcessedDomain(); List docs = new ArrayList<>(); @@ -107,7 +113,14 @@ public class DomainProcessor { // Add late keywords and features from domain-level information List terms = new ArrayList<>(); + terms.add("ip:"+ip); + + String geoIp = geoIpDictionary.getCountry(ip); + if (!geoIp.isBlank()) { + terms.add("geoip:"+geoIp.toLowerCase()); + } + if (cookies) { terms.add(HtmlFeature.COOKIES.getKeyword()); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 58d8a486..7ef056d2 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -6,6 +6,7 @@ import lombok.SneakyThrows; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.SerializableCrawlDataStream; @@ -75,7 +76,7 @@ public class CrawlingThenConvertingIntegrationTest { private CrawledDomain crawl(CrawlSpecRecord specs) { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); diff --git 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index dc76abde..f824d815 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -11,6 +11,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider; @@ -56,6 +57,7 @@ public class CrawlerMain { private final UserAgent userAgent; private final MessageQueueFactory messageQueueFactory; + private final DomainProber domainProber; private final FileStorageService fileStorageService; private final DbCrawlSpecProvider dbCrawlSpecProvider; private final AnchorTagsSourceFactory anchorTagsSourceFactory; @@ -75,7 +77,7 @@ public class CrawlerMain { @Inject public CrawlerMain(UserAgent userAgent, ProcessHeartbeatImpl heartbeat, - MessageQueueFactory messageQueueFactory, + MessageQueueFactory messageQueueFactory, DomainProber domainProber, FileStorageService fileStorageService, ProcessConfiguration processConfiguration, DbCrawlSpecProvider dbCrawlSpecProvider, @@ -84,6 +86,7 @@ public class CrawlerMain { this.heartbeat = heartbeat; this.userAgent = userAgent; this.messageQueueFactory = messageQueueFactory; + this.domainProber = domainProber; this.fileStorageService = fileStorageService; this.dbCrawlSpecProvider = dbCrawlSpecProvider; this.anchorTagsSourceFactory = anchorTagsSourceFactory; @@ -219,7 +222,7 @@ public class CrawlerMain { var domainLinks = anchorTagsSource.getAnchorTags(domain); - var retreiver = new 
CrawlerRetreiver(fetcher, specification, writer::accept); + var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept); int size = retreiver.fetch(domainLinks, reference); workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index ce5ecb89..b32e0b6c 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -42,7 +42,7 @@ public class CrawlerRetreiver { private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); - private static final DomainProber domainProber = new DomainProber(); + private final DomainProber domainProber; private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; @@ -55,9 +55,11 @@ public class CrawlerRetreiver { private static final String documentWasSameTag = "SAME-BY-COMPARISON"; public CrawlerRetreiver(HttpFetcher fetcher, + DomainProber domainProber, CrawlSpecRecord specs, Consumer writer) { this.fetcher = fetcher; + this.domainProber = domainProber; domain = specs.domain; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java index 67f006d4..fcc005a8 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainProber.java @@ -1,5 +1,7 @@ package nu.marginalia.crawl.retreival; +import com.google.inject.Inject; 
+import com.google.inject.Singleton; import nu.marginalia.crawl.retreival.fetcher.FetchResultState; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawling.model.CrawlerDomainStatus; @@ -11,17 +13,21 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.util.function.Predicate; +@Singleton public class DomainProber { private final Logger logger = LoggerFactory.getLogger(DomainProber.class); - private static IpBlockList ipBlockList; + private final Predicate domainBlacklist; - static { - try { - ipBlockList = new IpBlockList(new GeoIpBlocklist()); - } catch (Exception e) { - throw new RuntimeException(e); - } + @Inject + public DomainProber(IpBlockList ipBlockList) { + this.domainBlacklist = ipBlockList::isAllowed; + } + + /** For testing */ + public DomainProber(Predicate domainBlacklist) { + this.domainBlacklist = domainBlacklist; } /** To detect problems early we do a probing request to the domain before we start crawling it properly. @@ -37,7 +43,7 @@ public class DomainProber { return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs"); } - if (!ipBlockList.isAllowed(firstUrlInQueue.domain)) + if (!domainBlacklist.test(firstUrlInQueue.domain)) return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed"); var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null)); @@ -62,7 +68,7 @@ public class DomainProber { /** This domain redirects to another domain */ public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {} - /** If the retreivala of the probed url was successful, return the url as it was fetched + /** If the retrieval of the probed url was successful, return the url as it was fetched * (which may be different from the url we probed, if we attempted another URL schema). 
* * @param probedUrl The url we successfully probed diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 041ae08d..5720ef34 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -15,7 +15,6 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.crawl.retreival.logic.ContentTypeParser; import okhttp3.*; -import org.apache.commons.collections4.queue.PredicatedQueue; import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; @@ -87,7 +86,10 @@ public class HttpFetcherImpl implements HttpFetcher { } @Inject - public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) { + public HttpFetcherImpl(@Named("user-agent") String userAgent, + Dispatcher dispatcher, + ConnectionPool connectionPool) + { this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index c0df397f..b65e5ae6 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -3,6 +3,7 @@ package nu.marginalia.crawling.retreival; import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlerRetreiver; 
+import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; @@ -68,7 +69,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); @@ -80,7 +81,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); @@ -94,7 +95,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) .fetch(); out.forEach(System.out::println); diff --git 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 147aca68..e7742445 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawling.io.CrawledDomainReader; @@ -53,7 +54,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); var fetchedUrls = data.stream().filter(CrawledDocument.class::isInstance) @@ -82,7 +83,7 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); data.stream().filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) @@ -118,7 +119,7 @@ class CrawlerRetreiverTest { var writer = new CrawledDomainWriter(out, specs.domain, "idid"); Map, List> data = new HashMap<>(); - new CrawlerRetreiver(httpFetcher, specs, d -> { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); @@ 
-136,7 +137,7 @@ class CrawlerRetreiverTest { CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); - new CrawlerRetreiver(httpFetcher, specs, d -> { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { if (d instanceof CrawledDocument doc) { System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); } diff --git a/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb b/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb index f911d3db..88b6ad84 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/parts/search-footer.hdb @@ -99,7 +99,6 @@ + diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index e2c792fb..8609903d 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -33,6 +33,7 @@ dependencies { implementation project(':code:features-search:screenshots') + implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java index 4da309dc..690509db 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java +++ 
b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/domains/DomainInformationService.java @@ -1,9 +1,8 @@ package nu.marginalia.assistant.domains; import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; +import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.assistant.client.model.DomainInformation; import org.slf4j.Logger; diff --git a/settings.gradle b/settings.gradle index 952acd9c..342107de 100644 --- a/settings.gradle +++ b/settings.gradle @@ -12,6 +12,7 @@ include 'code:services-application:dating-service' include 'code:services-application:explorer-service' include 'code:libraries:array' +include 'code:libraries:geo-ip' include 'code:libraries:btree' include 'code:libraries:easy-lsh' include 'code:libraries:guarded-regex'