From 5ab2a22e8855406ae0c54b7d9ee92eed51a82216 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 17 Dec 2023 13:14:23 +0100 Subject: [PATCH 1/5] (search) Fix result count back down to 1 per domain --- .../main/java/nu/marginalia/search/SearchQueryParamFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java index 11eef2a0..7430f6bb 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -30,7 +30,7 @@ public class SearchQueryParamFactory { profile.getSizeLimit(), SpecificationLimit.none(), List.of(), - new QueryLimits(2, 100, 200, 8192), + new QueryLimits(1, 100, 200, 8192), profile.searchSetIdentifier ); From bcad6492d61766fbcf0a7ea7e99affbf029ae7a9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 17 Dec 2023 13:28:17 +0100 Subject: [PATCH 2/5] (sideloader) Fix integration problems with sideloaders In the encyclopedia sideloader, add the class "mw-content-text" that the WikiSpecialization class looks for during pruning, so that the articles get fairer treatment. Also add generator keywords based on the generator type provided, to ensure that these documents show up in appropriate filters. Further, add a new document flag value 'Sideloaded' to be able to distinguish these entries. 
--- .../main/java/nu/marginalia/model/idx/DocumentFlags.java | 2 +- .../converting/sideload/SideloaderProcessing.java | 9 +++++++-- .../encyclopedia/EncyclopediaMarginaliaNuSideloader.java | 9 +++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java index 0c051341..eb2a61b2 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java +++ b/code/common/model/src/main/java/nu/marginalia/model/idx/DocumentFlags.java @@ -8,7 +8,7 @@ public enum DocumentFlags { GeneratorDocs, GeneratorForum, GeneratorWiki, - Unused6, + Sideloaded, Unused7, Unused8, ; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 16a1ae7c..1a034ac2 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -63,6 +63,11 @@ public class SideloaderProcessing { for (String keyword : extraKeywords) ret.words.add(keyword, WordFlags.Subjects.asBit()); + if (type == GeneratorType.WIKI) + ret.words.add("generator:wiki", WordFlags.Subjects.asBit()); + else if (type == GeneratorType.DOCS) + ret.words.add("generator:docs", WordFlags.Subjects.asBit()); + ret.details = details.details(); // Add a few things that we know about the document @@ -80,8 +85,8 @@ public class SideloaderProcessing { PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, switch (type) { - case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki); - case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs); + case WIKI -> EnumSet.of(DocumentFlags.GeneratorWiki, 
DocumentFlags.Sideloaded); + case DOCS -> EnumSet.of(DocumentFlags.GeneratorDocs, DocumentFlags.Sideloaded); default -> EnumSet.noneOf(DocumentFlags.class); }); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index 204aa6a8..8ca80c45 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -110,13 +110,18 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC String fullUrl = baseUrl.toString() + url; StringBuilder fullHtml = new StringBuilder(); - fullHtml.append("").append(title).append(""); + fullHtml + .append("") + .append(title) + .append("") + .append("
"); + for (String part : parts) { fullHtml.append("

"); fullHtml.append(part); fullHtml.append("

"); } - fullHtml.append(""); + fullHtml.append("
"); var doc = sideloaderProcessing .processDocument(fullUrl, From 4801c47273b0670f47afa92bf597fb8adbfb8f88 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 17 Dec 2023 13:53:31 +0100 Subject: [PATCH 3/5] (crawling-model) Fix bug where CrawledDocument.getDomain() trimmed www-prefixes This had the knock-on effect of breaking the anchor tag loading in the processor for a lot of domains, since the anchor tags would be looked up under the wrong domain name. --- .../main/java/nu/marginalia/crawling/model/CrawledDocument.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 6b9ba1be..39d9b56e 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -52,7 +52,7 @@ public class CrawledDocument implements SerializableCrawlData { return EdgeUrl .parse(url) .map(EdgeUrl::getDomain) - .map(d -> d.domain) + .map(Object::toString) .orElse(null); } From edf9aa2c23882d5c8768e7961fb9a8145b120fc7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 17 Dec 2023 13:59:54 +0100 Subject: [PATCH 4/5] (*) Rename EdgeDomain$domain into topDomain This variable had a very confusing name, and was dangerously easy to use in the wrong place with the result of getting something that only works as expected half the time. Ideally this class needs an overhaul, the assumptions it makes about domain names aren't great. 
--- .../sideload/stackexchange/StackexchangeSideloader.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index bf0344ec..a39bdab8 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -115,8 +115,9 @@ public class StackexchangeSideloader implements SideloadSource { ret.words = keywordExtractor.extractKeywords(dld, url); ret.words.addAllSyntheticTerms(List.of( "site:" + domainName, - "site:" + url.domain.domain, - url.domain.domain + "site:" + url.domain.topDomain, + url.domain.topDomain, + domainName )); if (!post.tags().isBlank()) { From bf44805e69fb062cb9275000d879ea7774a10bf4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 17 Dec 2023 14:00:07 +0100 Subject: [PATCH 5/5] (*) Rename EdgeDomain$domain into topDomain This variable had a very confusing name, and was dangerously easy to use in the wrong place with the result of getting something that only works as expected half the time. Ideally this class needs an overhaul, the assumptions it makes about domain names aren't great. 
--- .../java/nu/marginalia/model/EdgeDomain.java | 50 +++++++++---------- .../nu/marginalia/model/EdgeDomainTest.java | 23 ++++----- .../marginalia/ip_blocklist/IpBlockList.java | 2 +- .../marginalia/ip_blocklist/UrlBlocklist.java | 4 +- .../marginalia/browse/model/BrowseResult.java | 4 +- .../converting/processor/DomainProcessor.java | 5 +- .../AbstractDocumentProcessorPlugin.java | 2 +- .../plugin/HtmlDocumentProcessorPlugin.java | 2 +- .../HtmlProcessorSpecializations.java | 2 +- .../marginalia/crawling/RssCrawlerTest.java | 6 +-- .../loading/domains/DomainLoaderService.java | 2 +- .../app/svc/ControlBlacklistService.java | 2 +- .../actor/task/ExportAtagsActor.java | 2 +- .../svc/DomainListRefreshService.java | 2 +- 14 files changed, 53 insertions(+), 55 deletions(-) diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java index 54038b6a..d60f3571 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeDomain.java @@ -15,7 +15,7 @@ public class EdgeDomain implements Serializable { @Nonnull public final String subDomain; @Nonnull - public final String domain; + public final String topDomain; @SneakyThrows public EdgeDomain(String host) { @@ -27,13 +27,13 @@ public class EdgeDomain implements Serializable { if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.> subDomain = ""; - domain = host; + topDomain = host; } else { int dot2 = host.substring(0, dot).lastIndexOf('.'); if (dot2 < 0) { subDomain = ""; - domain = host; + topDomain = host; } else { if (looksLikeGovTld(host)) @@ -42,16 +42,16 @@ public class EdgeDomain implements Serializable { if (dot3 >= 0) { dot2 = dot3; subDomain = host.substring(0, dot2); - domain = host.substring(dot2 + 1); + topDomain = host.substring(dot2 + 1); } else { subDomain = ""; - domain = host; + topDomain = host; } } else { subDomain = host.substring(0, 
dot2); - domain = host.substring(dot2 + 1); + topDomain = host.substring(dot2 + 1); } } } @@ -97,28 +97,28 @@ public class EdgeDomain implements Serializable { public String getAddress() { if (!subDomain.isEmpty()) { - return subDomain + "." + domain; + return subDomain + "." + topDomain; } - return domain; + return topDomain; } public String getDomainKey() { - int cutPoint = domain.indexOf('.'); + int cutPoint = topDomain.indexOf('.'); if (cutPoint < 0) { - return domain; + return topDomain; } - return domain.substring(0, cutPoint).toLowerCase(); + return topDomain.substring(0, cutPoint).toLowerCase(); } public String getLongDomainKey() { StringBuilder ret = new StringBuilder(); - int cutPoint = domain.indexOf('.'); + int cutPoint = topDomain.indexOf('.'); if (cutPoint < 0) { - ret.append(domain); + ret.append(topDomain); } else { - ret.append(domain, 0, cutPoint); + ret.append(topDomain, 0, cutPoint); } if (!"".equals(subDomain) && !"www".equals(subDomain)) { @@ -133,30 +133,30 @@ public class EdgeDomain implements Serializable { public boolean hasSameTopDomain(EdgeDomain other) { if (other == null) return false; - return domain.equalsIgnoreCase(other.domain); + return topDomain.equalsIgnoreCase(other.topDomain); } public String getTld() { int dot = -1; - int length = domain.length(); + int length = topDomain.length(); - if (ipPatternTest.test(domain)) { + if (ipPatternTest.test(topDomain)) { return "IP"; } - if (govListTest.test(domain)) { - dot = domain.indexOf('.', Math.max(0, length - ".edu.uk".length())); + if (govListTest.test(topDomain)) { + dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length())); } else { - dot = domain.lastIndexOf('.'); + dot = topDomain.lastIndexOf('.'); } - if (dot < 0 || dot == domain.length() - 1) { + if (dot < 0 || dot == topDomain.length() - 1) { return "-"; } else { - return domain.substring(dot + 1); + return topDomain.substring(dot + 1); } } @@ -166,8 +166,8 @@ public class EdgeDomain implements Serializable { 
final String this$subDomain = this.getSubDomain(); final String other$subDomain = other.getSubDomain(); if (!Objects.equals(this$subDomain,other$subDomain)) return false; - final String this$domain = this.getDomain(); - final String other$domain = other.getDomain(); + final String this$domain = this.getTopDomain(); + final String other$domain = other.getTopDomain(); if (!Objects.equals(this$domain,other$domain)) return false; return true; } @@ -177,7 +177,7 @@ public class EdgeDomain implements Serializable { int result = 1; final Object $subDomain = this.getSubDomain().toLowerCase(); result = result * PRIME + $subDomain.hashCode(); - final Object $domain = this.getDomain().toLowerCase(); + final Object $domain = this.getTopDomain().toLowerCase(); result = result * PRIME + $domain.hashCode(); return result; } diff --git a/code/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java b/code/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java index 9fbf6890..ad41b884 100644 --- a/code/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java +++ b/code/common/model/src/test/java/nu/marginalia/model/EdgeDomainTest.java @@ -1,6 +1,5 @@ package nu.marginalia.model; -import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Test; import java.net.URISyntaxException; @@ -22,7 +21,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://l7072i3.l7c.net"); assertEquals("http", domain.proto); assertEquals("l7072i3", domain.domain.subDomain); - assertEquals("l7c.net", domain.domain.domain); + assertEquals("l7c.net", domain.domain.topDomain); assertEquals("net", domain.domain.getTld()); } @@ -31,7 +30,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://endless.horse/"); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("endless.horse", domain.domain.domain); + assertEquals("endless.horse", domain.domain.topDomain); assertEquals("horse", domain.domain.getTld()); } @@ -40,7 +39,7 @@ 
class EdgeDomainTest { var domain = new EdgeUrl("http://uj.edu.pl"); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("uj.edu.pl", domain.domain.domain); + assertEquals("uj.edu.pl", domain.domain.topDomain); assertEquals("edu.pl", domain.domain.getTld()); } @@ -50,7 +49,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://www.marginalia.nu"); assertEquals("http", domain.proto); assertEquals("www", domain.domain.subDomain); - assertEquals("marginalia.nu", domain.domain.domain); + assertEquals("marginalia.nu", domain.domain.topDomain); assertEquals("http://www.marginalia.nu/", domain.toString()); assertEquals("nu", domain.domain.getTld()); } @@ -58,7 +57,7 @@ class EdgeDomainTest { @Test public void testUkDomain2() throws URISyntaxException { var domain = new EdgeUrl("http://marginalia.co.uk"); - assertEquals("marginalia.co.uk", domain.domain.domain); + assertEquals("marginalia.co.uk", domain.domain.topDomain); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); assertEquals("http://marginalia.co.uk/", domain.toString()); @@ -68,7 +67,7 @@ class EdgeDomainTest { @Test public void testUkDomain3() throws URISyntaxException { var domain = new EdgeUrl("http://withcandour.co.uk"); - assertEquals("withcandour.co.uk", domain.domain.domain); + assertEquals("withcandour.co.uk", domain.domain.topDomain); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); assertEquals("http://withcandour.co.uk/", domain.toString()); @@ -80,7 +79,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://www.marginalia.co.uk"); assertEquals("http", domain.proto); assertEquals("www", domain.domain.subDomain); - assertEquals("marginalia.co.uk", domain.domain.domain); + assertEquals("marginalia.co.uk", domain.domain.topDomain); assertEquals("http://www.marginalia.co.uk/", domain.toString()); } @@ -88,7 +87,7 @@ class EdgeDomainTest { public void testThreeLetterDomain() throws 
URISyntaxException { var domain = new EdgeUrl("http://www.marginalia.abcf.de"); assertEquals("http", domain.proto); - assertEquals("abcf.de", domain.domain.domain); + assertEquals("abcf.de", domain.domain.topDomain); assertEquals("www.marginalia", domain.domain.subDomain); assertEquals("de", domain.domain.getTld()); } @@ -98,7 +97,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("http://marginalia.nu"); assertEquals("http", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("marginalia.nu", domain.domain.domain); + assertEquals("marginalia.nu", domain.domain.topDomain); assertEquals("http://marginalia.nu/", domain.toString()); assertEquals("nu", domain.domain.getTld()); } @@ -108,7 +107,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("https://127.0.0.1:8080"); assertEquals("https", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("127.0.0.1", domain.domain.domain); + assertEquals("127.0.0.1", domain.domain.topDomain); assertEquals("https://127.0.0.1:8080/", domain.toString()); assertEquals("IP", domain.domain.getTld()); } @@ -118,7 +117,7 @@ class EdgeDomainTest { var domain = new EdgeUrl("https://192.168.1.32"); assertEquals("https", domain.proto); assertEquals("", domain.domain.subDomain); - assertEquals("192.168.1.32", domain.domain.domain); + assertEquals("192.168.1.32", domain.domain.topDomain); assertEquals("https://192.168.1.32/", domain.toString()); assertEquals("IP", domain.domain.getTld()); } diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java index a339b1d4..e1b5beee 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/IpBlockList.java @@ -62,7 +62,7 @@ public class IpBlockList { if (blocklistDisabled) return true; - 
if (domain.domain.endsWith(".cn")) { + if (domain.topDomain.endsWith(".cn")) { logger.debug("Blocking {} on .cn-end", domain); return false; } diff --git a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java index f3574b87..dbd95d61 100644 --- a/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java +++ b/code/features-crawl/crawl-blocklist/src/main/java/nu/marginalia/ip_blocklist/UrlBlocklist.java @@ -67,7 +67,7 @@ public class UrlBlocklist { public boolean isUrlBlocked(EdgeUrl url) { try { - if (badDomains.contains(url.domain.domain)) { + if (badDomains.contains(url.domain.topDomain)) { return true; } @@ -76,7 +76,7 @@ public class UrlBlocklist { return true; } - if ("github.com".equals(url.domain.domain)) { + if ("github.com".equals(url.domain.topDomain)) { return url.path.chars().filter(c -> c == '/').count() > 2; } diff --git a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java index e4f5460b..bfca3fb5 100644 --- a/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java +++ b/code/features-search/random-websites/src/main/java/nu/marginalia/browse/model/BrowseResult.java @@ -10,7 +10,7 @@ public record BrowseResult (EdgeUrl url, public String domainHash() { var domain = url.domain; if ("www".equals(domain.subDomain)) { - return domain.domain; + return domain.topDomain; } return domain.toString(); } @@ -19,7 +19,7 @@ public record BrowseResult (EdgeUrl url, String ret; var domain = url.domain; if ("www".equals(domain.subDomain)) { - ret = domain.domain; + ret = domain.topDomain; } else { ret = domain.toString(); diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index e9794aad..2f838cb5 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -14,7 +14,6 @@ import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.model.crawl.HtmlFeature; @@ -161,10 +160,10 @@ public class DomainProcessor { private static final Pattern academicPattern = Pattern.compile(".*\\.(ac|edu)\\.[a-z]{2}$"); private boolean isAcademicDomain(EdgeDomain domain) { - if (domain.domain.endsWith(".edu")) + if (domain.topDomain.endsWith(".edu")) return true; - if (academicPattern.matcher(domain.domain).matches()) + if (academicPattern.matcher(domain.topDomain).matches()) return true; return false; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index 0007eeb6..a92a4af7 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -50,7 +50,7 @@ public abstract class AbstractDocumentProcessorPlugin { public MetaTagsBuilder 
addUrl(EdgeUrl url) { add("proto", url.proto); add("site", url.domain); - add("site", url.domain.domain); + add("site", url.domain.topDomain); add("tld", url.domain.getTld()); if (url.path.startsWith("/~")) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 4017778e..7d973909 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -291,7 +291,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin for (var fd : lp.getForeignDomains()) { linkTerms.add("links:"+fd.toString().toLowerCase()); - linkTerms.add("links:"+fd.getDomain().toLowerCase()); + linkTerms.add("links:"+fd.getTopDomain().toLowerCase()); } return linkTerms; diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java index c587ce4b..80b063e8 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/specialization/HtmlProcessorSpecializations.java @@ -54,7 +54,7 @@ public class HtmlProcessorSpecializations { return blogSpecialization; } - if (url.domain.getDomain().equals("mariadb.com") + if (url.domain.getTopDomain().equals("mariadb.com") && url.path.startsWith("/kb")) { return mariadbKbSpecialization; } diff --git 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java index 05de76dc..a4d0ee92 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/RssCrawlerTest.java @@ -33,7 +33,7 @@ class RssCrawlerTest { var href = element.attr("href"); if (href != null && !href.isBlank()) { lp.parseLink(base, href) - .filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) + .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain)) .ifPresent(urls::add); } }); @@ -42,7 +42,7 @@ class RssCrawlerTest { var href = element.text(); if (href != null && !href.isBlank()) { lp.parseLink(base, href) - .filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) + .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain)) .ifPresent(urls::add); } }); @@ -51,7 +51,7 @@ class RssCrawlerTest { var href = element.text(); if (href != null && !href.isBlank()) { lp.parseLink(base, href) - .filter(u -> Objects.equals(u.domain.domain, base.domain.domain)) + .filter(u -> Objects.equals(u.domain.topDomain, base.domain.topDomain)) .ifPresent(urls::add); } }); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java index 911c976d..7125eb30 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/domains/DomainLoaderService.java @@ -147,7 +147,7 @@ public class DomainLoaderService { public void accept(EdgeDomain domain) throws SQLException { statement.setString(1, domain.toString()); - statement.setString(2, domain.domain); + 
statement.setString(2, domain.topDomain); statement.setInt(3, nodeAffinity); statement.addBatch(); diff --git a/code/services-core/control-service/src/main/java/nu/marginalia/control/app/svc/ControlBlacklistService.java b/code/services-core/control-service/src/main/java/nu/marginalia/control/app/svc/ControlBlacklistService.java index 9181f325..3a1ad44d 100644 --- a/code/services-core/control-service/src/main/java/nu/marginalia/control/app/svc/ControlBlacklistService.java +++ b/code/services-core/control-service/src/main/java/nu/marginalia/control/app/svc/ControlBlacklistService.java @@ -81,7 +81,7 @@ public class ControlBlacklistService { """)) { stmt.setString(1, domain.toString()); stmt.addBatch(); - stmt.setString(1, domain.domain); + stmt.setString(1, domain.topDomain); stmt.addBatch(); stmt.executeBatch(); } diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java index 353ef965..35ddde89 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -157,7 +157,7 @@ public class ExportAtagsActor extends RecordActorPrototype { return false; // This is an artifact of the link parser typically - if ("example.com".equals(url.domain.domain)) + if ("example.com".equals(url.domain.topDomain)) return false; if (linkText.contains(url.domain.toString())) diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/DomainListRefreshService.java b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/DomainListRefreshService.java index fe600fc0..a25bb92f 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/svc/DomainListRefreshService.java +++ 
b/code/services-core/executor-service/src/main/java/nu/marginalia/svc/DomainListRefreshService.java @@ -61,7 +61,7 @@ public class DomainListRefreshService { for (var domain : domainsAll) { var parsed = new EdgeDomain(domain); insert.setString(1, domain.toLowerCase()); - insert.setString(2, parsed.domain); + insert.setString(2, parsed.topDomain); insert.setInt(3, nodeId); insert.addBatch(); }