From 52bc0272f8407a4b729cd34f6c1594a9c0eb1f86 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Wed, 27 Nov 2024 14:26:44 +0100
Subject: [PATCH] (atag) Add alias domain support and improve domain handling

Introduced optional alias domain functionality in EdgeDomain class to
handle domain variations such as "www" in the anchor tags code, as there
are commonly a number of relevant but glancing misses in the atags data.
---
 .../java/nu/marginalia/model/EdgeDomain.java  | 13 +++++++++++
 .../atags/source/AnchorTagsImpl.java          | 23 +++++++++++++++++--
 .../atags/source/AnchorTagsSourceFactory.java |  9 +++++++-
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/code/common/model/java/nu/marginalia/model/EdgeDomain.java b/code/common/model/java/nu/marginalia/model/EdgeDomain.java
index c6900a7c..a2855e61 100644
--- a/code/common/model/java/nu/marginalia/model/EdgeDomain.java
+++ b/code/common/model/java/nu/marginalia/model/EdgeDomain.java
@@ -3,6 +3,7 @@ package nu.marginalia.model;
 import javax.annotation.Nonnull;
 import java.io.Serializable;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.function.Predicate;
 import java.util.regex.Pattern;
 
@@ -133,6 +134,18 @@ public class EdgeDomain implements Serializable {
         return ret.toString().toLowerCase();
     }
 
+    /** If possible, provide an alias domain, i.e. a domain name that very likely refers to the same site:
+     * the "www" subdomain is dropped if present, or added if there is no subdomain; otherwise empty
+     * */
+    public Optional<EdgeDomain> aliasDomain() {
+        if (subDomain.equals("www")) {
+            return Optional.of(new EdgeDomain("", topDomain));
+        } else if (subDomain.isBlank()){
+            return Optional.of(new EdgeDomain("www", topDomain));
+        }
+        else return Optional.empty();
+    }
+
     public boolean hasSameTopDomain(EdgeDomain other) {
         if (other == null) return false;
 
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
index 7ff09289..c80a57c7 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
@@ -12,13 +12,16 @@ import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
+import java.util.Optional;
 
 public class AnchorTagsImpl implements AnchorTagsSource {
     private final Connection duckdbConnection;
     private static final Logger logger = LoggerFactory.getLogger(AnchorTagsImpl.class);
+
     public AnchorTagsImpl(Path atagsPath,
-                          List<EdgeDomain> relevantDomains)
+                          Collection<EdgeDomain> relevantDomains)
             throws SQLException
     {
         duckdbConnection = DriverManager.getConnection("jdbc:duckdb:");
@@ -82,14 +85,30 @@ public class AnchorTagsImpl implements AnchorTagsSource {
                 where dest = ?
                 """))
         {
+            // Add links to the provided domain
             ps.setString(1, domain.toString());
             var rs = ps.executeQuery();
             while (rs.next()) {
                 links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
             }
+
+            // Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
+            Optional<EdgeDomain> aliasDomain = domain.aliasDomain();
+            if (aliasDomain.isPresent()) {
+                ps.setString(1, aliasDomain.get().toString());
+                rs = ps.executeQuery();
+                while (rs.next()) {
+                    // Rewrite the alias-domain URL so that it points at the requested domain (never concatenate the Optional itself)
+                    String url = rs.getString("url");
+                    url = domain + url.substring(url.indexOf('/'));
+
+                    links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
+                }
+                return new DomainLinks(links);
+            }
             return new DomainLinks(links);
         }
-        catch (SQLException ex) {
+        catch (Exception ex) {
             logger.warn("Failed to get atags for " + domain, ex);
         }
 
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java
index b0da1148..ca805289 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java
@@ -13,7 +13,9 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 public class AnchorTagsSourceFactory {
     private final Path atagsPath;
@@ -54,7 +56,12 @@ public class AnchorTagsSourceFactory {
             return domain -> new DomainLinks();
         }
 
-        return new AnchorTagsImpl(atagsPath, relevantDomains);
+        Set<EdgeDomain> allDomains = new HashSet<>(relevantDomains);
+        for (var domain : relevantDomains) {
+            domain.aliasDomain().ifPresent(allDomains::add);
+        }
+
+        return new AnchorTagsImpl(atagsPath, allDomains);
     }
 
     // Only get domains that are assigned to this node. This reduces the amount of data