diff --git a/code/common/model/java/nu/marginalia/model/EdgeDomain.java b/code/common/model/java/nu/marginalia/model/EdgeDomain.java
index c6900a7c..a2855e61 100644
--- a/code/common/model/java/nu/marginalia/model/EdgeDomain.java
+++ b/code/common/model/java/nu/marginalia/model/EdgeDomain.java
@@ -3,6 +3,7 @@ package nu.marginalia.model;
 import javax.annotation.Nonnull;
 import java.io.Serializable;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.function.Predicate;
 import java.util.regex.Pattern;
 
@@ -133,6 +134,18 @@ public class EdgeDomain implements Serializable {
         return ret.toString().toLowerCase();
     }
 
+    /** If possible, try to provide an alias domain,
+     * i.e. a domain name that is very likely to link to this one
+     * */
+    public Optional<EdgeDomain> aliasDomain() {
+        if (subDomain.equals("www")) {
+            return Optional.of(new EdgeDomain("", topDomain));
+        } else if (subDomain.isBlank()){
+            return Optional.of(new EdgeDomain("www", topDomain));
+        }
+        else return Optional.empty();
+    }
+
     public boolean hasSameTopDomain(EdgeDomain other) {
         if (other == null) return false;
 
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
index 7ff09289..c80a57c7 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
@@ -12,13 +12,16 @@ import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
+import java.util.Optional;
 
 public class AnchorTagsImpl implements AnchorTagsSource {
     private final Connection duckdbConnection;
     private static final Logger logger = LoggerFactory.getLogger(AnchorTagsImpl.class);
+
     public AnchorTagsImpl(Path atagsPath,
-                          List<EdgeDomain> relevantDomains)
+                          Collection<EdgeDomain> relevantDomains)
             throws SQLException
     {
         duckdbConnection = DriverManager.getConnection("jdbc:duckdb:");
@@ -82,14 +85,30 @@ public class AnchorTagsImpl implements AnchorTagsSource {
                 where dest = ?
                 """))
         {
+            // Add links to the provided domain
             ps.setString(1, domain.toString());
             var rs = ps.executeQuery();
             while (rs.next()) {
                 links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
             }
+
+            // Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
+            Optional<EdgeDomain> aliasDomain = domain.aliasDomain();
+            if (aliasDomain.isPresent()) {
+                ps.setString(1, aliasDomain.get().toString());
+                rs = ps.executeQuery();
+                while (rs.next()) {
+                    // Rewrite the host part of the URL from the alias back to the requested domain (never concatenate the Optional itself)
+                    String url = rs.getString("url");
+                    url = domain + url.substring(url.indexOf('/'));
+
+                    links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
+                }
+                return new DomainLinks(links);
+            }
             return new DomainLinks(links);
         }
-        catch (SQLException ex) {
+        catch (Exception ex) {
             logger.warn("Failed to get atags for " + domain, ex);
         }
 
diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java
index b0da1148..ca805289 100644
--- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java
+++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java
@@ -13,7 +13,9 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 public class AnchorTagsSourceFactory {
     private final Path atagsPath;
@@ -54,7 +56,12 @@ public class AnchorTagsSourceFactory {
             return domain -> new DomainLinks();
         }
 
-        return new AnchorTagsImpl(atagsPath, relevantDomains);
+        Set<EdgeDomain> allDomains = new HashSet<>(relevantDomains);
+        for (var domain : relevantDomains) {
+            domain.aliasDomain().ifPresent(allDomains::add);
+        }
+
+        return new AnchorTagsImpl(atagsPath, allDomains);
     }
 
     // Only get domains that are assigned to this node.  This reduces the amount of data