(atag) Add alias domain support and improve domain handling

Introduced optional alias domain functionality in the EdgeDomain class so that the anchor tags code can handle domain variations such as a leading "www", since the atags data commonly contains relevant links that narrowly miss the exact domain name.
This commit is contained in:
Viktor Lofgren 2024-11-27 14:26:44 +01:00
parent d4bce13a03
commit 52bc0272f8
3 changed files with 42 additions and 3 deletions

View File

@ -3,6 +3,7 @@ package nu.marginalia.model;
import javax.annotation.Nonnull;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@ -133,6 +134,18 @@ public class EdgeDomain implements Serializable {
return ret.toString().toLowerCase();
}
/**
 * Attempts to derive an alias for this domain, i.e. the counterpart that
 * differs only by the presence of the "www" sub-domain.
 *
 * @return an EdgeDomain built from an empty sub-domain string and this
 *         domain's topDomain when the sub-domain is "www"; one built from
 *         "www" and this domain's topDomain when the sub-domain is blank;
 *         otherwise an empty Optional
 */
public Optional<EdgeDomain> aliasDomain() {
    final String aliasSubDomain;

    if (subDomain.equals("www")) {
        aliasSubDomain = "";
    } else if (subDomain.isBlank()) {
        aliasSubDomain = "www";
    } else {
        // Any other sub-domain has no alias
        return Optional.empty();
    }

    return Optional.of(new EdgeDomain(aliasSubDomain, topDomain));
}
public boolean hasSameTopDomain(EdgeDomain other) {
if (other == null) return false;

View File

@ -12,13 +12,16 @@ import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
public class AnchorTagsImpl implements AnchorTagsSource {
private final Connection duckdbConnection;
private static final Logger logger = LoggerFactory.getLogger(AnchorTagsImpl.class);
public AnchorTagsImpl(Path atagsPath,
List<EdgeDomain> relevantDomains)
Collection<EdgeDomain> relevantDomains)
throws SQLException
{
duckdbConnection = DriverManager.getConnection("jdbc:duckdb:");
@ -82,14 +85,30 @@ public class AnchorTagsImpl implements AnchorTagsSource {
where dest = ?
"""))
{
// Add links to the provided domain
ps.setString(1, domain.toString());
var rs = ps.executeQuery();
while (rs.next()) {
links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
}
// Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
Optional<EdgeDomain> aliasDomain = domain.aliasDomain();
if (aliasDomain.isPresent()) {
ps.setString(1, aliasDomain.get().toString());
rs = ps.executeQuery();
while (rs.next()) {
// Change the domain name in the URL to the alias domain
String url = rs.getString("url");
url = aliasDomain + url.substring(url.indexOf('/'));
links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
}
return new DomainLinks(links);
}
return new DomainLinks(links);
}
catch (SQLException ex) {
catch (Exception ex) {
logger.warn("Failed to get atags for " + domain, ex);
}

View File

@ -13,7 +13,9 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class AnchorTagsSourceFactory {
private final Path atagsPath;
@ -54,7 +56,12 @@ public class AnchorTagsSourceFactory {
return domain -> new DomainLinks();
}
return new AnchorTagsImpl(atagsPath, relevantDomains);
Set<EdgeDomain> allDomains = new HashSet<>(relevantDomains);
for (var domain : relevantDomains) {
domain.aliasDomain().ifPresent(allDomains::add);
}
return new AnchorTagsImpl(atagsPath, allDomains);
}
// Only get domains that are assigned to this node. This reduces the amount of data