mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
(atag) Add alias domain support and improve domain handling
Introduced optional alias domain functionality in EdgeDomain class to handle domain variations such as "www" in the anchor tags code, as there are commonly a number of relevant but glancing misses in the atags data.
This commit is contained in:
parent
d4bce13a03
commit
52bc0272f8
@ -3,6 +3,7 @@ package nu.marginalia.model;
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -133,6 +134,18 @@ public class EdgeDomain implements Serializable {
|
||||
return ret.toString().toLowerCase();
|
||||
}
|
||||
|
||||
/** If possible, try to provide an alias domain,
|
||||
* i.e. a domain name that is very likely to link to this one
|
||||
* */
|
||||
public Optional<EdgeDomain> aliasDomain() {
|
||||
if (subDomain.equals("www")) {
|
||||
return Optional.of(new EdgeDomain("", topDomain));
|
||||
} else if (subDomain.isBlank()){
|
||||
return Optional.of(new EdgeDomain("www", topDomain));
|
||||
}
|
||||
else return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public boolean hasSameTopDomain(EdgeDomain other) {
|
||||
if (other == null) return false;
|
||||
|
@ -12,13 +12,16 @@ import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
public class AnchorTagsImpl implements AnchorTagsSource {
|
||||
private final Connection duckdbConnection;
|
||||
private static final Logger logger = LoggerFactory.getLogger(AnchorTagsImpl.class);
|
||||
|
||||
public AnchorTagsImpl(Path atagsPath,
|
||||
List<EdgeDomain> relevantDomains)
|
||||
Collection<EdgeDomain> relevantDomains)
|
||||
throws SQLException
|
||||
{
|
||||
duckdbConnection = DriverManager.getConnection("jdbc:duckdb:");
|
||||
@ -82,14 +85,30 @@ public class AnchorTagsImpl implements AnchorTagsSource {
|
||||
where dest = ?
|
||||
"""))
|
||||
{
|
||||
// Add links to the provided domain
|
||||
ps.setString(1, domain.toString());
|
||||
var rs = ps.executeQuery();
|
||||
while (rs.next()) {
|
||||
links.add(new LinkWithText(rs.getString("url"), rs.getString("text"), rs.getString("source")));
|
||||
}
|
||||
|
||||
// Also look for links to an aliased domain, e.g. maybe the domain is marginalia.nu but the link is to www.marginalia.nu?
|
||||
Optional<EdgeDomain> aliasDomain = domain.aliasDomain();
|
||||
if (aliasDomain.isPresent()) {
    // Unwrap once. Concatenating the Optional itself would prepend
    // "Optional[...]" to the URL instead of the domain name.
    EdgeDomain alias = aliasDomain.get();

    // Re-run the same prepared statement with the alias as the dest parameter
    ps.setString(1, alias.toString());
    rs = ps.executeQuery();
    while (rs.next()) {
        // Change the domain name in the URL to the alias domain.
        // Assumes the url column always contains a '/' separating host and path;
        // if it does not, substring(-1) throws and the enclosing catch handles it.
        // NOTE(review): these rows were selected with dest = alias, so the url
        // presumably already carries the alias host -- confirm whether the intent
        // is to rewrite it to the queried domain instead.
        String url = rs.getString("url");
        url = alias + url.substring(url.indexOf('/'));

        links.add(new LinkWithText(url, rs.getString("text"), rs.getString("source")));
    }
    return new DomainLinks(links);
}
|
||||
return new DomainLinks(links);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to get atags for " + domain, ex);
|
||||
}
|
||||
|
||||
|
@ -13,7 +13,9 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class AnchorTagsSourceFactory {
|
||||
private final Path atagsPath;
|
||||
@ -54,7 +56,12 @@ public class AnchorTagsSourceFactory {
|
||||
return domain -> new DomainLinks();
|
||||
}
|
||||
|
||||
return new AnchorTagsImpl(atagsPath, relevantDomains);
|
||||
Set<EdgeDomain> allDomains = new HashSet<>(relevantDomains);
|
||||
for (var domain : relevantDomains) {
|
||||
domain.aliasDomain().ifPresent(allDomains::add);
|
||||
}
|
||||
|
||||
return new AnchorTagsImpl(atagsPath, allDomains);
|
||||
}
|
||||
|
||||
// Only get domains that are assigned to this node. This reduces the amount of data
|
||||
|
Loading…
Reference in New Issue
Block a user