From 667a80a3a02543df79f7574326b534a6aa245e42 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 27 Jul 2022 13:56:03 +0200 Subject: [PATCH] Deduplicate domains in explore mode --- .../search/command/commands/BrowseCommand.java | 14 ++++++++++++-- .../wmsa/edge/search/model/BrowseResult.java | 8 ++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java index 4e711069..ba5714ea 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/BrowseCommand.java @@ -16,8 +16,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.util.HashSet; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.function.Predicate; import java.util.regex.Pattern; @@ -58,17 +60,25 @@ public class BrowseCommand implements SearchCommandInterface { String definePrefix = "browse:"; String word = humanQuery.substring(definePrefix.length()).toLowerCase(); + Set domainHashes = new HashSet<>(); + try { if ("random".equals(word)) { var results = edgeDataStoreDao.getRandomDomains(25, blacklist); - results.removeIf(res -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId))); + + results.removeIf(res -> + !screenshotService.hasScreenshot(new EdgeId<>(res.domainId)) + || !domainHashes.add(res.domainHash())); + return new BrowseResultSet(results); } else { var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word)); var neighbors = edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 45); - neighbors.removeIf(res -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId))); + neighbors.removeIf(res -> + !screenshotService.hasScreenshot(new EdgeId<>(res.domainId)) + || !domainHashes.add(res.domainHash())); return new BrowseResultSet(neighbors); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResult.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResult.java index a27a8f19..948c3b07 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResult.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/BrowseResult.java @@ -8,4 +8,12 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl; public class BrowseResult { public final EdgeUrl url; public final int domainId; + + public String domainHash() { + var domain = url.domain; + if ("www".equals(domain.subDomain)) { + return domain.domain; + } + return domain.toString(); + } }