Deduplicate domains in explore mode

This commit is contained in:
vlofgren 2022-07-27 13:56:03 +02:00
parent e3b017e907
commit 667a80a3a0
2 changed files with 20 additions and 2 deletions

View File

@ -16,8 +16,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -58,17 +60,25 @@ public class BrowseCommand implements SearchCommandInterface {
String definePrefix = "browse:"; String definePrefix = "browse:";
String word = humanQuery.substring(definePrefix.length()).toLowerCase(); String word = humanQuery.substring(definePrefix.length()).toLowerCase();
Set<String> domainHashes = new HashSet<>();
try { try {
if ("random".equals(word)) { if ("random".equals(word)) {
var results = edgeDataStoreDao.getRandomDomains(25, blacklist); var results = edgeDataStoreDao.getRandomDomains(25, blacklist);
results.removeIf(res -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId)));
results.removeIf(res ->
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|| !domainHashes.add(res.domainHash()));
return new BrowseResultSet(results); return new BrowseResultSet(results);
} }
else { else {
var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word)); var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word));
var neighbors = edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 45); var neighbors = edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 45);
neighbors.removeIf(res -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId))); neighbors.removeIf(res ->
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|| !domainHashes.add(res.domainHash()));
return new BrowseResultSet(neighbors); return new BrowseResultSet(neighbors);
} }

View File

@ -8,4 +8,12 @@ import nu.marginalia.wmsa.edge.model.EdgeUrl;
public class BrowseResult { public class BrowseResult {
public final EdgeUrl url; public final EdgeUrl url;
public final int domainId; public final int domainId;
/**
 * Returns a string key for this result's domain in which a {@code "www"}
 * subdomain is ignored, so that the "www" form of a domain yields the bare
 * domain value rather than its own distinct key.
 *
 * @return {@code url.domain.domain} when the subdomain is exactly
 *         {@code "www"}; otherwise {@code url.domain.toString()}
 *         (presumably the full host name, to be confirmed against EdgeDomain)
 */
public String domainHash() {
    final var edgeDomain = url.domain;
    // Constant-first equals: a null subDomain simply compares as "not www".
    final boolean isWwwSubDomain = "www".equals(edgeDomain.subDomain);

    return isWwwSubDomain ? edgeDomain.domain : edgeDomain.toString();
}
} }