From 902f235b5b0b4b6d99a8c3e2b3abe1bb424c390a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 27 Nov 2023 17:45:07 +0100 Subject: [PATCH] (search) Integrate 'similar' tab in site info. --- .../command/commands/BrowseCommand.java | 74 +++---------------- .../command/commands/SiteRedirectCommand.java | 16 ++-- .../search/svc/SearchBrowseService.java | 73 ++++++++++++++++++ .../search/svc/SearchSiteInfoService.java | 26 ++++++- .../templates/search/site-info/site-info.hdb | 10 +++ 5 files changed, 130 insertions(+), 69 deletions(-) create mode 100644 code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java index 38ae63f2..0eb6a3b3 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/BrowseCommand.java @@ -1,57 +1,34 @@ package nu.marginalia.search.command.commands; import com.google.inject.Inject; -import nu.marginalia.browse.DbBrowseDomainsRandom; -import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; -import nu.marginalia.browse.DbBrowseDomainsSimilarOldAlgo; -import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.browse.model.BrowseResultSet; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.db.DbDomainQueries; -import nu.marginalia.db.DomainBlacklist; -import nu.marginalia.search.command.SearchCommandInterface; -import nu.marginalia.search.command.SearchParameters; -import nu.marginalia.search.results.BrowseResultCleaner; import nu.marginalia.client.Context; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.svc.SearchBrowseService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Response; import java.io.IOException; -import java.util.*; +import java.util.Map; import java.util.function.Predicate; import java.util.regex.Pattern; -import static java.util.Collections.shuffle; - public class BrowseCommand implements SearchCommandInterface { - private final DbBrowseDomainsRandom randomDomains; - private final DbBrowseDomainsSimilarCosine similarDomains; - private final DbBrowseDomainsSimilarOldAlgo similarDomainsOld; - private final DbDomainQueries domainQueries; - private final DomainBlacklist blacklist; + private final SearchBrowseService browseService; private final MustacheRenderer browseResultsRenderer; - private final BrowseResultCleaner browseResultCleaner; private final Logger logger = LoggerFactory.getLogger(getClass()); private final Predicate queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9:]+$").asPredicate(); @Inject - public BrowseCommand(DbBrowseDomainsRandom randomDomains, - DbBrowseDomainsSimilarCosine similarDomains, - DbBrowseDomainsSimilarOldAlgo similarDomainsOld, DbDomainQueries domainQueries, - DomainBlacklist blacklist, - RendererFactory rendererFactory, - BrowseResultCleaner browseResultCleaner) + public BrowseCommand(SearchBrowseService browseService, + RendererFactory rendererFactory) throws IOException { - this.randomDomains = randomDomains; - this.similarDomains = similarDomains; - this.similarDomainsOld = similarDomainsOld; - this.domainQueries = domainQueries; - this.blacklist = blacklist; - this.browseResultCleaner = browseResultCleaner; + this.browseService = browseService; browseResultsRenderer = rendererFactory.renderer("search/browse-results"); } @@ -82,14 +59,14 @@ public class BrowseCommand implements SearchCommandInterface { try { if ("random".equals(word)) { - return getRandomEntries(0); + return browseService.getRandomEntries(0); } if (word.startsWith("random:")) { int set = Integer.parseInt(word.split(":")[1]); - return getRandomEntries(set); + return browseService.getRandomEntries(set); } else { - return getRelatedEntries(word); + return browseService.getRelatedEntries(word); } } catch (Exception ex) { @@ -98,34 +75,5 @@ public class BrowseCommand implements SearchCommandInterface { } } - private BrowseResultSet getRandomEntries(int set) { - List results = randomDomains.getRandomDomains(25, blacklist, set); - - results.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); - - return new BrowseResultSet(results); - } - - private BrowseResultSet getRelatedEntries(String word) { - var domain = domainQueries.getDomainId(new EdgeDomain(word)); - - var neighbors = similarDomains.getDomainNeighborsAdjacentCosine(domain, blacklist, 256); - neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); - - // If the results are very few, supplement with the alternative shitty algorithm - if (neighbors.size() < 25) { - Set allNeighbors = new HashSet<>(neighbors); - allNeighbors.addAll(similarDomainsOld.getDomainNeighborsAdjacent(domain, blacklist, 50)); - - neighbors.clear(); - neighbors.addAll(allNeighbors); - neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); - } - - // shuffle the items for a less repetitive experience - shuffle(neighbors); - - return new BrowseResultSet(neighbors, word); - } } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/SiteRedirectCommand.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/SiteRedirectCommand.java index d3398bb1..881a7551 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/SiteRedirectCommand.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/command/commands/SiteRedirectCommand.java @@ -17,7 +17,7 @@ public class SiteRedirectCommand implements SearchCommandInterface { private final Logger logger = LoggerFactory.getLogger(getClass()); - private final Predicate queryPatternPredicate = Pattern.compile("^site:[.A-Za-z\\-0-9]+$").asPredicate(); + private final Predicate queryPatternPredicate = Pattern.compile("^(site|links|similar):[.A-Za-z\\-0-9]+$").asPredicate(); @Inject public SiteRedirectCommand() { @@ -30,18 +30,24 @@ public class SiteRedirectCommand implements SearchCommandInterface { return false; } - String definePrefix = "site:"; - String domain = parameters.query().substring(definePrefix.length()).toLowerCase(); + int idx = parameters.query().indexOf(':'); + String prefix = parameters.query().substring(0, idx); + String domain = parameters.query().substring(idx + 1).toLowerCase(); // Use an HTML redirect here, so we can use relative URLs + String view = switch (prefix) { + case "links" -> "links"; + case "similar" -> "similar"; + default -> "info"; + }; response.raw().getOutputStream().println(""" Redirecting... - - """.formatted(domain)); + + """.formatted(domain, view)); return true; } diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java new file mode 100644 index 00000000..1064eca1 --- /dev/null +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchBrowseService.java @@ -0,0 +1,73 @@ +package nu.marginalia.search.svc; + +import com.google.inject.Inject; +import nu.marginalia.browse.DbBrowseDomainsRandom; +import nu.marginalia.browse.DbBrowseDomainsSimilarCosine; +import nu.marginalia.browse.DbBrowseDomainsSimilarOldAlgo; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.browse.model.BrowseResultSet; +import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.db.DomainBlacklist; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.search.results.BrowseResultCleaner; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import static java.util.Collections.shuffle; + +public class SearchBrowseService { + private final DbBrowseDomainsRandom randomDomains; + private final DbBrowseDomainsSimilarCosine similarDomains; + private final DbBrowseDomainsSimilarOldAlgo similarDomainsOld; + private final DbDomainQueries domainQueries; + private final DomainBlacklist blacklist; + private final BrowseResultCleaner browseResultCleaner; + + @Inject + public SearchBrowseService(DbBrowseDomainsRandom randomDomains, + DbBrowseDomainsSimilarCosine similarDomains, + DbBrowseDomainsSimilarOldAlgo similarDomainsOld, + DbDomainQueries domainQueries, + DomainBlacklist blacklist, + BrowseResultCleaner browseResultCleaner) + { + this.randomDomains = randomDomains; + this.similarDomains = similarDomains; + this.similarDomainsOld = similarDomainsOld; + this.domainQueries = domainQueries; + this.blacklist = blacklist; + this.browseResultCleaner = browseResultCleaner; + } + + public BrowseResultSet getRandomEntries(int set) { + List results = randomDomains.getRandomDomains(25, blacklist, set); + + results.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); + + return new BrowseResultSet(results); + } + + public BrowseResultSet getRelatedEntries(String word) { + var domain = domainQueries.getDomainId(new EdgeDomain(word)); + + var neighbors = similarDomains.getDomainNeighborsAdjacentCosine(domain, blacklist, 256); + neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); + + // If the results are very few, supplement with the alternative shitty algorithm + if (neighbors.size() < 25) { + Set allNeighbors = new HashSet<>(neighbors); + allNeighbors.addAll(similarDomainsOld.getDomainNeighborsAdjacent(domain, blacklist, 50)); + + neighbors.clear(); + neighbors.addAll(allNeighbors); + neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate()); + } + + // shuffle the items for a less repetitive experience + shuffle(neighbors); + + return new BrowseResultSet(neighbors, word); + } +} diff --git a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java index 1be81784..752047b9 100644 --- a/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/src/main/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -1,5 +1,7 @@ package nu.marginalia.search.svc; import com.google.inject.Inject; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.browse.model.BrowseResultSet; import nu.marginalia.client.Context; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.model.EdgeDomain; @@ -15,6 +17,7 @@ import spark.*; import javax.annotation.Nullable; import java.io.IOException; import java.sql.SQLException; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.OptionalInt; @@ -25,6 +28,7 @@ public class SearchSiteInfoService { private final DomainInformationService domainInformationService; private final SearchFlagSiteService flagSiteService; private final DbDomainQueries domainQueries; + private final SearchBrowseService browseService; private final MustacheRenderer renderer; @Inject @@ -32,13 +36,15 @@ public class SearchSiteInfoService { DomainInformationService domainInformationService, RendererFactory rendererFactory, SearchFlagSiteService flagSiteService, - DbDomainQueries domainQueries) throws IOException { + DbDomainQueries domainQueries, SearchBrowseService browseService) throws IOException { this.searchOperator = searchOperator; this.domainInformationService = domainInformationService; this.flagSiteService = flagSiteService; this.domainQueries = domainQueries; this.renderer = rendererFactory.renderer("search/site-info/site-info"); + this.browseService = browseService; + } public Object handle(Request request, Response response) throws SQLException { @@ -55,6 +61,7 @@ public class SearchSiteInfoService { case "links" -> listLinks(ctx, domainName); case "docs" -> listDocs(ctx, domainName); case "info" -> siteInfo(ctx, domainName); + case "similar" -> listSimilar(ctx, domainName); case "report" -> reportSite(ctx, domainName); default -> siteInfo(ctx, domainName); }; @@ -129,7 +136,12 @@ public class SearchSiteInfoService { domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), searchOperator.doBacklinkSearch(ctx, domainName)); } + private SimilarSites listSimilar(Context ctx, String domainName) { + return new SimilarSites(domainName, + domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), + browseService.getRelatedEntries(domainName)); + } private Docs listDocs(Context ctx, String domainName) { return new Docs(domainName, domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), @@ -210,6 +222,18 @@ public class SearchSiteInfoService { } } + public record SimilarSites(Map view, String domain, long domainId, List results) { + public SimilarSites(String domain, long domainId, BrowseResultSet results) { + this(Map.of("similar", true), domain, domainId, new ArrayList<>(results.results())); + } + + public String query() { return "similar:" + domain; } + + public boolean isKnown() { + return domainId > 0; + } + } + public record ReportDomain( Map view, String domain, diff --git a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info.hdb b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info.hdb index 498de21b..afda391c 100644 --- a/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info.hdb +++ b/code/services-application/search-service/src/main/resources/templates/search/site-info/site-info.hdb @@ -42,6 +42,7 @@ {{#each results}}{{>search/parts/search-result}}{{/each}} {{/if}} + {{#if view.docs}}
Showing documents found in {{domain}}. @@ -49,9 +50,18 @@ {{#each results}}{{>search/parts/search-result}}{{/each}} {{/if}} + {{#if view.report}} {{>search/site-info/site-info-report}} {{/if}} + +{{#if view.similar}} +
Showing domains similar to {{domain}}
+
+ {{#each results}}{{>search/browse-result}}{{/each}} +
+{{/if}} + {{>search/parts/search-footer}}