(search) Integrate 'similar' tab in site info.

This commit is contained in:
Viktor Lofgren 2023-11-27 17:45:07 +01:00
parent 97d43a6fa2
commit 902f235b5b
5 changed files with 130 additions and 69 deletions

View File

@ -1,57 +1,34 @@
package nu.marginalia.search.command.commands; package nu.marginalia.search.command.commands;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.browse.DbBrowseDomainsRandom;
import nu.marginalia.browse.DbBrowseDomainsSimilarCosine;
import nu.marginalia.browse.DbBrowseDomainsSimilarOldAlgo;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.browse.model.BrowseResultSet; import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.results.BrowseResultCleaner;
import nu.marginalia.client.Context; import nu.marginalia.client.Context;
import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory; import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.svc.SearchBrowseService;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import spark.Response; import spark.Response;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.Map;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import static java.util.Collections.shuffle;
public class BrowseCommand implements SearchCommandInterface { public class BrowseCommand implements SearchCommandInterface {
private final DbBrowseDomainsRandom randomDomains; private final SearchBrowseService browseService;
private final DbBrowseDomainsSimilarCosine similarDomains;
private final DbBrowseDomainsSimilarOldAlgo similarDomainsOld;
private final DbDomainQueries domainQueries;
private final DomainBlacklist blacklist;
private final MustacheRenderer<BrowseResultSet> browseResultsRenderer; private final MustacheRenderer<BrowseResultSet> browseResultsRenderer;
private final BrowseResultCleaner browseResultCleaner;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final Predicate<String> queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9:]+$").asPredicate(); private final Predicate<String> queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9:]+$").asPredicate();
@Inject @Inject
public BrowseCommand(DbBrowseDomainsRandom randomDomains, public BrowseCommand(SearchBrowseService browseService,
DbBrowseDomainsSimilarCosine similarDomains, RendererFactory rendererFactory)
DbBrowseDomainsSimilarOldAlgo similarDomainsOld, DbDomainQueries domainQueries,
DomainBlacklist blacklist,
RendererFactory rendererFactory,
BrowseResultCleaner browseResultCleaner)
throws IOException throws IOException
{ {
this.randomDomains = randomDomains; this.browseService = browseService;
this.similarDomains = similarDomains;
this.similarDomainsOld = similarDomainsOld;
this.domainQueries = domainQueries;
this.blacklist = blacklist;
this.browseResultCleaner = browseResultCleaner;
browseResultsRenderer = rendererFactory.renderer("search/browse-results"); browseResultsRenderer = rendererFactory.renderer("search/browse-results");
} }
@ -82,14 +59,14 @@ public class BrowseCommand implements SearchCommandInterface {
try { try {
if ("random".equals(word)) { if ("random".equals(word)) {
return getRandomEntries(0); return browseService.getRandomEntries(0);
} }
if (word.startsWith("random:")) { if (word.startsWith("random:")) {
int set = Integer.parseInt(word.split(":")[1]); int set = Integer.parseInt(word.split(":")[1]);
return getRandomEntries(set); return browseService.getRandomEntries(set);
} }
else { else {
return getRelatedEntries(word); return browseService.getRelatedEntries(word);
} }
} }
catch (Exception ex) { catch (Exception ex) {
@ -98,34 +75,5 @@ public class BrowseCommand implements SearchCommandInterface {
} }
} }
private BrowseResultSet getRandomEntries(int set) {
List<BrowseResult> results = randomDomains.getRandomDomains(25, blacklist, set);
results.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
return new BrowseResultSet(results);
}
private BrowseResultSet getRelatedEntries(String word) {
var domain = domainQueries.getDomainId(new EdgeDomain(word));
var neighbors = similarDomains.getDomainNeighborsAdjacentCosine(domain, blacklist, 256);
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
// If the results are very few, supplement with the alternative shitty algorithm
if (neighbors.size() < 25) {
Set<BrowseResult> allNeighbors = new HashSet<>(neighbors);
allNeighbors.addAll(similarDomainsOld.getDomainNeighborsAdjacent(domain, blacklist, 50));
neighbors.clear();
neighbors.addAll(allNeighbors);
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
}
// shuffle the items for a less repetitive experience
shuffle(neighbors);
return new BrowseResultSet(neighbors, word);
}
} }

View File

@ -17,7 +17,7 @@ public class SiteRedirectCommand implements SearchCommandInterface {
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final Predicate<String> queryPatternPredicate = Pattern.compile("^site:[.A-Za-z\\-0-9]+$").asPredicate(); private final Predicate<String> queryPatternPredicate = Pattern.compile("^(site|links|similar):[.A-Za-z\\-0-9]+$").asPredicate();
@Inject @Inject
public SiteRedirectCommand() { public SiteRedirectCommand() {
@ -30,18 +30,24 @@ public class SiteRedirectCommand implements SearchCommandInterface {
return false; return false;
} }
String definePrefix = "site:"; int idx = parameters.query().indexOf(':');
String domain = parameters.query().substring(definePrefix.length()).toLowerCase(); String prefix = parameters.query().substring(0, idx);
String domain = parameters.query().substring(idx + 1).toLowerCase();
// Use an HTML redirect here, so we can use relative URLs // Use an HTML redirect here, so we can use relative URLs
String view = switch (prefix) {
case "links" -> "links";
case "similar" -> "similar";
default -> "info";
};
response.raw().getOutputStream().println(""" response.raw().getOutputStream().println("""
<!DOCTYPE html> <!DOCTYPE html>
<html lang="en"> <html lang="en">
<meta charset="UTF-8"> <meta charset="UTF-8">
<title>Redirecting...</title> <title>Redirecting...</title>
<meta http-equiv="refresh" content="0; url=/site/%s"> <meta http-equiv="refresh" content="0; url=/site/%s?view=%s">
""".formatted(domain)); """.formatted(domain, view));
return true; return true;
} }

View File

@ -0,0 +1,73 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.browse.DbBrowseDomainsRandom;
import nu.marginalia.browse.DbBrowseDomainsSimilarCosine;
import nu.marginalia.browse.DbBrowseDomainsSimilarOldAlgo;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.search.results.BrowseResultCleaner;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static java.util.Collections.shuffle;
/**
 * Builds the {@link BrowseResultSet}s shown by the browse views: either a
 * random selection of domains, or the domains related to a named domain.
 *
 * <p>Every listing is passed through the removal predicate supplied by
 * {@link BrowseResultCleaner} before it is returned.
 */
public class SearchBrowseService {
    private final DbBrowseDomainsRandom randomDomains;
    private final DbBrowseDomainsSimilarCosine similarDomains;
    private final DbBrowseDomainsSimilarOldAlgo similarDomainsOld;
    private final DbDomainQueries domainQueries;
    private final DomainBlacklist blacklist;
    private final BrowseResultCleaner browseResultCleaner;

    @Inject
    public SearchBrowseService(DbBrowseDomainsRandom randomDomains,
                               DbBrowseDomainsSimilarCosine similarDomains,
                               DbBrowseDomainsSimilarOldAlgo similarDomainsOld,
                               DbDomainQueries domainQueries,
                               DomainBlacklist blacklist,
                               BrowseResultCleaner browseResultCleaner)
    {
        this.randomDomains = randomDomains;
        this.similarDomains = similarDomains;
        this.similarDomainsOld = similarDomainsOld;
        this.domainQueries = domainQueries;
        this.blacklist = blacklist;
        this.browseResultCleaner = browseResultCleaner;
    }

    /**
     * Fetches random domains for the given set and filters them.
     *
     * @param set forwarded unchanged to {@code getRandomDomains}
     *            (presumably selects which pool of random domains to draw from; verify against the DAO)
     * @return the filtered results wrapped in a {@link BrowseResultSet}
     */
    public BrowseResultSet getRandomEntries(int set) {
        final List<BrowseResult> picked = randomDomains.getRandomDomains(25, blacklist, set);

        discardUnwanted(picked);

        return new BrowseResultSet(picked);
    }

    /**
     * Fetches the domains related to {@code word}, filtered and in shuffled order.
     *
     * <p>The cosine-based neighbor query is tried first. If fewer than 25 results
     * survive filtering, the results of the older neighbor query are merged in
     * (de-duplicated through a {@link HashSet}) and the filter is applied again.
     *
     * @param word the domain name to find related domains for
     * @return the related domains, tagged with {@code word}
     */
    public BrowseResultSet getRelatedEntries(String word) {
        // NOTE(review): getDomainId's behaviour for a domain that is not in the
        // database is not visible from here; confirm how callers passing an
        // unknown domain are expected to be handled.
        final var domainId = domainQueries.getDomainId(new EdgeDomain(word));

        final var related = similarDomains.getDomainNeighborsAdjacentCosine(domainId, blacklist, 256);
        discardUnwanted(related);

        // Too few results: supplement them with the older, lower-quality algorithm
        if (related.size() < 25) {
            final Set<BrowseResult> combined = new HashSet<>(related);
            combined.addAll(similarDomainsOld.getDomainNeighborsAdjacent(domainId, blacklist, 50));

            related.clear();
            related.addAll(combined);

            discardUnwanted(related);
        }

        // Randomize the order so that repeated visits feel less repetitive
        shuffle(related);

        return new BrowseResultSet(related, word);
    }

    /** Removes, in place, every entry matched by the cleaner's removal predicate. */
    private void discardUnwanted(List<BrowseResult> candidates) {
        candidates.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
    }
}

View File

@ -1,5 +1,7 @@
package nu.marginalia.search.svc; package nu.marginalia.search.svc;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.client.Context; import nu.marginalia.client.Context;
import nu.marginalia.db.DbDomainQueries; import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
@ -15,6 +17,7 @@ import spark.*;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.io.IOException; import java.io.IOException;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.OptionalInt; import java.util.OptionalInt;
@ -25,6 +28,7 @@ public class SearchSiteInfoService {
private final DomainInformationService domainInformationService; private final DomainInformationService domainInformationService;
private final SearchFlagSiteService flagSiteService; private final SearchFlagSiteService flagSiteService;
private final DbDomainQueries domainQueries; private final DbDomainQueries domainQueries;
private final SearchBrowseService browseService;
private final MustacheRenderer<Object> renderer; private final MustacheRenderer<Object> renderer;
@Inject @Inject
@ -32,13 +36,15 @@ public class SearchSiteInfoService {
DomainInformationService domainInformationService, DomainInformationService domainInformationService,
RendererFactory rendererFactory, RendererFactory rendererFactory,
SearchFlagSiteService flagSiteService, SearchFlagSiteService flagSiteService,
DbDomainQueries domainQueries) throws IOException { DbDomainQueries domainQueries, SearchBrowseService browseService) throws IOException {
this.searchOperator = searchOperator; this.searchOperator = searchOperator;
this.domainInformationService = domainInformationService; this.domainInformationService = domainInformationService;
this.flagSiteService = flagSiteService; this.flagSiteService = flagSiteService;
this.domainQueries = domainQueries; this.domainQueries = domainQueries;
this.renderer = rendererFactory.renderer("search/site-info/site-info"); this.renderer = rendererFactory.renderer("search/site-info/site-info");
this.browseService = browseService;
} }
public Object handle(Request request, Response response) throws SQLException { public Object handle(Request request, Response response) throws SQLException {
@ -55,6 +61,7 @@ public class SearchSiteInfoService {
case "links" -> listLinks(ctx, domainName); case "links" -> listLinks(ctx, domainName);
case "docs" -> listDocs(ctx, domainName); case "docs" -> listDocs(ctx, domainName);
case "info" -> siteInfo(ctx, domainName); case "info" -> siteInfo(ctx, domainName);
case "similar" -> listSimilar(ctx, domainName);
case "report" -> reportSite(ctx, domainName); case "report" -> reportSite(ctx, domainName);
default -> siteInfo(ctx, domainName); default -> siteInfo(ctx, domainName);
}; };
@ -129,7 +136,12 @@ public class SearchSiteInfoService {
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
searchOperator.doBacklinkSearch(ctx, domainName)); searchOperator.doBacklinkSearch(ctx, domainName));
} }
/**
 * Builds the model for the "similar" view of a site: the domain name, its
 * id (or -1 when {@code tryGetDomainId} yields nothing), and the related
 * domains reported by the browse service.
 *
 * @param ctx        request context; not used by this method
 * @param domainName the domain whose similar sites are listed
 */
private SimilarSites listSimilar(Context ctx, String domainName) {
    final long domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
    final BrowseResultSet related = browseService.getRelatedEntries(domainName);

    return new SimilarSites(domainName, domainId, related);
}
private Docs listDocs(Context ctx, String domainName) { private Docs listDocs(Context ctx, String domainName) {
return new Docs(domainName, return new Docs(domainName,
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
@ -210,6 +222,18 @@ public class SearchSiteInfoService {
} }
} }
/**
 * View model for the "similar" tab of the site info page.
 *
 * @param view     marks which view is active; the convenience constructor sets {@code "similar"} to true
 * @param domain   the domain the listing was built for
 * @param domainId the domain's id; values of zero or below are treated as unknown by {@link #isKnown()}
 * @param results  the similar domains to display
 */
public record SimilarSites(Map<String, Boolean> view,
                           String domain,
                           long domainId,
                           List<BrowseResult> results)
{
    /** Creates the model with the "similar" view flag set, copying the result set's entries into a new list. */
    public SimilarSites(String domain, long domainId, BrowseResultSet results) {
        this(Map.of("similar", true),
             domain,
             domainId,
             new ArrayList<>(results.results()));
    }

    /** The search query that corresponds to this view. */
    public String query() {
        return "similar:" + domain;
    }

    /** True when the domain id is positive. */
    public boolean isKnown() {
        return domainId > 0;
    }
}
public record ReportDomain( public record ReportDomain(
Map<String, Boolean> view, Map<String, Boolean> view,
String domain, String domain,

View File

@ -42,6 +42,7 @@
</div> </div>
{{#each results}}{{>search/parts/search-result}}{{/each}} {{#each results}}{{>search/parts/search-result}}{{/each}}
{{/if}} {{/if}}
{{#if view.docs}} {{#if view.docs}}
<div class="infobox"> <div class="infobox">
Showing documents found in {{domain}}. Showing documents found in {{domain}}.
@ -49,9 +50,18 @@
{{#each results}}{{>search/parts/search-result}}{{/each}} {{#each results}}{{>search/parts/search-result}}{{/each}}
{{/if}} {{/if}}
{{#if view.report}} {{#if view.report}}
{{>search/site-info/site-info-report}} {{>search/site-info/site-info-report}}
{{/if}} {{/if}}
{{#if view.similar}}
<div class="infobox">Showing domains similar to {{domain}}</div>
<section class="cards">
{{#each results}}{{>search/browse-result}}{{/each}}
</section>
{{/if}}
{{>search/parts/search-footer}} {{>search/parts/search-footer}}
</body> </body>