mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Experimental domain-searching feature
This commit is contained in:
parent
092045ae8e
commit
bf328a0597
@ -2,14 +2,13 @@ package nu.marginalia.wmsa.edge.data.dao;
|
|||||||
|
|
||||||
import com.google.inject.ImplementedBy;
|
import com.google.inject.ImplementedBy;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||||
import nu.marginalia.wmsa.edge.model.*;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
@ImplementedBy(EdgeDataStoreDaoImpl.class)
|
@ImplementedBy(EdgeDataStoreDaoImpl.class)
|
||||||
public interface EdgeDataStoreDao {
|
public interface EdgeDataStoreDao {
|
||||||
@ -18,6 +17,9 @@ public interface EdgeDataStoreDao {
|
|||||||
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
List<BrowseResult> getDomainNeighborsAdjacent(EdgeId<EdgeDomain> domainId, EdgeDomainBlacklist backlist, int count);
|
||||||
|
|
||||||
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist);
|
||||||
|
|
||||||
|
/**
 * Resolves a batch of URL ids into browse results for the domains those URLs belong to.
 *
 * @param urlId ids of the URLs to resolve
 * @return the browse results found for the given ids
 */
List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId);
|
||||||
|
|
||||||
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
||||||
|
|
||||||
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
||||||
|
@ -19,6 +19,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
|
||||||
public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||||
@ -264,6 +265,31 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
return domains;
|
return domains;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId) {
|
||||||
|
List<BrowseResult> ret = new ArrayList<>(urlId.size());
|
||||||
|
|
||||||
|
try (var conn = dataSource.getConnection()) {
|
||||||
|
try (var stmt = conn.createStatement()) {
|
||||||
|
// this is safe, string cocatenation is of integers
|
||||||
|
String inStmt = urlId.stream().map(id -> Integer.toString(id.id())).collect(Collectors.joining(", ", "(", ")"));
|
||||||
|
|
||||||
|
var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL WHERE ID IN " + inStmt);
|
||||||
|
while (rsp.next()) {
|
||||||
|
int id = rsp.getInt(1);
|
||||||
|
String domain = rsp.getString(2);
|
||||||
|
|
||||||
|
ret.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("SQL error", ex);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
|
public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
|
||||||
|
@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.model.search;
|
|||||||
|
|
||||||
import lombok.*;
|
import lombok.*;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
import nu.marginalia.wmsa.edge.search.model.EdgeSearchRankingSymbols;
|
import nu.marginalia.wmsa.edge.search.model.EdgeSearchRankingSymbols;
|
||||||
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
@ -21,12 +21,9 @@ public class EdgeUrlDetails {
|
|||||||
public String format;
|
public String format;
|
||||||
public int features;
|
public int features;
|
||||||
|
|
||||||
|
public String ip;
|
||||||
|
|
||||||
public String ip; // BROKEN
|
|
||||||
public EdgeDomainIndexingState domainState;
|
public EdgeDomainIndexingState domainState;
|
||||||
|
|
||||||
|
|
||||||
public int dataHash;
|
public int dataHash;
|
||||||
|
|
||||||
public EdgePageScoreAdjustment urlQualityAdjustment;
|
public EdgePageScoreAdjustment urlQualityAdjustment;
|
||||||
|
@ -15,6 +15,8 @@ import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
|||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import nu.marginalia.wmsa.edge.model.search.*;
|
import nu.marginalia.wmsa.edge.model.search.*;
|
||||||
|
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||||
|
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
|
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
|
||||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
|
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
|
||||||
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
|
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
|
||||||
@ -97,15 +99,32 @@ public class EdgeSearchOperator {
|
|||||||
|
|
||||||
String evalResult = getEvalResult(eval);
|
String evalResult = getEvalResult(eval);
|
||||||
|
|
||||||
|
List<BrowseResult> domainResults = getDomainResults(ctx, processedQuery.specs);
|
||||||
|
|
||||||
return new DecoratedSearchResults(params,
|
return new DecoratedSearchResults(params,
|
||||||
getProblems(ctx, params.humanQuery(), evalResult, queryResults, processedQuery),
|
getProblems(ctx, params.humanQuery(), evalResult, queryResults, processedQuery),
|
||||||
evalResult,
|
evalResult,
|
||||||
definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(),
|
definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(),
|
||||||
queryResults.resultSet,
|
queryResults.resultSet,
|
||||||
|
domainResults,
|
||||||
processedQuery.domain,
|
processedQuery.domain,
|
||||||
getDomainId(processedQuery.domain));
|
getDomainId(processedQuery.domain));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<BrowseResult> getDomainResults(Context ctx, EdgeSearchSpecification specs) {
|
||||||
|
List<String> keywords = specs.subqueries.stream().filter(sq -> sq.searchTermsExclude.isEmpty() && sq.searchTermsInclude.size() == 1)
|
||||||
|
.findFirst().map(sq -> sq.searchTermsExclude).orElseGet(Collections::emptyList);
|
||||||
|
|
||||||
|
if (keywords.size() == 1) {
|
||||||
|
var request = new EdgeDomainSearchSpecification(specs.buckets.get(0), IndexBlock.TitleKeywords, keywords.get(0), 10_000, 10, 5);
|
||||||
|
var response = indexClient.queryDomains(ctx, request);
|
||||||
|
|
||||||
|
return edgeDataStoreDao.getBrowseResultFromUrlIds(response.results);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
private String getEvalResult(@Nullable Future<String> eval) {
|
private String getEvalResult(@Nullable Future<String> eval) {
|
||||||
if (eval == null || eval.isCancelled()) {
|
if (eval == null || eval.isCancelled()) {
|
||||||
return "";
|
return "";
|
||||||
|
@ -6,10 +6,10 @@ import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
|
|||||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
|
||||||
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
|
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
|
||||||
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
||||||
import nu.marginalia.wmsa.edge.search.model.BrowseResultSet;
|
import nu.marginalia.wmsa.edge.search.model.BrowseResultSet;
|
||||||
|
import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
|
||||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -28,6 +28,7 @@ public class BrowseCommand implements SearchCommandInterface {
|
|||||||
private final ScreenshotService screenshotService;
|
private final ScreenshotService screenshotService;
|
||||||
private final EdgeDomainBlacklist blacklist;
|
private final EdgeDomainBlacklist blacklist;
|
||||||
private final MustacheRenderer<BrowseResultSet> browseResultsRenderer;
|
private final MustacheRenderer<BrowseResultSet> browseResultsRenderer;
|
||||||
|
private final BrowseResultCleaner browseResultCleaner;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final Predicate<String> queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9]+$").asPredicate();
|
private final Predicate<String> queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9]+$").asPredicate();
|
||||||
|
|
||||||
@ -35,12 +36,14 @@ public class BrowseCommand implements SearchCommandInterface {
|
|||||||
public BrowseCommand(EdgeDataStoreDao edgeDataStoreDao,
|
public BrowseCommand(EdgeDataStoreDao edgeDataStoreDao,
|
||||||
ScreenshotService screenshotService,
|
ScreenshotService screenshotService,
|
||||||
EdgeDomainBlacklist blacklist,
|
EdgeDomainBlacklist blacklist,
|
||||||
RendererFactory rendererFactory)
|
RendererFactory rendererFactory,
|
||||||
|
BrowseResultCleaner browseResultCleaner)
|
||||||
throws IOException
|
throws IOException
|
||||||
{
|
{
|
||||||
this.edgeDataStoreDao = edgeDataStoreDao;
|
this.edgeDataStoreDao = edgeDataStoreDao;
|
||||||
this.screenshotService = screenshotService;
|
this.screenshotService = screenshotService;
|
||||||
this.blacklist = blacklist;
|
this.blacklist = blacklist;
|
||||||
|
this.browseResultCleaner = browseResultCleaner;
|
||||||
|
|
||||||
browseResultsRenderer = rendererFactory.renderer("edge/browse-results");
|
browseResultsRenderer = rendererFactory.renderer("edge/browse-results");
|
||||||
}
|
}
|
||||||
@ -66,9 +69,7 @@ public class BrowseCommand implements SearchCommandInterface {
|
|||||||
if ("random".equals(word)) {
|
if ("random".equals(word)) {
|
||||||
var results = edgeDataStoreDao.getRandomDomains(25, blacklist);
|
var results = edgeDataStoreDao.getRandomDomains(25, blacklist);
|
||||||
|
|
||||||
results.removeIf(res ->
|
results.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
|
||||||
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|
|
||||||
|| !domainHashes.add(res.domainHash()));
|
|
||||||
|
|
||||||
return new BrowseResultSet(results);
|
return new BrowseResultSet(results);
|
||||||
}
|
}
|
||||||
@ -76,9 +77,7 @@ public class BrowseCommand implements SearchCommandInterface {
|
|||||||
var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word));
|
var domain = edgeDataStoreDao.getDomainId(new EdgeDomain(word));
|
||||||
var neighbors = edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 45);
|
var neighbors = edgeDataStoreDao.getDomainNeighborsAdjacent(domain, blacklist, 45);
|
||||||
|
|
||||||
neighbors.removeIf(res ->
|
neighbors.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
|
||||||
!screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|
|
||||||
|| !domainHashes.add(res.domainHash()));
|
|
||||||
|
|
||||||
return new BrowseResultSet(neighbors);
|
return new BrowseResultSet(neighbors);
|
||||||
}
|
}
|
||||||
|
@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
|
|||||||
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
||||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
|
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
|
||||||
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
|
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
|
||||||
|
import nu.marginalia.wmsa.edge.search.results.BrowseResultCleaner;
|
||||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||||
|
|
||||||
@ -24,17 +25,21 @@ public class SearchCommand implements SearchCommandInterface {
|
|||||||
private final EdgeSearchOperator searchOperator;
|
private final EdgeSearchOperator searchOperator;
|
||||||
private final UnitConversion unitConversion;
|
private final UnitConversion unitConversion;
|
||||||
private final MustacheRenderer<DecoratedSearchResults> searchResultsRenderer;
|
private final MustacheRenderer<DecoratedSearchResults> searchResultsRenderer;
|
||||||
|
private BrowseResultCleaner browseResultCleaner;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public SearchCommand(EdgeDomainBlacklist blacklist,
|
public SearchCommand(EdgeDomainBlacklist blacklist,
|
||||||
EdgeDataStoreDao dataStoreDao,
|
EdgeDataStoreDao dataStoreDao,
|
||||||
EdgeSearchOperator searchOperator,
|
EdgeSearchOperator searchOperator,
|
||||||
UnitConversion unitConversion,
|
UnitConversion unitConversion,
|
||||||
RendererFactory rendererFactory) throws IOException {
|
RendererFactory rendererFactory,
|
||||||
|
BrowseResultCleaner browseResultCleaner
|
||||||
|
) throws IOException {
|
||||||
this.blacklist = blacklist;
|
this.blacklist = blacklist;
|
||||||
this.dataStoreDao = dataStoreDao;
|
this.dataStoreDao = dataStoreDao;
|
||||||
this.searchOperator = searchOperator;
|
this.searchOperator = searchOperator;
|
||||||
this.unitConversion = unitConversion;
|
this.unitConversion = unitConversion;
|
||||||
|
this.browseResultCleaner = browseResultCleaner;
|
||||||
|
|
||||||
searchResultsRenderer = rendererFactory.renderer("edge/search-results");
|
searchResultsRenderer = rendererFactory.renderer("edge/search-results");
|
||||||
}
|
}
|
||||||
@ -46,7 +51,8 @@ public class SearchCommand implements SearchCommandInterface {
|
|||||||
EdgeUserSearchParameters params = new EdgeUserSearchParameters(query, parameters.profile(), parameters.js());
|
EdgeUserSearchParameters params = new EdgeUserSearchParameters(query, parameters.profile(), parameters.js());
|
||||||
DecoratedSearchResults results = searchOperator.doSearch(ctx, params, eval);
|
DecoratedSearchResults results = searchOperator.doSearch(ctx, params, eval);
|
||||||
|
|
||||||
results.getResults().removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain)));
|
results.results.removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain)));
|
||||||
|
results.domainResults.removeIf(browseResultCleaner.shouldRemoveResultPredicate());
|
||||||
|
|
||||||
return Optional.of(searchResultsRenderer.render(results));
|
return Optional.of(searchResultsRenderer.render(results));
|
||||||
}
|
}
|
||||||
|
@ -14,7 +14,9 @@ public class DecoratedSearchResults {
|
|||||||
private final List<String> problems;
|
private final List<String> problems;
|
||||||
private final String evalResult;
|
private final String evalResult;
|
||||||
private final WikiArticles wiki;
|
private final WikiArticles wiki;
|
||||||
private final List<EdgeUrlDetails> results;
|
|
||||||
|
public final List<EdgeUrlDetails> results;
|
||||||
|
public final List<BrowseResult> domainResults;
|
||||||
|
|
||||||
private final String focusDomain;
|
private final String focusDomain;
|
||||||
private final int focusDomainId;
|
private final int focusDomainId;
|
||||||
|
@ -0,0 +1,28 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.search.results;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||||
|
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class BrowseResultCleaner {
|
||||||
|
private final ScreenshotService screenshotService;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public BrowseResultCleaner(ScreenshotService screenshotService) {
|
||||||
|
this.screenshotService = screenshotService;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Predicate<BrowseResult> shouldRemoveResultPredicate() {
|
||||||
|
Set<String> domainHashes = new HashSet<>(100);
|
||||||
|
|
||||||
|
return (res) -> !screenshotService.hasScreenshot(new EdgeId<>(res.domainId))
|
||||||
|
|| !domainHashes.add(res.domainHash());
|
||||||
|
}
|
||||||
|
}
|
@ -37,6 +37,7 @@
|
|||||||
</section>
|
</section>
|
||||||
{{/if}}
|
{{/if}}
|
||||||
|
|
||||||
|
{{#each domainResults}}{{>edge/browse-result}}{{/each}}
|
||||||
{{#each results}}{{>edge/search-result}}{{/each}}
|
{{#each results}}{{>edge/search-result}}{{/each}}
|
||||||
|
|
||||||
{{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}
|
{{#unless evalResult}}{{#if problems}}<section class="card problems onlyscreen"><h2>Suggestions</h2><ul class="onlyscreen search-problems">{{#each problems}}<li>{{{.}}}</li>{{/each}}</ul></section> {{/if}}{{/unless}}
|
||||||
|
Loading…
Reference in New Issue
Block a user