Merge pull request 'Experimental domain-searching feature' (#63) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/63
This commit is contained in:
Viktor Lofgren 2022-07-28 20:23:09 +02:00
commit bbb0bf4b7e
4 changed files with 25 additions and 13 deletions

View File

@ -277,7 +277,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
// this is safe, string concatenation is of integers
String inStmt = urlId.stream().map(id -> Integer.toString(id.id())).collect(Collectors.joining(", ", "(", ")"));
var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL_VIEW INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID WHERE VISITED_URLS<500 AND QUALITY>-10 AND EC_URL_VIEW.ID IN " + inStmt + " ORDER BY RANK ASC");
var rsp = stmt.executeQuery("SELECT DOMAIN_ID, DOMAIN_NAME FROM EC_URL_VIEW INNER JOIN DOMAIN_METADATA ON EC_URL_VIEW.DOMAIN_ID=DOMAIN_METADATA.ID WHERE VISITED_URLS<750 AND QUALITY>-10 AND EC_URL_VIEW.ID IN " + inStmt + " ORDER BY RANK ASC");
while (rsp.next()) {
int id = rsp.getInt(1);
String domain = rsp.getString(2);

View File

@ -65,7 +65,7 @@ public class EdgeIndexClient extends AbstractDynamicClient {
}
@CheckReturnValue
public List<EdgeDomainSearchResults> queryDomains(Context ctx, EdgeDomainSearchSpecification... specs) {
public List<EdgeDomainSearchResults> queryDomains(Context ctx, List<EdgeDomainSearchSpecification> specs) {
return Observable.fromArray(specs)
.concatMap(s -> postGet(ctx, "/search-domain/", s, EdgeDomainSearchResults.class)
.subscribeOn(Schedulers.io())

View File

@ -112,22 +112,34 @@ public class EdgeSearchOperator {
}
private List<BrowseResult> getDomainResults(Context ctx, EdgeSearchSpecification specs) {
var requests = specs.subqueries.stream()
List<Integer> buckets = specs.buckets.stream().limit(specs.stagger ? 2 : 1).toList();
List<String> keywords = specs.subqueries.stream()
.filter(sq -> sq.searchTermsExclude.isEmpty() && sq.searchTermsInclude.size() == 1)
.flatMap(sq -> sq.searchTermsInclude.stream())
.map(sq -> sq.searchTermsInclude.get(0))
.distinct()
.flatMap(keyword ->
specs.buckets.stream().map(bucket -> new EdgeDomainSearchSpecification(bucket, IndexBlock.Title, keyword, 2_000_000/specs.buckets.size(), 10, 25))
)
.toArray(EdgeDomainSearchSpecification[]::new);
.toList();
if (requests.length == 0)
List<EdgeDomainSearchSpecification> requests = new ArrayList<>(keywords.size() * buckets.size());
for (var keyword : keywords) {
for (var bucket : buckets) {
requests.add(new EdgeDomainSearchSpecification(bucket, IndexBlock.Title, keyword,
1_000_000, 10, 25));
}
}
if (requests.isEmpty()) {
return Collections.emptyList();
}
List<EdgeId<EdgeUrl>> results = indexClient.queryDomains(ctx, requests)
.stream().flatMap(rs -> rs.results.stream()).distinct().toList();
Set<EdgeId<EdgeUrl>> results = new LinkedHashSet<>();
return edgeDataStoreDao.getBrowseResultFromUrlIds(results);
for (var result : indexClient.queryDomains(ctx, requests)) {
results.addAll(result.results);
}
return edgeDataStoreDao.getBrowseResultFromUrlIds(new ArrayList<>(results));
}
private String getEvalResult(@Nullable Future<String> eval) {