Fix exclude term duplication from js flag.

This commit is contained in:
vlofgren 2022-07-28 14:51:51 +02:00
parent 0903d9f727
commit fd1f3f796e
10 changed files with 69 additions and 48 deletions

View File

@ -1,9 +1,12 @@
package nu.marginalia.wmsa.edge.model.search; package nu.marginalia.wmsa.edge.model.search;
import lombok.*; import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import java.util.List; import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
@ToString @ToString
@Getter @Getter
@ -23,7 +26,10 @@ public class EdgeSearchSubquery {
} }
public EdgeSearchSubquery withBlock(IndexBlock block) { public EdgeSearchSubquery withBlock(IndexBlock block) {
return new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, block); return new EdgeSearchSubquery(
new CopyOnWriteArrayList<>(searchTermsInclude),
new CopyOnWriteArrayList<>(searchTermsExclude),
block);
} }
public int termSize() { public int termSize() {

View File

@ -88,7 +88,7 @@ public class EdgeSearchOperator {
} }
public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future<String> eval) { public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future<String> eval) {
Observable<WikiArticles> definitions = getWikiArticle(ctx, params.getHumanQuery()); Observable<WikiArticles> definitions = getWikiArticle(ctx, params.humanQuery());
EdgeSearchQuery processedQuery = queryFactory.createQuery(params); EdgeSearchQuery processedQuery = queryFactory.createQuery(params);
logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ',')); logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
@ -98,7 +98,7 @@ public class EdgeSearchOperator {
String evalResult = getEvalResult(eval); String evalResult = getEvalResult(eval);
return new DecoratedSearchResults(params, return new DecoratedSearchResults(params,
getProblems(ctx, params.getHumanQuery(), evalResult, queryResults, processedQuery), getProblems(ctx, params.humanQuery(), evalResult, queryResults, processedQuery),
evalResult, evalResult,
definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(), definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(),
queryResults.resultSet, queryResults.resultSet,

View File

@ -3,49 +3,39 @@ package nu.marginalia.wmsa.edge.search;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
public enum EdgeSearchProfile { public enum EdgeSearchProfile {
DEFAULT("default", DEFAULT("default",
Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
0, 1), 0, 1),
MODERN("modern", MODERN("modern",
Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
2), 2),
CORPO("corpo", CORPO("corpo",
Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
4, 5, 6, 7), 4, 5, 6, 7),
YOLO("yolo", YOLO("yolo",
Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
0, 2, 1, 3, 4, 6), 0, 2, 1, 3, 4, 6),
CORPO_CLEAN("corpo-clean", CORPO_CLEAN("corpo-clean",
Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
4, 5), 4, 5),
ACADEMIA("academia", ACADEMIA("academia",
Collections.emptyList(),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
3), 3),
; ;
public final String name; public final String name;
public final List<String> additionalSearchTerm;
public final List<Integer> buckets; public final List<Integer> buckets;
public final List<IndexBlock> indexBlocks; public final List<IndexBlock> indexBlocks;
EdgeSearchProfile(String name, EdgeSearchProfile(String name,
List<String> additionalSearchTerm,
List<IndexBlock> indexBlocks, List<IndexBlock> indexBlocks,
int... buckets) { int... buckets) {
this.name = name; this.name = name;
this.additionalSearchTerm = additionalSearchTerm;
this.indexBlocks = indexBlocks; this.indexBlocks = indexBlocks;
this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList()); this.buckets = Arrays.stream(buckets).boxed().collect(Collectors.toList());
} }

View File

@ -15,6 +15,7 @@ import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service; import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.search.command.CommandEvaluator; import nu.marginalia.wmsa.edge.search.command.CommandEvaluator;
import nu.marginalia.wmsa.edge.search.command.SearchJsParameter;
import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.command.SearchParameters;
import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; import nu.marginalia.wmsa.edge.search.exceptions.RedirectException;
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
@ -130,7 +131,7 @@ public class EdgeSearchService extends Service {
final String humanQuery = queryParam.trim(); final String humanQuery = queryParam.trim();
var results = searchOperator.doApiSearch(ctx, new EdgeUserSearchParameters(humanQuery, profile, "")); var results = searchOperator.doApiSearch(ctx, new EdgeUserSearchParameters(humanQuery, profile, SearchJsParameter.DEFAULT));
return new ApiSearchResults("RESTRICTED", humanQuery, results.stream().map(ApiSearchResult::new).limit(limit).collect(Collectors.toList())); return new ApiSearchResults("RESTRICTED", humanQuery, results.stream().map(ApiSearchResult::new).limit(limit).collect(Collectors.toList()));
} }
@ -151,7 +152,9 @@ public class EdgeSearchService extends Service {
var params = new SearchParameters( var params = new SearchParameters(
EdgeSearchProfile.getSearchProfile(profileStr), EdgeSearchProfile.getSearchProfile(profileStr),
Optional.ofNullable(request.queryParams("js")).orElse("default")); SearchJsParameter.parse(request.queryParams("js"))
);
try { try {
return searchCommandEvaulator.eval(ctx, params, humanQuery); return searchCommandEvaulator.eval(ctx, params, humanQuery);
} }

View File

@ -0,0 +1,24 @@
package nu.marginalia.wmsa.edge.search.command;
import javax.annotation.Nullable;
/**
 * The recognised settings of the {@code js} search parameter.
 *
 * <p>Each constant pairs the textual form of the setting with zero or more
 * search terms that are to be implicitly excluded when the setting is active.
 */
public enum SearchJsParameter {
    /** Textual form {@code "default"}; carries no implicit exclude terms. */
    DEFAULT("default"),
    /** Textual form {@code "no-js"}; implicitly excludes the term {@code js:true}. */
    DENY_JS("no-js", "js:true"),
    /** Textual form {@code "yes-js"}; implicitly excludes the term {@code js:false}. */
    REQUIRE_JS("yes-js", "js:false");

    /** Textual form of this setting, as matched by {@link #parse(String)}. */
    public final String value;

    /**
     * Search terms implicitly excluded while this setting is active; empty for
     * {@link #DEFAULT}. The array is exposed directly, so callers should treat
     * it as read-only. (The identifier's spelling is kept as-is because it is
     * part of the public surface.)
     */
    public final String[] implictExcludeSearchTerms;

    SearchJsParameter(String value, String... implictExcludeSearchTerms) {
        this.value = value;
        this.implictExcludeSearchTerms = implictExcludeSearchTerms;
    }

    /**
     * Resolves a textual setting to its constant.
     *
     * @param value the raw setting; may be {@code null}
     * @return the constant whose {@link #value} equals {@code value} exactly,
     *         or {@link #DEFAULT} when {@code value} is {@code null} or
     *         matches no constant
     */
    public static SearchJsParameter parse(@Nullable String value) {
        for (SearchJsParameter candidate : values()) {
            // Receiver is never null, so a null argument simply fails to match.
            if (candidate.value.equals(value)) {
                return candidate;
            }
        }
        return DEFAULT;
    }
}

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.search.command;
import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.EdgeSearchProfile;
public record SearchParameters(EdgeSearchProfile profile, String js) { public record SearchParameters(EdgeSearchProfile profile, SearchJsParameter js) {
public String profileStr() { public String profileStr() {
return profile.name; return profile.name;
} }

View File

@ -43,7 +43,8 @@ public class SearchCommand implements SearchCommandInterface {
public Optional<Object> process(Context ctx, SearchParameters parameters, String query) { public Optional<Object> process(Context ctx, SearchParameters parameters, String query) {
@CheckForNull Future<String> eval = unitConversion.tryEval(ctx, query); @CheckForNull Future<String> eval = unitConversion.tryEval(ctx, query);
DecoratedSearchResults results = searchOperator.doSearch(ctx, new EdgeUserSearchParameters(query, parameters.profile(), parameters.js()), eval); EdgeUserSearchParameters params = new EdgeUserSearchParameters(query, parameters.profile(), parameters.js());
DecoratedSearchResults results = searchOperator.doSearch(ctx, params, eval);
results.getResults().removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain))); results.getResults().removeIf(detail -> blacklist.isBlacklisted(dataStoreDao.getDomainId(detail.url.domain)));

View File

@ -20,12 +20,12 @@ public class DecoratedSearchResults {
private final int focusDomainId; private final int focusDomainId;
public String getQuery() { public String getQuery() {
return params.humanQuery; return params.humanQuery();
} }
public String getProfile() { public String getProfile() {
return params.getProfile().name; return params.profile().name;
} }
public String getJs() { public String getJs() {
return params.jsSetting; return params.jsSetting().value;
} }
} }

View File

@ -2,9 +2,9 @@ package nu.marginalia.wmsa.edge.search.query;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
@ -39,15 +39,26 @@ public class QueryFactory {
} }
public EdgeSearchQuery createQuery(EdgeUserSearchParameters params) { public EdgeSearchQuery createQuery(EdgeUserSearchParameters params) {
final var profile = params.getProfile(); final var profile = params.profile();
final var jsSetting = params.getJsSetting();
final var processedQuery = createQuery(getParser(), params); final var processedQuery = createQuery(getParser(), params);
processedQuery.specs.experimental = EdgeSearchProfile.CORPO.equals(profile); processedQuery.specs.experimental = EdgeSearchProfile.CORPO.equals(profile);
processedQuery.specs.stagger = EdgeSearchProfile.YOLO.equals(profile); processedQuery.specs.stagger = EdgeSearchProfile.YOLO.equals(profile);
List<EdgeSearchSubquery> subqueries = new ArrayList<>(processedQuery.specs.subqueries.size() * profile.indexBlocks.size()); final var newSubqueries = reevaluateSubqueries(processedQuery, params);
processedQuery.specs.subqueries.clear();
processedQuery.specs.subqueries.addAll(newSubqueries);
return processedQuery;
}
private List<EdgeSearchSubquery> reevaluateSubqueries(EdgeSearchQuery processedQuery, EdgeUserSearchParameters params) {
final var jsSetting = params.jsSetting();
final var profile = params.profile();
List<EdgeSearchSubquery> subqueries =
new ArrayList<>(processedQuery.specs.subqueries.size() * profile.indexBlocks.size());
for (var sq : processedQuery.specs.subqueries) { for (var sq : processedQuery.specs.subqueries) {
for (var block : profile.indexBlocks) { for (var block : profile.indexBlocks) {
@ -55,28 +66,19 @@ public class QueryFactory {
} }
} }
processedQuery.specs.subqueries.clear(); subqueries.forEach(sq -> {
processedQuery.specs.subqueries.addAll(subqueries); sq.searchTermsExclude.addAll(Arrays.asList(jsSetting.implictExcludeSearchTerms));
processedQuery.specs.subqueries.forEach(sq -> {
sq.searchTermsInclude.addAll(profile.additionalSearchTerm);
if (jsSetting.equals("yes-js")) {
sq.searchTermsExclude.add("js:false");
}
if (jsSetting.equals("no-js")) {
sq.searchTermsExclude.add("js:true");
}
}); });
processedQuery.specs.subqueries.sort(Comparator.comparing(sq -> -sq.termSize()*2.3 + sq.block.sortOrder)); subqueries.sort(Comparator.comparing(sq -> -sq.termSize()*2.3 + sq.block.sortOrder));
return processedQuery; return subqueries;
} }
public EdgeSearchQuery createQuery(QueryParser queryParser, EdgeUserSearchParameters params) { public EdgeSearchQuery createQuery(QueryParser queryParser, EdgeUserSearchParameters params) {
final var query = params.humanQuery; final var query = params.humanQuery();
final var profile = params.getProfile(); final var profile = params.profile();
if (query.length() > 1000) { if (query.length() > 1000) {
Spark.halt(HttpStatus.BAD_REQUEST_400, "That's too much, man"); Spark.halt(HttpStatus.BAD_REQUEST_400, "That's too much, man");

View File

@ -1,12 +1,7 @@
package nu.marginalia.wmsa.edge.search.query.model; package nu.marginalia.wmsa.edge.search.query.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.EdgeSearchProfile;
import nu.marginalia.wmsa.edge.search.command.SearchJsParameter;
@AllArgsConstructor @Getter public record EdgeUserSearchParameters (String humanQuery, EdgeSearchProfile profile, SearchJsParameter jsSetting){
public class EdgeUserSearchParameters {
public final String humanQuery;
public final EdgeSearchProfile profile;
public final String jsSetting;
} }