Merge pull request 'Advisory search terms' (#115) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/115
This commit is contained in:
Viktor Lofgren 2022-09-14 16:32:06 +02:00
commit ba6519ef1d
6 changed files with 99 additions and 28 deletions

View File

@ -168,6 +168,7 @@ public class EdgeIndexQueryService {
results.addAll(performSearch(sq));
}
for (var result : results) {
addResultScores(result);
}
@ -207,15 +208,17 @@ public class EdgeIndexQueryService {
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
if (!budget.hasTimeLeft()) {
logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
continue;
}
if (fetchSize <= results.size())
if (results.size() >= fetchSize) {
break;
}
IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
long[] buf = new long[8192];
long[] buf = new long[fetchSize];
while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) {
int cnt = query.getMoreResults(buf, budget);
@ -326,6 +329,16 @@ public class EdgeIndexQueryService {
includes.add(word.getAsInt());
}
for (var advice : request.searchTermsAdvice) {
var word = lookUpWord(advice);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + advice);
return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
}
includes.add(word.getAsInt());
}
for (var exclude : request.searchTermsExclude) {
lookUpWord(exclude).ifPresent(excludes::add);
}

View File

@ -15,13 +15,15 @@ public class EdgeSearchSubquery {
public final List<String> searchTermsInclude;
public final List<String> searchTermsExclude;
public final List<String> searchTermsAdvice;
public final IndexBlock block;
private double value = 0;
public EdgeSearchSubquery(List<String> searchTermsInclude, List<String> searchTermsExclude, IndexBlock block) {
/**
 * Creates a subquery from three term lists and the index block it targets.
 *
 * The lists are stored by reference; no defensive copy is made here.
 *
 * @param searchTermsInclude terms assigned to {@code searchTermsInclude}
 * @param searchTermsExclude terms assigned to {@code searchTermsExclude}
 * @param searchTermsAdvice  terms assigned to {@code searchTermsAdvice}
 * @param block              index block assigned to {@code block}
 */
public EdgeSearchSubquery(List<String> searchTermsInclude, List<String> searchTermsExclude, List<String> searchTermsAdvice, IndexBlock block) {
this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude;
this.searchTermsAdvice = searchTermsAdvice;
this.block = block;
}
@ -29,6 +31,7 @@ public class EdgeSearchSubquery {
return new EdgeSearchSubquery(
new CopyOnWriteArrayList<>(searchTermsInclude),
new CopyOnWriteArrayList<>(searchTermsExclude),
new CopyOnWriteArrayList<>(searchTermsAdvice),
block).setValue(value);
}

View File

@ -125,12 +125,16 @@ public class QueryFactory {
for (var parts : queryPermutations) {
List<String> searchTermsExclude = new ArrayList<>();
List<String> searchTermsInclude = new ArrayList<>();
List<String> searchTermsAdvice = new ArrayList<>();
for (Token t : parts) {
switch (t.type) {
case EXCLUDE_TERM:
searchTermsExclude.add(t.str);
break;
case ADVICE_TERM:
searchTermsAdvice.add(t.str);
break;
case LITERAL_TERM: // fallthrough;
case QUOT_TERM:
searchTermsInclude.add(t.str);
@ -144,7 +148,7 @@ public class QueryFactory {
}
}
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.Title);
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, IndexBlock.Title);
params.profile().addTacitTerms(subquery);
params.jsSetting().addTacitTerms(subquery);

View File

@ -28,12 +28,14 @@ public class QueryParser {
}
public List<Token> parse(String query) {
List<Token> tokens = extractBasicTokens(query);
List<Token> basicTokens = extractBasicTokens(query);
List<Token> parsedTokens = new ArrayList<>(basicTokens.size());
for (int i = 0; i < basicTokens.size(); i++) {
var t = basicTokens.get(i);
for (int i = 0; i < tokens.size(); i++) {
var t = tokens.get(i);
if (t.type == TokenType.QUOT) {
tokens.set(i, new Token(TokenType.QUOT_TERM,
parsedTokens.add(new Token(TokenType.QUOT_TERM,
t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER),
t.displayStr));
}
@ -41,26 +43,54 @@ public class QueryParser {
&& (t.str.endsWith(":")||t.str.endsWith("."))
&& t.str.length() > 1)
{
tokens.set(i,
new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1),
t.displayStr));
parsedTokens.add(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1), t.displayStr));
}
}
for (int i = 0; i < tokens.size() - 1; i++) {
var t = tokens.get(i);
var tn = tokens.get(i+1);
for (int i = 0; i < basicTokens.size() - 1; i++) {
var t = basicTokens.get(i);
var tn = basicTokens.get(i+1);
if (t.type == TokenType.MINUS) {
tokens.set(i, new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str));
tokens.remove(i+1);
if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) {
parsedTokens.add(new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str));
i++;
}
}
return tokens;
for (int i = 0; i < basicTokens.size(); i++) {
var t = basicTokens.get(i);
if (t.type == TokenType.LITERAL_TERM) {
parsedTokens.add(t);
continue;
}
else if (t.type != TokenType.LPAREN) {
continue;
}
int end = i+1;
for (; end < basicTokens.size(); end++) {
if (basicTokens.get(end).type == TokenType.RPAREN) {
break;
}
}
if (end == basicTokens.size()) {
continue;
}
for (int j = i+1; j < end; j++) {
var tok = basicTokens.get(j);
if (tok.type == TokenType.LITERAL_TERM) {
parsedTokens.add(new Token(TokenType.ADVICE_TERM, tok.str, "(" + tok.str + ")"));
}
}
i = end;
}
return parsedTokens;
}
private static final Pattern noisePattern = Pattern.compile("[(),]");
private static final Pattern noisePattern = Pattern.compile("[,]");
public List<Token> extractBasicTokens(String rawQuery) {
List<Token> tokens = new ArrayList<>();
@ -69,7 +99,14 @@ public class QueryParser {
for (int i = 0; i < query.length(); i++) {
int chr = query.charAt(i);
if ('"' == chr) {
if ('(' == chr) {
tokens.add(new Token(TokenType.LPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1)));
}
else if (')' == chr) {
tokens.add(new Token(TokenType.RPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1)));
}
else if ('"' == chr) {
int end = query.indexOf('"', i+1);
if (end == -1) {
end = query.length();
@ -96,14 +133,16 @@ public class QueryParser {
//
}
else {
int end = query.indexOf(' ', i);
if (end == -1) {
end = query.length();
int end = i+1;
for (; end < query.length(); end++) {
if (query.charAt(end) == ' ' || query.charAt(end) == ')')
break;
}
tokens.add(new Token(TokenType.LITERAL_TERM,
query.substring(i, end).toLowerCase(),
query.substring(i, end)));
i = end;
i = end-1;
}
}
return tokens;
@ -431,9 +470,15 @@ class Token {
enum TokenType {
TERM,
QUOT,
MINUS,
LITERAL_TERM,
QUOT_TERM,
EXCLUDE_TERM,
ADVICE_TERM,
QUOT,
MINUS,
LPAREN,
RPAREN
}

View File

@ -33,7 +33,7 @@ public class EdgeSearchQueryIndexService {
public List<EdgeUrlDetails> performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
List<EdgeSearchSubquery> sqs = new ArrayList<>();
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), Collections.emptyList(), block));
EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, limitPerDomain, limitTotal, "", 150, 2048);

View File

@ -26,6 +26,12 @@ class QueryParserTest {
parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
}
/**
 * Smoke test for parsing queries: runs the parser on two sample queries and
 * prints the returned token lists to standard output.
 *
 * NOTE(review): this test contains no assertions, so it can only fail if
 * {@code parse} throws. Consider asserting on the returned tokens.
 */
@Test
public void testAdviceString() {
// Query combining a bare word, a parenthesised word, a quoted phrase and a '-'-prefixed word.
System.out.println(parser.parse("alcibiades (plato) \"my query\" -cars"));
// Plain two-word query with no parentheses, quotes or minus sign.
System.out.println(parser.parse("universals plato"));
}
@Test
void variantQueries() {
var r = parser.parse("car stemming");