From 2e740bb7bd0ec6a13fd0813d86ef46e4d622a556 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Wed, 14 Sep 2022 16:31:34 +0200 Subject: [PATCH] Add advisory search terms that do not affect ranking. --- .../edge/index/svc/EdgeIndexQueryService.java | 19 +++- .../edge/model/search/EdgeSearchSubquery.java | 5 +- .../wmsa/edge/search/query/QueryFactory.java | 6 +- .../wmsa/edge/search/query/QueryParser.java | 89 ++++++++++++++----- .../svc/EdgeSearchQueryIndexService.java | 2 +- .../edge/search/query/QueryParserTest.java | 6 ++ 6 files changed, 99 insertions(+), 28 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index ddc7ef69..95af9493 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -168,6 +168,7 @@ public class EdgeIndexQueryService { results.addAll(performSearch(sq)); } + for (var result : results) { addResultScores(result); } @@ -207,15 +208,17 @@ public class EdgeIndexQueryService { final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT); if (!budget.hasTimeLeft()) { - logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude); + logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude); continue; + } - if (fetchSize <= results.size()) + if (results.size() >= fetchSize) { break; + } IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms); - long[] buf = new long[8192]; + long[] buf = new long[fetchSize]; while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) { int cnt = query.getMoreResults(buf, budget); @@ -326,6 +329,16 @@ public class EdgeIndexQueryService { includes.add(word.getAsInt()); } + + for (var advice : request.searchTermsAdvice) { + var word = lookUpWord(advice); + if (word.isEmpty()) { + logger.debug("Unknown search term: " + advice); + return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList()); + } + includes.add(word.getAsInt()); + } + for (var exclude : request.searchTermsExclude) { lookUpWord(exclude).ifPresent(excludes::add); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java index 0270d9a7..2784d495 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSubquery.java @@ -15,13 +15,15 @@ public class EdgeSearchSubquery { public final List searchTermsInclude; public final List searchTermsExclude; + public final List searchTermsAdvice; public final IndexBlock block; private double value = 0; - public EdgeSearchSubquery(List searchTermsInclude, List searchTermsExclude, IndexBlock block) { + public EdgeSearchSubquery(List searchTermsInclude, List searchTermsExclude, List searchTermsAdvice, IndexBlock block) { this.searchTermsInclude = searchTermsInclude; this.searchTermsExclude = searchTermsExclude; + this.searchTermsAdvice = searchTermsAdvice; this.block = block; } @@ -29,6 +31,7 @@ public class EdgeSearchSubquery { return new EdgeSearchSubquery( new CopyOnWriteArrayList<>(searchTermsInclude), new CopyOnWriteArrayList<>(searchTermsExclude), + new CopyOnWriteArrayList<>(searchTermsAdvice), block).setValue(value); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index f4c32d13..4bb640fb 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -125,12 +125,16 @@ public class QueryFactory { for (var parts : queryPermutations) { List searchTermsExclude = new ArrayList<>(); List searchTermsInclude = new ArrayList<>(); + List searchTermsAdvice = new ArrayList<>(); for (Token t : parts) { switch (t.type) { case EXCLUDE_TERM: searchTermsExclude.add(t.str); break; + case ADVICE_TERM: + searchTermsAdvice.add(t.str); + break; case LITERAL_TERM: // fallthrough; case QUOT_TERM: searchTermsInclude.add(t.str); @@ -144,7 +148,7 @@ public class QueryFactory { } } - EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.Title); + EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, IndexBlock.Title); params.profile().addTacitTerms(subquery); params.jsSetting().addTacitTerms(subquery); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java index 98057fd7..749c7e47 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java @@ -28,12 +28,14 @@ public class QueryParser { } public List parse(String query) { - List tokens = extractBasicTokens(query); + List basicTokens = extractBasicTokens(query); + List parsedTokens = new ArrayList<>(basicTokens.size()); + + for (int i = 0; i < basicTokens.size(); i++) { + var t = basicTokens.get(i); - for (int i = 0; i < tokens.size(); i++) { - var t = tokens.get(i); if (t.type == TokenType.QUOT) { - tokens.set(i, new Token(TokenType.QUOT_TERM, + parsedTokens.add(new Token(TokenType.QUOT_TERM, t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), t.displayStr)); } @@ -41,26 +43,54 @@ public class QueryParser { && (t.str.endsWith(":")||t.str.endsWith(".")) && t.str.length() > 1) { - tokens.set(i, - new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1), - t.displayStr)); + parsedTokens.add(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1), t.displayStr)); } } - for (int i = 0; i < tokens.size() - 1; i++) { - var t = tokens.get(i); - var tn = tokens.get(i+1); + for (int i = 0; i < basicTokens.size() - 1; i++) { + var t = basicTokens.get(i); + var tn = basicTokens.get(i+1); - if (t.type == TokenType.MINUS) { - tokens.set(i, new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str)); - tokens.remove(i+1); + if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) { + parsedTokens.add(new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str)); + i++; } } - return tokens; + for (int i = 0; i < basicTokens.size(); i++) { + var t = basicTokens.get(i); + + if (t.type == TokenType.LITERAL_TERM) { + parsedTokens.add(t); + continue; + } + else if (t.type != TokenType.LPAREN) { + continue; + } + + int end = i+1; + for (; end < basicTokens.size(); end++) { + if (basicTokens.get(end).type == TokenType.RPAREN) { + break; + } + } + if (end == basicTokens.size()) { + continue; + } + + for (int j = i+1; j < end; j++) { + var tok = basicTokens.get(j); + if (tok.type == TokenType.LITERAL_TERM) { + parsedTokens.add(new Token(TokenType.ADVICE_TERM, tok.str, "(" + tok.str + ")")); + } + } + i = end; + } + + return parsedTokens; } - private static final Pattern noisePattern = Pattern.compile("[(),]"); + private static final Pattern noisePattern = Pattern.compile("[,]"); public List extractBasicTokens(String rawQuery) { List tokens = new ArrayList<>(); @@ -69,7 +99,14 @@ public class QueryParser { for (int i = 0; i < query.length(); i++) { int chr = query.charAt(i); - if ('"' == chr) { + + if ('(' == chr) { + tokens.add(new Token(TokenType.LPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1))); + } + else if (')' == chr) { + tokens.add(new Token(TokenType.RPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1))); + } + else if ('"' == chr) { int end = query.indexOf('"', i+1); if (end == -1) { end = query.length(); @@ -96,14 +133,16 @@ public class QueryParser { // } else { - int end = query.indexOf(' ', i); - if (end == -1) { - end = query.length(); + + int end = i+1; + for (; end < query.length(); end++) { + if (query.charAt(end) == ' ' || query.charAt(end) == ')') + break; } tokens.add(new Token(TokenType.LITERAL_TERM, query.substring(i, end).toLowerCase(), query.substring(i, end))); - i = end; + i = end-1; } } return tokens; @@ -431,9 +470,15 @@ class Token { enum TokenType { TERM, - QUOT, - MINUS, + + LITERAL_TERM, QUOT_TERM, EXCLUDE_TERM, + ADVICE_TERM, + + QUOT, + MINUS, + LPAREN, + RPAREN } \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java index c9fd8092..29983675 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java @@ -33,7 +33,7 @@ public class EdgeSearchQueryIndexService { public List performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) { List sqs = new ArrayList<>(); - sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); + sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), Collections.emptyList(), block)); EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, limitPerDomain, limitTotal, "", 150, 2048); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java index 5d2a2f83..7defb7d7 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryParserTest.java @@ -26,6 +26,12 @@ class QueryParserTest { parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary)); } + @Test + public void testAdviceString() { + System.out.println(parser.parse("alcibiades (plato) \"my query\" -cars")); + System.out.println(parser.parse("universals plato")); + } + @Test void variantQueries() { var r = parser.parse("car stemming");