mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Merge pull request 'Advisory search terms' (#115) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/115
This commit is contained in:
commit
ba6519ef1d
@ -168,6 +168,7 @@ public class EdgeIndexQueryService {
|
||||
results.addAll(performSearch(sq));
|
||||
}
|
||||
|
||||
|
||||
for (var result : results) {
|
||||
addResultScores(result);
|
||||
}
|
||||
@ -207,15 +208,17 @@ public class EdgeIndexQueryService {
|
||||
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
|
||||
|
||||
if (!budget.hasTimeLeft()) {
|
||||
logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
|
||||
logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
|
||||
continue;
|
||||
|
||||
}
|
||||
|
||||
if (fetchSize <= results.size())
|
||||
if (results.size() >= fetchSize) {
|
||||
break;
|
||||
}
|
||||
|
||||
IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
|
||||
long[] buf = new long[8192];
|
||||
long[] buf = new long[fetchSize];
|
||||
|
||||
while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) {
|
||||
int cnt = query.getMoreResults(buf, budget);
|
||||
@ -326,6 +329,16 @@ public class EdgeIndexQueryService {
|
||||
includes.add(word.getAsInt());
|
||||
}
|
||||
|
||||
|
||||
for (var advice : request.searchTermsAdvice) {
|
||||
var word = lookUpWord(advice);
|
||||
if (word.isEmpty()) {
|
||||
logger.debug("Unknown search term: " + advice);
|
||||
return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
|
||||
}
|
||||
includes.add(word.getAsInt());
|
||||
}
|
||||
|
||||
for (var exclude : request.searchTermsExclude) {
|
||||
lookUpWord(exclude).ifPresent(excludes::add);
|
||||
}
|
||||
|
@ -15,13 +15,15 @@ public class EdgeSearchSubquery {
|
||||
|
||||
public final List<String> searchTermsInclude;
|
||||
public final List<String> searchTermsExclude;
|
||||
public final List<String> searchTermsAdvice;
|
||||
public final IndexBlock block;
|
||||
|
||||
private double value = 0;
|
||||
|
||||
/**
 * Creates a subquery from its three term lists and the index block to search.
 *
 * The lists are stored as given; no defensive copy is made here.
 *
 * @param searchTermsInclude terms collected from LITERAL_TERM and QUOT_TERM tokens
 * @param searchTermsExclude terms collected from EXCLUDE_TERM tokens
 * @param searchTermsAdvice  terms collected from ADVICE_TERM tokens, i.e. words the
 *                           user wrote inside parentheses
 * @param block              the index block this subquery targets
 */
public EdgeSearchSubquery(List<String> searchTermsInclude,
                          List<String> searchTermsExclude,
                          List<String> searchTermsAdvice,
                          IndexBlock block) {
    this.searchTermsInclude = searchTermsInclude;
    this.searchTermsExclude = searchTermsExclude;
    this.searchTermsAdvice = searchTermsAdvice;
    this.block = block;
}
|
||||
|
||||
@ -29,6 +31,7 @@ public class EdgeSearchSubquery {
|
||||
return new EdgeSearchSubquery(
|
||||
new CopyOnWriteArrayList<>(searchTermsInclude),
|
||||
new CopyOnWriteArrayList<>(searchTermsExclude),
|
||||
new CopyOnWriteArrayList<>(searchTermsAdvice),
|
||||
block).setValue(value);
|
||||
}
|
||||
|
||||
|
@ -125,12 +125,16 @@ public class QueryFactory {
|
||||
for (var parts : queryPermutations) {
|
||||
List<String> searchTermsExclude = new ArrayList<>();
|
||||
List<String> searchTermsInclude = new ArrayList<>();
|
||||
List<String> searchTermsAdvice = new ArrayList<>();
|
||||
|
||||
for (Token t : parts) {
|
||||
switch (t.type) {
|
||||
case EXCLUDE_TERM:
|
||||
searchTermsExclude.add(t.str);
|
||||
break;
|
||||
case ADVICE_TERM:
|
||||
searchTermsAdvice.add(t.str);
|
||||
break;
|
||||
case LITERAL_TERM: // fallthrough;
|
||||
case QUOT_TERM:
|
||||
searchTermsInclude.add(t.str);
|
||||
@ -144,7 +148,7 @@ public class QueryFactory {
|
||||
}
|
||||
}
|
||||
|
||||
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.Title);
|
||||
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, IndexBlock.Title);
|
||||
|
||||
params.profile().addTacitTerms(subquery);
|
||||
params.jsSetting().addTacitTerms(subquery);
|
||||
|
@ -28,12 +28,14 @@ public class QueryParser {
|
||||
}
|
||||
|
||||
public List<Token> parse(String query) {
|
||||
List<Token> tokens = extractBasicTokens(query);
|
||||
List<Token> basicTokens = extractBasicTokens(query);
|
||||
List<Token> parsedTokens = new ArrayList<>(basicTokens.size());
|
||||
|
||||
for (int i = 0; i < basicTokens.size(); i++) {
|
||||
var t = basicTokens.get(i);
|
||||
|
||||
for (int i = 0; i < tokens.size(); i++) {
|
||||
var t = tokens.get(i);
|
||||
if (t.type == TokenType.QUOT) {
|
||||
tokens.set(i, new Token(TokenType.QUOT_TERM,
|
||||
parsedTokens.add(new Token(TokenType.QUOT_TERM,
|
||||
t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER),
|
||||
t.displayStr));
|
||||
}
|
||||
@ -41,26 +43,54 @@ public class QueryParser {
|
||||
&& (t.str.endsWith(":")||t.str.endsWith("."))
|
||||
&& t.str.length() > 1)
|
||||
{
|
||||
tokens.set(i,
|
||||
new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1),
|
||||
t.displayStr));
|
||||
parsedTokens.add(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1), t.displayStr));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < tokens.size() - 1; i++) {
|
||||
var t = tokens.get(i);
|
||||
var tn = tokens.get(i+1);
|
||||
for (int i = 0; i < basicTokens.size() - 1; i++) {
|
||||
var t = basicTokens.get(i);
|
||||
var tn = basicTokens.get(i+1);
|
||||
|
||||
if (t.type == TokenType.MINUS) {
|
||||
tokens.set(i, new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str));
|
||||
tokens.remove(i+1);
|
||||
if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) {
|
||||
parsedTokens.add(new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str));
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
return tokens;
|
||||
for (int i = 0; i < basicTokens.size(); i++) {
|
||||
var t = basicTokens.get(i);
|
||||
|
||||
if (t.type == TokenType.LITERAL_TERM) {
|
||||
parsedTokens.add(t);
|
||||
continue;
|
||||
}
|
||||
else if (t.type != TokenType.LPAREN) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int end = i+1;
|
||||
for (; end < basicTokens.size(); end++) {
|
||||
if (basicTokens.get(end).type == TokenType.RPAREN) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (end == basicTokens.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int j = i+1; j < end; j++) {
|
||||
var tok = basicTokens.get(j);
|
||||
if (tok.type == TokenType.LITERAL_TERM) {
|
||||
parsedTokens.add(new Token(TokenType.ADVICE_TERM, tok.str, "(" + tok.str + ")"));
|
||||
}
|
||||
}
|
||||
i = end;
|
||||
}
|
||||
|
||||
return parsedTokens;
|
||||
}
|
||||
|
||||
private static final Pattern noisePattern = Pattern.compile("[(),]");
|
||||
private static final Pattern noisePattern = Pattern.compile("[,]");
|
||||
|
||||
public List<Token> extractBasicTokens(String rawQuery) {
|
||||
List<Token> tokens = new ArrayList<>();
|
||||
@ -69,7 +99,14 @@ public class QueryParser {
|
||||
|
||||
for (int i = 0; i < query.length(); i++) {
|
||||
int chr = query.charAt(i);
|
||||
if ('"' == chr) {
|
||||
|
||||
if ('(' == chr) {
|
||||
tokens.add(new Token(TokenType.LPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1)));
|
||||
}
|
||||
else if (')' == chr) {
|
||||
tokens.add(new Token(TokenType.RPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1)));
|
||||
}
|
||||
else if ('"' == chr) {
|
||||
int end = query.indexOf('"', i+1);
|
||||
if (end == -1) {
|
||||
end = query.length();
|
||||
@ -96,14 +133,16 @@ public class QueryParser {
|
||||
//
|
||||
}
|
||||
else {
|
||||
int end = query.indexOf(' ', i);
|
||||
if (end == -1) {
|
||||
end = query.length();
|
||||
|
||||
int end = i+1;
|
||||
for (; end < query.length(); end++) {
|
||||
if (query.charAt(end) == ' ' || query.charAt(end) == ')')
|
||||
break;
|
||||
}
|
||||
tokens.add(new Token(TokenType.LITERAL_TERM,
|
||||
query.substring(i, end).toLowerCase(),
|
||||
query.substring(i, end)));
|
||||
i = end;
|
||||
i = end-1;
|
||||
}
|
||||
}
|
||||
return tokens;
|
||||
@ -431,9 +470,15 @@ class Token {
|
||||
|
||||
/**
 * Kinds of token used by the query parser.
 *
 * The first group holds the term types that appear in a parsed query. The
 * second group holds raw token types that the parser rewrites into term types.
 */
enum TokenType {
    // NOTE(review): nothing visible here produces or consumes TERM — confirm it is still needed.
    TERM,

    /** A plain search word, delimited by a space or a closing parenthesis. */
    LITERAL_TERM,
    /** A quoted phrase whose whitespace has been replaced by the word-token joiner. */
    QUOT_TERM,
    /** A word to exclude, built from a MINUS immediately followed by a LITERAL_TERM. */
    EXCLUDE_TERM,
    /** A word written inside parentheses. */
    ADVICE_TERM,

    /** Raw quoted phrase; the parser rewrites it into a QUOT_TERM. */
    QUOT,
    /** Raw minus marker; combined with the following LITERAL_TERM into an EXCLUDE_TERM. */
    MINUS,
    /** An opening parenthesis. */
    LPAREN,
    /** A closing parenthesis. */
    RPAREN
}
|
@ -33,7 +33,7 @@ public class EdgeSearchQueryIndexService {
|
||||
public List<EdgeUrlDetails> performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
|
||||
List<EdgeSearchSubquery> sqs = new ArrayList<>();
|
||||
|
||||
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
|
||||
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), Collections.emptyList(), block));
|
||||
|
||||
EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, limitPerDomain, limitTotal, "", 150, 2048);
|
||||
|
||||
|
@ -26,6 +26,12 @@ class QueryParserTest {
|
||||
parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAdviceString() {
|
||||
System.out.println(parser.parse("alcibiades (plato) \"my query\" -cars"));
|
||||
System.out.println(parser.parse("universals plato"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void variantQueries() {
|
||||
var r = parser.parse("car stemming");
|
||||
|
Loading…
Reference in New Issue
Block a user