mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Merge pull request 'Advisory search terms' (#115) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/115
This commit is contained in:
commit
ba6519ef1d
@ -168,6 +168,7 @@ public class EdgeIndexQueryService {
|
|||||||
results.addAll(performSearch(sq));
|
results.addAll(performSearch(sq));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
for (var result : results) {
|
for (var result : results) {
|
||||||
addResultScores(result);
|
addResultScores(result);
|
||||||
}
|
}
|
||||||
@ -207,15 +208,17 @@ public class EdgeIndexQueryService {
|
|||||||
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
|
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
|
||||||
|
|
||||||
if (!budget.hasTimeLeft()) {
|
if (!budget.hasTimeLeft()) {
|
||||||
logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
|
logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fetchSize <= results.size())
|
if (results.size() >= fetchSize) {
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
|
IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
|
||||||
long[] buf = new long[8192];
|
long[] buf = new long[fetchSize];
|
||||||
|
|
||||||
while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) {
|
while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) {
|
||||||
int cnt = query.getMoreResults(buf, budget);
|
int cnt = query.getMoreResults(buf, budget);
|
||||||
@ -326,6 +329,16 @@ public class EdgeIndexQueryService {
|
|||||||
includes.add(word.getAsInt());
|
includes.add(word.getAsInt());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for (var advice : request.searchTermsAdvice) {
|
||||||
|
var word = lookUpWord(advice);
|
||||||
|
if (word.isEmpty()) {
|
||||||
|
logger.debug("Unknown search term: " + advice);
|
||||||
|
return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
|
||||||
|
}
|
||||||
|
includes.add(word.getAsInt());
|
||||||
|
}
|
||||||
|
|
||||||
for (var exclude : request.searchTermsExclude) {
|
for (var exclude : request.searchTermsExclude) {
|
||||||
lookUpWord(exclude).ifPresent(excludes::add);
|
lookUpWord(exclude).ifPresent(excludes::add);
|
||||||
}
|
}
|
||||||
|
@ -15,13 +15,15 @@ public class EdgeSearchSubquery {
|
|||||||
|
|
||||||
public final List<String> searchTermsInclude;
|
public final List<String> searchTermsInclude;
|
||||||
public final List<String> searchTermsExclude;
|
public final List<String> searchTermsExclude;
|
||||||
|
public final List<String> searchTermsAdvice;
|
||||||
public final IndexBlock block;
|
public final IndexBlock block;
|
||||||
|
|
||||||
private double value = 0;
|
private double value = 0;
|
||||||
|
|
||||||
public EdgeSearchSubquery(List<String> searchTermsInclude, List<String> searchTermsExclude, IndexBlock block) {
|
public EdgeSearchSubquery(List<String> searchTermsInclude, List<String> searchTermsExclude, List<String> searchTermsAdvice, IndexBlock block) {
|
||||||
this.searchTermsInclude = searchTermsInclude;
|
this.searchTermsInclude = searchTermsInclude;
|
||||||
this.searchTermsExclude = searchTermsExclude;
|
this.searchTermsExclude = searchTermsExclude;
|
||||||
|
this.searchTermsAdvice = searchTermsAdvice;
|
||||||
this.block = block;
|
this.block = block;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -29,6 +31,7 @@ public class EdgeSearchSubquery {
|
|||||||
return new EdgeSearchSubquery(
|
return new EdgeSearchSubquery(
|
||||||
new CopyOnWriteArrayList<>(searchTermsInclude),
|
new CopyOnWriteArrayList<>(searchTermsInclude),
|
||||||
new CopyOnWriteArrayList<>(searchTermsExclude),
|
new CopyOnWriteArrayList<>(searchTermsExclude),
|
||||||
|
new CopyOnWriteArrayList<>(searchTermsAdvice),
|
||||||
block).setValue(value);
|
block).setValue(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -125,12 +125,16 @@ public class QueryFactory {
|
|||||||
for (var parts : queryPermutations) {
|
for (var parts : queryPermutations) {
|
||||||
List<String> searchTermsExclude = new ArrayList<>();
|
List<String> searchTermsExclude = new ArrayList<>();
|
||||||
List<String> searchTermsInclude = new ArrayList<>();
|
List<String> searchTermsInclude = new ArrayList<>();
|
||||||
|
List<String> searchTermsAdvice = new ArrayList<>();
|
||||||
|
|
||||||
for (Token t : parts) {
|
for (Token t : parts) {
|
||||||
switch (t.type) {
|
switch (t.type) {
|
||||||
case EXCLUDE_TERM:
|
case EXCLUDE_TERM:
|
||||||
searchTermsExclude.add(t.str);
|
searchTermsExclude.add(t.str);
|
||||||
break;
|
break;
|
||||||
|
case ADVICE_TERM:
|
||||||
|
searchTermsAdvice.add(t.str);
|
||||||
|
break;
|
||||||
case LITERAL_TERM: // fallthrough;
|
case LITERAL_TERM: // fallthrough;
|
||||||
case QUOT_TERM:
|
case QUOT_TERM:
|
||||||
searchTermsInclude.add(t.str);
|
searchTermsInclude.add(t.str);
|
||||||
@ -144,7 +148,7 @@ public class QueryFactory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, IndexBlock.Title);
|
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, IndexBlock.Title);
|
||||||
|
|
||||||
params.profile().addTacitTerms(subquery);
|
params.profile().addTacitTerms(subquery);
|
||||||
params.jsSetting().addTacitTerms(subquery);
|
params.jsSetting().addTacitTerms(subquery);
|
||||||
|
@ -28,12 +28,14 @@ public class QueryParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public List<Token> parse(String query) {
|
public List<Token> parse(String query) {
|
||||||
List<Token> tokens = extractBasicTokens(query);
|
List<Token> basicTokens = extractBasicTokens(query);
|
||||||
|
List<Token> parsedTokens = new ArrayList<>(basicTokens.size());
|
||||||
|
|
||||||
|
for (int i = 0; i < basicTokens.size(); i++) {
|
||||||
|
var t = basicTokens.get(i);
|
||||||
|
|
||||||
for (int i = 0; i < tokens.size(); i++) {
|
|
||||||
var t = tokens.get(i);
|
|
||||||
if (t.type == TokenType.QUOT) {
|
if (t.type == TokenType.QUOT) {
|
||||||
tokens.set(i, new Token(TokenType.QUOT_TERM,
|
parsedTokens.add(new Token(TokenType.QUOT_TERM,
|
||||||
t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER),
|
t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER),
|
||||||
t.displayStr));
|
t.displayStr));
|
||||||
}
|
}
|
||||||
@ -41,26 +43,54 @@ public class QueryParser {
|
|||||||
&& (t.str.endsWith(":")||t.str.endsWith("."))
|
&& (t.str.endsWith(":")||t.str.endsWith("."))
|
||||||
&& t.str.length() > 1)
|
&& t.str.length() > 1)
|
||||||
{
|
{
|
||||||
tokens.set(i,
|
parsedTokens.add(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1), t.displayStr));
|
||||||
new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length()-1),
|
|
||||||
t.displayStr));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < tokens.size() - 1; i++) {
|
for (int i = 0; i < basicTokens.size() - 1; i++) {
|
||||||
var t = tokens.get(i);
|
var t = basicTokens.get(i);
|
||||||
var tn = tokens.get(i+1);
|
var tn = basicTokens.get(i+1);
|
||||||
|
|
||||||
if (t.type == TokenType.MINUS) {
|
if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) {
|
||||||
tokens.set(i, new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str));
|
parsedTokens.add(new Token(TokenType.EXCLUDE_TERM, tn.str, "-"+tn.str));
|
||||||
tokens.remove(i+1);
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return tokens;
|
for (int i = 0; i < basicTokens.size(); i++) {
|
||||||
|
var t = basicTokens.get(i);
|
||||||
|
|
||||||
|
if (t.type == TokenType.LITERAL_TERM) {
|
||||||
|
parsedTokens.add(t);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if (t.type != TokenType.LPAREN) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int end = i+1;
|
||||||
|
for (; end < basicTokens.size(); end++) {
|
||||||
|
if (basicTokens.get(end).type == TokenType.RPAREN) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (end == basicTokens.size()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = i+1; j < end; j++) {
|
||||||
|
var tok = basicTokens.get(j);
|
||||||
|
if (tok.type == TokenType.LITERAL_TERM) {
|
||||||
|
parsedTokens.add(new Token(TokenType.ADVICE_TERM, tok.str, "(" + tok.str + ")"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
return parsedTokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern noisePattern = Pattern.compile("[(),]");
|
private static final Pattern noisePattern = Pattern.compile("[,]");
|
||||||
|
|
||||||
public List<Token> extractBasicTokens(String rawQuery) {
|
public List<Token> extractBasicTokens(String rawQuery) {
|
||||||
List<Token> tokens = new ArrayList<>();
|
List<Token> tokens = new ArrayList<>();
|
||||||
@ -69,7 +99,14 @@ public class QueryParser {
|
|||||||
|
|
||||||
for (int i = 0; i < query.length(); i++) {
|
for (int i = 0; i < query.length(); i++) {
|
||||||
int chr = query.charAt(i);
|
int chr = query.charAt(i);
|
||||||
if ('"' == chr) {
|
|
||||||
|
if ('(' == chr) {
|
||||||
|
tokens.add(new Token(TokenType.LPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1)));
|
||||||
|
}
|
||||||
|
else if (')' == chr) {
|
||||||
|
tokens.add(new Token(TokenType.RPAREN, query.substring(i, i+1).toLowerCase(), query.substring(i, i+1)));
|
||||||
|
}
|
||||||
|
else if ('"' == chr) {
|
||||||
int end = query.indexOf('"', i+1);
|
int end = query.indexOf('"', i+1);
|
||||||
if (end == -1) {
|
if (end == -1) {
|
||||||
end = query.length();
|
end = query.length();
|
||||||
@ -96,14 +133,16 @@ public class QueryParser {
|
|||||||
//
|
//
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int end = query.indexOf(' ', i);
|
|
||||||
if (end == -1) {
|
int end = i+1;
|
||||||
end = query.length();
|
for (; end < query.length(); end++) {
|
||||||
|
if (query.charAt(end) == ' ' || query.charAt(end) == ')')
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
tokens.add(new Token(TokenType.LITERAL_TERM,
|
tokens.add(new Token(TokenType.LITERAL_TERM,
|
||||||
query.substring(i, end).toLowerCase(),
|
query.substring(i, end).toLowerCase(),
|
||||||
query.substring(i, end)));
|
query.substring(i, end)));
|
||||||
i = end;
|
i = end-1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return tokens;
|
return tokens;
|
||||||
@ -431,9 +470,15 @@ class Token {
|
|||||||
|
|
||||||
enum TokenType {
|
enum TokenType {
|
||||||
TERM,
|
TERM,
|
||||||
QUOT,
|
|
||||||
MINUS,
|
|
||||||
LITERAL_TERM,
|
LITERAL_TERM,
|
||||||
QUOT_TERM,
|
QUOT_TERM,
|
||||||
EXCLUDE_TERM,
|
EXCLUDE_TERM,
|
||||||
|
ADVICE_TERM,
|
||||||
|
|
||||||
|
QUOT,
|
||||||
|
MINUS,
|
||||||
|
LPAREN,
|
||||||
|
RPAREN
|
||||||
}
|
}
|
@ -33,7 +33,7 @@ public class EdgeSearchQueryIndexService {
|
|||||||
public List<EdgeUrlDetails> performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
|
public List<EdgeUrlDetails> performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
|
||||||
List<EdgeSearchSubquery> sqs = new ArrayList<>();
|
List<EdgeSearchSubquery> sqs = new ArrayList<>();
|
||||||
|
|
||||||
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
|
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), Collections.emptyList(), block));
|
||||||
|
|
||||||
EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, limitPerDomain, limitTotal, "", 150, 2048);
|
EdgeSearchSpecification specs = new EdgeSearchSpecification(profile.buckets, sqs, limitPerDomain, limitTotal, "", 150, 2048);
|
||||||
|
|
||||||
|
@ -26,6 +26,12 @@ class QueryParserTest {
|
|||||||
parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
|
parser = new QueryParser(englishDictionary, new QueryVariants(lm, dict, nGramBloomFilter, englishDictionary));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testAdviceString() {
|
||||||
|
System.out.println(parser.parse("alcibiades (plato) \"my query\" -cars"));
|
||||||
|
System.out.println(parser.parse("universals plato"));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void variantQueries() {
|
void variantQueries() {
|
||||||
var r = parser.parse("car stemming");
|
var r = parser.parse("car stemming");
|
||||||
|
Loading…
Reference in New Issue
Block a user