mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
WIP
This commit is contained in:
parent
0bd3365c24
commit
a4b810f511
@ -0,0 +1,7 @@
|
|||||||
|
package nu.marginalia.functions.searchquery.query_parser;
|
||||||
|
|
||||||
|
import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph;
|
||||||
|
|
||||||
|
public interface ExpansionStrategy {
|
||||||
|
void expand(QWordGraph graph);
|
||||||
|
}
|
@ -1,9 +1,9 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant;
|
package nu.marginalia.functions.searchquery.query_parser;
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
|
import nu.marginalia.functions.searchquery.query_parser.model.QWord;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
|
import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph;
|
||||||
import nu.marginalia.segmentation.NgramLexicon;
|
import nu.marginalia.segmentation.NgramLexicon;
|
||||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
@ -40,7 +40,7 @@ public class QueryExpansion {
|
|||||||
strategy.expand(graph);
|
strategy.expand(graph);
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return graph;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern dashPattern = Pattern.compile("-");
|
private static final Pattern dashPattern = Pattern.compile("-");
|
@ -1,229 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser;
|
|
||||||
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
|
||||||
import nu.marginalia.language.WordPatterns;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.function.Predicate;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import static java.util.stream.Stream.concat;
|
|
||||||
|
|
||||||
public class QueryPermutation {
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
private final QueryVariants queryVariants;
|
|
||||||
|
|
||||||
public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?");
|
|
||||||
public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?");
|
|
||||||
|
|
||||||
public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();
|
|
||||||
|
|
||||||
public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
|
|
||||||
public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
|
|
||||||
|
|
||||||
public QueryPermutation(QueryVariants queryVariants) {
|
|
||||||
this.queryVariants = queryVariants;
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<List<Token>> permuteQueries(List<Token> items) {
|
|
||||||
int start = -1;
|
|
||||||
int end = items.size();
|
|
||||||
|
|
||||||
for (int i = 0; i < items.size(); i++) {
|
|
||||||
var token = items.get(i);
|
|
||||||
|
|
||||||
if (start < 0) {
|
|
||||||
if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) {
|
|
||||||
start = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) {
|
|
||||||
end = i;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (start >= 0 && end - start > 1) {
|
|
||||||
List<List<Token>> permuteParts = combineSearchTerms(items.subList(start, end));
|
|
||||||
int s = start;
|
|
||||||
int e = end;
|
|
||||||
return permuteParts.stream().map(part ->
|
|
||||||
concat(items.subList(0, s).stream(), concat(part.stream(), items.subList(e, items.size()).stream()))
|
|
||||||
.collect(Collectors.toList()))
|
|
||||||
.peek(lst -> lst.removeIf(this::isJunkWord))
|
|
||||||
.limit(24)
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return List.of(items);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public List<List<Token>> permuteQueriesNew(List<Token> items) {
|
|
||||||
int start = -1;
|
|
||||||
int end = items.size();
|
|
||||||
|
|
||||||
for (int i = 0; i < items.size(); i++) {
|
|
||||||
var token = items.get(i);
|
|
||||||
|
|
||||||
if (start < 0) {
|
|
||||||
if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) {
|
|
||||||
start = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) {
|
|
||||||
end = i;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (start >= 0 && end - start >= 1) {
|
|
||||||
var result = queryVariants.getQueryVariants(items.subList(start, end));
|
|
||||||
|
|
||||||
logger.debug("{}", result);
|
|
||||||
|
|
||||||
if (result.isEmpty()) {
|
|
||||||
logger.warn("Empty variants result, falling back on old code");
|
|
||||||
return permuteQueries(items);
|
|
||||||
}
|
|
||||||
|
|
||||||
List<List<Token>> queryVariants = new ArrayList<>();
|
|
||||||
for (var query : result.faithful) {
|
|
||||||
var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList());
|
|
||||||
tokens.addAll(result.nonLiterals);
|
|
||||||
|
|
||||||
queryVariants.add(tokens);
|
|
||||||
}
|
|
||||||
for (var query : result.alternative) {
|
|
||||||
if (queryVariants.size() >= 6)
|
|
||||||
break;
|
|
||||||
|
|
||||||
var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList());
|
|
||||||
tokens.addAll(result.nonLiterals);
|
|
||||||
|
|
||||||
queryVariants.add(tokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
List<List<Token>> returnValue = new ArrayList<>(queryVariants.size());
|
|
||||||
for (var variant: queryVariants) {
|
|
||||||
List<Token> r = new ArrayList<>(start + variant.size() + (items.size() - end));
|
|
||||||
r.addAll(items.subList(0, start));
|
|
||||||
r.addAll(variant);
|
|
||||||
r.addAll(items.subList(end, items.size()));
|
|
||||||
returnValue.add(r);
|
|
||||||
}
|
|
||||||
|
|
||||||
return returnValue;
|
|
||||||
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return List.of(items);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isJunkWord(Token token) {
|
|
||||||
if (WordPatterns.isStopWord(token.str) &&
|
|
||||||
!token.str.matches("^(\\d+|([a-z]+:.*))$")) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return switch (token.str) {
|
|
||||||
case "vs", "versus", "or", "and" -> true;
|
|
||||||
default -> false;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<List<Token>> combineSearchTerms(List<Token> subList) {
|
|
||||||
int size = subList.size();
|
|
||||||
if (size < 1) {
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
else if (size == 1) {
|
|
||||||
if (WordPatterns.isStopWord(subList.get(0).str)) {
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
return List.of(subList);
|
|
||||||
}
|
|
||||||
|
|
||||||
List<List<Token>> results = new ArrayList<>(size*(size+1)/2);
|
|
||||||
|
|
||||||
if (subList.size() <= 4 && subList.get(0).str.length() >= 2 && !isPrefixWord(subList.get(subList.size()-1).str)) {
|
|
||||||
results.add(List.of(joinTokens(subList)));
|
|
||||||
}
|
|
||||||
outer: for (int i = size - 1; i >= 1; i--) {
|
|
||||||
|
|
||||||
var left = combineSearchTerms(subList.subList(0, i));
|
|
||||||
var right = combineSearchTerms(subList.subList(i, size));
|
|
||||||
|
|
||||||
for (var l : left) {
|
|
||||||
if (results.size() > 48) {
|
|
||||||
break outer;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (var r : right) {
|
|
||||||
if (results.size() > 48) {
|
|
||||||
break outer;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<Token> combined = new ArrayList<>(l.size() + r.size());
|
|
||||||
combined.addAll(l);
|
|
||||||
combined.addAll(r);
|
|
||||||
if (!results.contains(combined)) {
|
|
||||||
results.add(combined);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!results.contains(subList)) {
|
|
||||||
results.add(subList);
|
|
||||||
}
|
|
||||||
Comparator<List<Token>> tc = (o1, o2) -> {
|
|
||||||
int dJoininess = o2.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum() -
|
|
||||||
o1.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum();
|
|
||||||
if (dJoininess == 0) {
|
|
||||||
return (o2.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum() -
|
|
||||||
o1.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum());
|
|
||||||
}
|
|
||||||
return (int) Math.signum(dJoininess);
|
|
||||||
};
|
|
||||||
results.sort(tc);
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isPrefixWord(String str) {
|
|
||||||
return switch (str) {
|
|
||||||
case "the", "of", "when" -> true;
|
|
||||||
default -> false;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
int joininess(String s) {
|
|
||||||
return (int) s.chars().filter(c -> c == '_').count();
|
|
||||||
}
|
|
||||||
int rightiness(String s) {
|
|
||||||
int rightiness = 0;
|
|
||||||
for (int i = 0; i < s.length(); i++) {
|
|
||||||
if (s.charAt(i) == '_') {
|
|
||||||
rightiness+=i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rightiness;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Token joinTokens(List<Token> subList) {
|
|
||||||
return new Token(TokenType.LITERAL_TERM,
|
|
||||||
subList.stream().map(t -> t.str).collect(Collectors.joining("_")),
|
|
||||||
subList.stream().map(t -> t.str).collect(Collectors.joining(" ")));
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,207 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser;
|
|
||||||
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariant;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariantSet;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
|
|
||||||
import nu.marginalia.util.language.EnglishDictionary;
|
|
||||||
import nu.marginalia.LanguageModels;
|
|
||||||
import nu.marginalia.keyword.KeywordExtractor;
|
|
||||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
|
||||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
|
||||||
import nu.marginalia.language.model.DocumentSentence;
|
|
||||||
import nu.marginalia.language.model.WordSpan;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
public class QueryVariants {
|
|
||||||
private final KeywordExtractor keywordExtractor;
|
|
||||||
private final TermFrequencyDict dict;
|
|
||||||
|
|
||||||
private final EnglishDictionary englishDictionary;
|
|
||||||
private final ThreadLocal<SentenceExtractor> sentenceExtractor;
|
|
||||||
|
|
||||||
public QueryVariants(LanguageModels lm,
|
|
||||||
TermFrequencyDict dict,
|
|
||||||
EnglishDictionary englishDictionary) {
|
|
||||||
this.englishDictionary = englishDictionary;
|
|
||||||
this.keywordExtractor = new KeywordExtractor();
|
|
||||||
this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm));
|
|
||||||
this.dict = dict;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public QueryVariantSet getQueryVariants(List<Token> query) {
|
|
||||||
final JoinedQueryAndNonLiteralTokens joinedQuery = joinQuery(query);
|
|
||||||
|
|
||||||
final TreeMap<Integer, List<WordSpan>> byStart = new TreeMap<>();
|
|
||||||
|
|
||||||
var se = sentenceExtractor.get();
|
|
||||||
var sentence = se.extractSentence(joinedQuery.joinedQuery);
|
|
||||||
|
|
||||||
for (int i = 0; i < sentence.posTags.length; i++) {
|
|
||||||
if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) {
|
|
||||||
sentence.posTags[i] = "NNP";
|
|
||||||
}
|
|
||||||
else if ("JJ".equals(sentence.posTags[i]) || "CD".equals(sentence.posTags[i]) || sentence.posTags[i].startsWith("P")) {
|
|
||||||
sentence.posTags[i] = "NNP";
|
|
||||||
sentence.setIsStopWord(i, false);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (var kw : keywordExtractor.getKeywordsFromSentence(sentence)) {
|
|
||||||
byStart.computeIfAbsent(kw.start, k -> new ArrayList<>()).add(kw);
|
|
||||||
}
|
|
||||||
|
|
||||||
final List<ArrayList<WordSpan>> livingSpans = new ArrayList<>();
|
|
||||||
|
|
||||||
var first = byStart.firstEntry();
|
|
||||||
if (first == null) {
|
|
||||||
var span = new WordSpan(0, sentence.length());
|
|
||||||
byStart.put(0, List.of(span));
|
|
||||||
}
|
|
||||||
else if (first.getKey() > 0) {
|
|
||||||
List<WordSpan> elongatedFirstWords = new ArrayList<>(first.getValue().size());
|
|
||||||
|
|
||||||
first.getValue().forEach(span -> {
|
|
||||||
elongatedFirstWords.add(new WordSpan(0, span.start));
|
|
||||||
elongatedFirstWords.add(new WordSpan(0, span.end));
|
|
||||||
});
|
|
||||||
|
|
||||||
byStart.put(0, elongatedFirstWords);
|
|
||||||
}
|
|
||||||
|
|
||||||
final List<List<QueryWord>> goodSpans = getWordSpans(byStart, sentence, livingSpans);
|
|
||||||
|
|
||||||
List<List<String>> faithfulQueries = new ArrayList<>();
|
|
||||||
List<List<String>> alternativeQueries = new ArrayList<>();
|
|
||||||
|
|
||||||
for (var ls : goodSpans) {
|
|
||||||
var last = ls.get(ls.size() - 1);
|
|
||||||
|
|
||||||
if (!last.wordOriginal.isBlank() && !Character.isUpperCase(last.wordOriginal.charAt(0))) {
|
|
||||||
var altLast = englishDictionary.getWordVariants(last.word);
|
|
||||||
for (String s : altLast) {
|
|
||||||
List<String> newList = new ArrayList<>(ls.size());
|
|
||||||
for (int i = 0; i < ls.size() - 1; i++) {
|
|
||||||
newList.add(ls.get(i).word);
|
|
||||||
}
|
|
||||||
newList.add(s);
|
|
||||||
alternativeQueries.add(newList);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
QueryVariantSet returnValue = new QueryVariantSet();
|
|
||||||
|
|
||||||
returnValue.faithful.addAll(evaluateQueries(faithfulQueries));
|
|
||||||
returnValue.alternative.addAll(evaluateQueries(alternativeQueries));
|
|
||||||
|
|
||||||
returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue));
|
|
||||||
returnValue.alternative.sort(Comparator.comparing(QueryVariant::getValue));
|
|
||||||
|
|
||||||
returnValue.nonLiterals.addAll(joinedQuery.nonLiterals);
|
|
||||||
|
|
||||||
return returnValue;
|
|
||||||
}
|
|
||||||
|
|
||||||
final Pattern underscore = Pattern.compile("_");
|
|
||||||
|
|
||||||
private List<QueryVariant> evaluateQueries(List<List<String>> queryStrings) {
|
|
||||||
Set<QueryVariant> variantsSet = new HashSet<>();
|
|
||||||
List<QueryVariant> ret = new ArrayList<>();
|
|
||||||
for (var lst : queryStrings) {
|
|
||||||
double q = 0;
|
|
||||||
for (var word : lst) {
|
|
||||||
String[] parts = underscore.split(word);
|
|
||||||
double qp = 0;
|
|
||||||
for (String part : parts) {
|
|
||||||
qp += 1./(1+ dict.getTermFreq(part));
|
|
||||||
}
|
|
||||||
q += 1.0 / qp;
|
|
||||||
}
|
|
||||||
var qv = new QueryVariant(lst, q);
|
|
||||||
if (variantsSet.add(qv)) {
|
|
||||||
ret.add(qv);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<List<QueryWord>> getWordSpans(TreeMap<Integer, List<WordSpan>> byStart, DocumentSentence sentence, List<ArrayList<WordSpan>> livingSpans) {
|
|
||||||
List<List<QueryWord>> goodSpans = new ArrayList<>();
|
|
||||||
for (int i = 0; i < 1; i++) {
|
|
||||||
var spans = byStart.get(i);
|
|
||||||
|
|
||||||
|
|
||||||
if (spans == null )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
for (var span : spans) {
|
|
||||||
ArrayList<WordSpan> fragment = new ArrayList<>();
|
|
||||||
fragment.add(span);
|
|
||||||
livingSpans.add(fragment);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) break;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
while (!livingSpans.isEmpty()) {
|
|
||||||
|
|
||||||
final List<ArrayList<WordSpan>> newLivingSpans = new ArrayList<>(livingSpans.size());
|
|
||||||
|
|
||||||
for (var span : livingSpans) {
|
|
||||||
int end = span.get(span.size()-1).end;
|
|
||||||
|
|
||||||
if (end == sentence.length()) {
|
|
||||||
var gs = new ArrayList<QueryWord>(span.size());
|
|
||||||
for (var s : span) {
|
|
||||||
gs.add(new QueryWord(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s),
|
|
||||||
s.size() == 1 ? sentence.words[s.start] : ""));
|
|
||||||
}
|
|
||||||
goodSpans.add(gs);
|
|
||||||
}
|
|
||||||
var nextWordsKey = byStart.ceilingKey(end);
|
|
||||||
|
|
||||||
if (null == nextWordsKey)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
for (var next : byStart.get(nextWordsKey)) {
|
|
||||||
var newSpan = new ArrayList<WordSpan>(span.size() + 1);
|
|
||||||
newSpan.addAll(span);
|
|
||||||
newSpan.add(next);
|
|
||||||
newLivingSpans.add(newSpan);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
livingSpans.clear();
|
|
||||||
livingSpans.addAll(newLivingSpans);
|
|
||||||
}
|
|
||||||
|
|
||||||
return goodSpans;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private JoinedQueryAndNonLiteralTokens joinQuery(List<Token> query) {
|
|
||||||
StringJoiner s = new StringJoiner(" ");
|
|
||||||
List<Token> leftovers = new ArrayList<>(5);
|
|
||||||
|
|
||||||
for (var t : query) {
|
|
||||||
if (t.type == TokenType.LITERAL_TERM) {
|
|
||||||
s.add(t.displayStr);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
leftovers.add(t);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new JoinedQueryAndNonLiteralTokens(s.toString(), leftovers);
|
|
||||||
}
|
|
||||||
|
|
||||||
record JoinedQueryAndNonLiteralTokens(String joinedQuery, List<Token> nonLiterals) {}
|
|
||||||
}
|
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant.model;
|
package nu.marginalia.functions.searchquery.query_parser.model;
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
|
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant.model;
|
package nu.marginalia.functions.searchquery.query_parser.model;
|
||||||
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
@ -15,8 +15,7 @@ import java.util.stream.Stream;
|
|||||||
public class QWordGraph implements Iterable<QWord> {
|
public class QWordGraph implements Iterable<QWord> {
|
||||||
|
|
||||||
|
|
||||||
public record QWordGraphLink(QWord from, QWord to) {
|
public record QWordGraphLink(QWord from, QWord to) {}
|
||||||
}
|
|
||||||
|
|
||||||
private final List<QWordGraphLink> links = new ArrayList<>();
|
private final List<QWordGraphLink> links = new ArrayList<>();
|
||||||
private final Map<QWord, List<QWord>> fromTo = new HashMap<>();
|
private final Map<QWord, List<QWord>> fromTo = new HashMap<>();
|
||||||
@ -121,8 +120,6 @@ public class QWordGraph implements Iterable<QWord> {
|
|||||||
// understanding which vertexes can be re-ordered without changing
|
// understanding which vertexes can be re-ordered without changing
|
||||||
// the semantics of the encoded query.
|
// the semantics of the encoded query.
|
||||||
public boolean isBypassed(QWord word, QWord begin, QWord end) {
|
public boolean isBypassed(QWord word, QWord begin, QWord end) {
|
||||||
assert word.isOriginal() : "Can only bypass original words";
|
|
||||||
|
|
||||||
Set<QWord> edge = new HashSet<>();
|
Set<QWord> edge = new HashSet<>();
|
||||||
Set<QWord> visited = new HashSet<>();
|
Set<QWord> visited = new HashSet<>();
|
||||||
|
|
||||||
@ -163,6 +160,7 @@ public class QWordGraph implements Iterable<QWord> {
|
|||||||
List<QWord> edge = new ArrayList<>();
|
List<QWord> edge = new ArrayList<>();
|
||||||
List<QWord> visited = new ArrayList<>();
|
List<QWord> visited = new ArrayList<>();
|
||||||
|
|
||||||
|
visited.add(begin);
|
||||||
edge.add(begin);
|
edge.add(begin);
|
||||||
|
|
||||||
while (!edge.isEmpty()) {
|
while (!edge.isEmpty()) {
|
||||||
@ -172,7 +170,9 @@ public class QWordGraph implements Iterable<QWord> {
|
|||||||
if (Objects.equals(w, end))
|
if (Objects.equals(w, end))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
assert (!w.isEnd() && end.isEnd()) : "Graph has a path beyond the specified end vertex";
|
if (w.isEnd()) {
|
||||||
|
assert end.isEnd() : "Graph has a path beyond the specified end vertex " + end;
|
||||||
|
}
|
||||||
|
|
||||||
next.addAll(getNext(w));
|
next.addAll(getNext(w));
|
||||||
}
|
}
|
||||||
@ -182,7 +182,7 @@ public class QWordGraph implements Iterable<QWord> {
|
|||||||
edge = next;
|
edge = next;
|
||||||
}
|
}
|
||||||
|
|
||||||
return visited;
|
return visited.stream().distinct().toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns a list of subgraphs that are connected on the path from
|
/** Returns a list of subgraphs that are connected on the path from
|
||||||
@ -201,7 +201,7 @@ public class QWordGraph implements Iterable<QWord> {
|
|||||||
|
|
||||||
List<QWord> points = nodesBetween(begin, end)
|
List<QWord> points = nodesBetween(begin, end)
|
||||||
.stream()
|
.stream()
|
||||||
.filter(w -> isBypassed(w, begin, end))
|
.filter(w -> !isBypassed(w, begin, end))
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
for (int i = 0; i < points.size() - 1; i++) {
|
for (int i = 0; i < points.size() - 1; i++) {
|
||||||
@ -214,6 +214,36 @@ public class QWordGraph implements Iterable<QWord> {
|
|||||||
return subgraphs;
|
return subgraphs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String compileToQuery() {
|
||||||
|
return compileToQuery(QWord.beg(), QWord.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
public String compileToQuery(QWord begin, QWord end) {
|
||||||
|
StringJoiner sj = new StringJoiner(" ");
|
||||||
|
|
||||||
|
for (var subgraph : getSubgraphs(begin, end)) {
|
||||||
|
if (getNext(subgraph.from).equals(List.of(subgraph.to))) {
|
||||||
|
if (subgraph.from.isBeg())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
sj.add(subgraph.from.word());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
StringJoiner branchJoiner = new StringJoiner(" | ", "( ", " )");
|
||||||
|
if (Objects.equals(subgraph.from, begin)) {
|
||||||
|
for (QWord path : getNext(subgraph.from)) {
|
||||||
|
branchJoiner.add(compileToQuery(path, subgraph.to));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
branchJoiner.add(compileToQuery(subgraph.from, subgraph.to));
|
||||||
|
}
|
||||||
|
sj.add(branchJoiner.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sj.toString();
|
||||||
|
}
|
||||||
|
|
||||||
@NotNull
|
@NotNull
|
||||||
@Override
|
@Override
|
@ -1,7 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant;
|
|
||||||
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
|
|
||||||
|
|
||||||
public interface ExpansionStrategy {
|
|
||||||
void expand(QWordGraph graph);
|
|
||||||
}
|
|
@ -1,17 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.EqualsAndHashCode;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.ToString;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@AllArgsConstructor
|
|
||||||
@Getter
|
|
||||||
@ToString
|
|
||||||
@EqualsAndHashCode
|
|
||||||
public class QueryVariant {
|
|
||||||
public final List<String> terms;
|
|
||||||
public final double value;
|
|
||||||
}
|
|
@ -1,21 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant;
|
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.ToString;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@Getter
|
|
||||||
@ToString
|
|
||||||
public class QueryVariantSet {
|
|
||||||
public final List<QueryVariant> faithful = new ArrayList<>();
|
|
||||||
public final List<QueryVariant> alternative = new ArrayList<>();
|
|
||||||
|
|
||||||
public final List<Token> nonLiterals = new ArrayList<>();
|
|
||||||
|
|
||||||
public boolean isEmpty() {
|
|
||||||
return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty();
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,10 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
|
|
||||||
@AllArgsConstructor
|
|
||||||
public class QueryWord {
|
|
||||||
public final String stemmed;
|
|
||||||
public final String word;
|
|
||||||
public final String wordOriginal;
|
|
||||||
}
|
|
@ -1,8 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public interface VariantStrategy {
|
|
||||||
Collection<? extends List<String>> constructVariants(List<QueryWord> ls);
|
|
||||||
}
|
|
@ -11,8 +11,6 @@ import nu.marginalia.language.WordPatterns;
|
|||||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
|
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.QueryPermutation;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.QueryVariants;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
||||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||||
@ -29,43 +27,19 @@ public class QueryFactory {
|
|||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
|
private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
|
||||||
private final ThreadLocal<QueryVariants> queryVariants;
|
|
||||||
private final QueryParser queryParser = new QueryParser();
|
private final QueryParser queryParser = new QueryParser();
|
||||||
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public QueryFactory(LanguageModels lm,
|
public QueryFactory(LanguageModels lm,
|
||||||
TermFrequencyDict dict,
|
TermFrequencyDict dict,
|
||||||
EnglishDictionary englishDictionary) {
|
EnglishDictionary englishDictionary)
|
||||||
this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, englishDictionary));
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public QueryPermutation getQueryPermutation() {
|
|
||||||
return new QueryPermutation(queryVariants.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
public ProcessedQuery createQuery(QueryParams params) {
|
public ProcessedQuery createQuery(QueryParams params) {
|
||||||
final var processedQuery = createQuery(getQueryPermutation(), params);
|
|
||||||
final List<SearchSubquery> subqueries = processedQuery.specs.subqueries;
|
|
||||||
|
|
||||||
// There used to be a piece of logic here that would try to figure out which one of these subqueries were the "best",
|
|
||||||
// it's gone for the moment, but it would be neat if it resurrected somehow
|
|
||||||
|
|
||||||
trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT);
|
|
||||||
|
|
||||||
return processedQuery;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void trimArray(List<?> arr, int maxSize) {
|
|
||||||
if (arr.size() > maxSize) {
|
|
||||||
arr.subList(0, arr.size() - maxSize).clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public ProcessedQuery createQuery(QueryPermutation queryPermutation,
|
|
||||||
QueryParams params)
|
|
||||||
{
|
|
||||||
final var query = params.humanQuery();
|
final var query = params.humanQuery();
|
||||||
|
|
||||||
if (query.length() > 1000) {
|
if (query.length() > 1000) {
|
||||||
@ -100,17 +74,19 @@ public class QueryFactory {
|
|||||||
t.visit(qualityLimits);
|
t.visit(qualityLimits);
|
||||||
}
|
}
|
||||||
|
|
||||||
var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
|
// var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
|
||||||
List<SearchSubquery> subqueries = new ArrayList<>();
|
List<SearchSubquery> subqueries = new ArrayList<>();
|
||||||
|
QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
|
||||||
for (var parts : queryPermutations) {
|
|
||||||
QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(parts);
|
|
||||||
|
|
||||||
domain = termsAccumulator.domain;
|
domain = termsAccumulator.domain;
|
||||||
|
|
||||||
SearchSubquery subquery = termsAccumulator.createSubquery();
|
// for (var parts : queryPermutations) {
|
||||||
subqueries.add(subquery);
|
// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
|
||||||
}
|
//
|
||||||
|
// domain = termsAccumulator.domain;
|
||||||
|
//
|
||||||
|
// SearchSubquery subquery = termsAccumulator.createSubquery();
|
||||||
|
// subqueries.add(subquery);
|
||||||
|
// }
|
||||||
|
|
||||||
List<Integer> domainIds = params.domainIds();
|
List<Integer> domainIds = params.domainIds();
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant.model;
|
package nu.marginalia.functions.searchquery.query_parser.model;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
@ -10,11 +10,13 @@ class QWordGraphTest {
|
|||||||
|
|
||||||
System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end()));
|
System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end()));
|
||||||
System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end()));
|
System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end()));
|
||||||
|
System.out.println(graph.compileToQuery());
|
||||||
graph.links().forEach(System.out::println);
|
graph.links().forEach(System.out::println);
|
||||||
System.out.println("--");
|
System.out.println("--");
|
||||||
graph.nodes().forEach(System.out::println);
|
graph.nodes().forEach(System.out::println);
|
||||||
System.out.println("--");
|
System.out.println("--");
|
||||||
graph.addVariant(graph.nodes().get(1), "sup");
|
graph.addVariant(graph.nodes().get(1), "sup");
|
||||||
|
System.out.println(graph.compileToQuery());
|
||||||
System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end()));
|
System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end()));
|
||||||
System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end()));
|
System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end()));
|
||||||
System.out.println("--");
|
System.out.println("--");
|
||||||
@ -23,6 +25,8 @@ class QWordGraphTest {
|
|||||||
graph.nodes().forEach(System.out::println);
|
graph.nodes().forEach(System.out::println);
|
||||||
|
|
||||||
graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall");
|
graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall");
|
||||||
|
graph.addVariant(graph.nodes().get(2), "globe");
|
||||||
|
System.out.println(graph.compileToQuery());
|
||||||
System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end()));
|
System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end()));
|
||||||
System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end()));
|
System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end()));
|
||||||
System.out.println("--");
|
System.out.println("--");
|
Loading…
Reference in New Issue
Block a user