(query) Always generate an ngram alternative, and suppress generation of multiple identical query branches

This commit is contained in:
Viktor Lofgren 2024-04-19 19:42:30 +02:00
parent 5165cf6d15
commit 64baa41e64
3 changed files with 47 additions and 9 deletions

View File

@ -11,6 +11,7 @@ import org.apache.commons.lang3.StringUtils;
import java.util.*; import java.util.*;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors;
/** Responsible for expanding a query, that is creating alternative branches of query execution /** Responsible for expanding a query, that is creating alternative branches of query execution
* to increase the number of results * to increase the number of results
@ -23,7 +24,8 @@ public class QueryExpansion {
private final List<ExpansionStrategy> expansionStrategies = List.of( private final List<ExpansionStrategy> expansionStrategies = List.of(
this::joinDashes, this::joinDashes,
this::splitWordNum, this::splitWordNum,
this::joinTerms this::joinTerms,
this::ngramAll
); );
@Inject @Inject
@ -63,6 +65,22 @@ public class QueryExpansion {
} }
/** Expansion strategy that proposes one variant covering the whole query:
 * every word yielded by iterating the graph, except those flagged by
 * isBeg() or isEnd(), is joined with underscores and registered through
 * addVariantForSpan() for the span from the first to the last such word.
 * <p>
 * Nothing is added when fewer than two words remain after skipping
 * the begin/end words.
 *
 * @param graph the query word graph to read from and add the variant to
 */
public void ngramAll(QWordGraph graph) {
    List<QWord> interior = new ArrayList<>();

    for (QWord candidate : graph) {
        boolean isBoundary = candidate.isBeg() || candidate.isEnd();
        if (!isBoundary) {
            interior.add(candidate);
        }
    }

    // A lone word (or no word at all) has nothing to be joined with
    if (interior.size() <= 1) {
        return;
    }

    StringJoiner ngram = new StringJoiner("_");
    for (QWord part : interior) {
        ngram.add(part.word());
    }

    graph.addVariantForSpan(interior.getFirst(), interior.getLast(), ngram.toString());
}
// Turn 'MP3' into 'MP-3' // Turn 'MP3' into 'MP-3'
public void splitWordNum(QWordGraph graph) { public void splitWordNum(QWordGraph graph) {
for (var qw : graph) { for (var qw : graph) {

View File

@ -2,6 +2,8 @@ package nu.marginalia.functions.searchquery.query_parser.model;
import ca.rmen.porterstemmer.PorterStemmer; import ca.rmen.porterstemmer.PorterStemmer;
import java.util.Objects;
public record QWord( public record QWord(
int ord, int ord,
boolean variant, boolean variant,
@ -48,4 +50,22 @@ public record QWord(
public String toString() { public String toString() {
return STR."q{\{word}}"; return STR."q{\{word}}";
} }
/** Equality based on variant, word, stemmed and isOriginal() only;
 * the remaining record components (such as ord) do not take part,
 * so two words that differ only in those still compare equal.
 *
 * @param o the object to compare against, may be null
 * @return true if o is a QWord agreeing on all compared properties
 */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    // QWord is a record and thus final, so instanceof matches exactly this class
    if (!(o instanceof QWord other)) {
        return false;
    }

    if (variant != other.variant) {
        return false;
    }
    if (!Objects.equals(word, other.word)) {
        return false;
    }
    if (!Objects.equals(stemmed, other.stemmed)) {
        return false;
    }
    return Objects.equals(isOriginal(), other.isOriginal());
}
/** Hash code derived from the same properties that equals() compares
 * (variant, stemmed, word and isOriginal()), folded with the
 * conventional multiplier of 31.
 *
 * @return the combined hash of the compared properties
 */
@Override
public int hashCode() {
    final int variantHash = Boolean.hashCode(variant);
    final int stemmedHash = Objects.hashCode(stemmed);
    final int wordHash = Objects.hashCode(word);
    final int originalHash = Objects.hashCode(isOriginal());

    return ((variantHash * 31 + stemmedHash) * 31 + wordHash) * 31 + originalHash;
}
} }

View File

@ -19,8 +19,8 @@ public class QWordGraph implements Iterable<QWord> {
public record QWordGraphLink(QWord from, QWord to) {} public record QWordGraphLink(QWord from, QWord to) {}
private final List<QWordGraphLink> links = new ArrayList<>(); private final List<QWordGraphLink> links = new ArrayList<>();
private final Map<QWord, List<QWord>> fromTo = new HashMap<>(); private final Map<Integer, List<QWord>> fromTo = new HashMap<>();
private final Map<QWord, List<QWord>> toFrom = new HashMap<>(); private final Map<Integer, List<QWord>> toFrom = new HashMap<>();
private int wordId = 0; private int wordId = 0;
@ -79,8 +79,8 @@ public class QWordGraph implements Iterable<QWord> {
public void addLink(QWord from, QWord to) { public void addLink(QWord from, QWord to) {
links.add(new QWordGraphLink(from, to)); links.add(new QWordGraphLink(from, to));
fromTo.computeIfAbsent(from, k -> new ArrayList<>()).add(to); fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to);
toFrom.computeIfAbsent(to, k -> new ArrayList<>()).add(from); toFrom.computeIfAbsent(to.ord(), k -> new ArrayList<>()).add(from);
} }
public List<QWordGraphLink> links() { public List<QWordGraphLink> links() {
@ -103,20 +103,20 @@ public class QWordGraph implements Iterable<QWord> {
} }
public List<QWord> getNext(QWord word) { public List<QWord> getNext(QWord word) {
return fromTo.getOrDefault(word, List.of()); return fromTo.getOrDefault(word.ord(), List.of());
} }
public List<QWord> getNextOriginal(QWord word) { public List<QWord> getNextOriginal(QWord word) {
return fromTo.getOrDefault(word, List.of()) return fromTo.getOrDefault(word.ord(), List.of())
.stream() .stream()
.filter(QWord::isOriginal) .filter(QWord::isOriginal)
.toList(); .toList();
} }
public List<QWord> getPrev(QWord word) { public List<QWord> getPrev(QWord word) {
return toFrom.getOrDefault(word, List.of()); return toFrom.getOrDefault(word.ord(), List.of());
} }
public List<QWord> getPrevOriginal(QWord word) { public List<QWord> getPrevOriginal(QWord word) {
return toFrom.getOrDefault(word, List.of()) return toFrom.getOrDefault(word.ord(), List.of())
.stream() .stream()
.filter(QWord::isOriginal) .filter(QWord::isOriginal)
.toList(); .toList();