(query) Always generate an ngram alternative, and suppress generation of multiple identical query branches

This commit is contained in:
Viktor Lofgren 2024-04-19 19:42:30 +02:00
parent 5165cf6d15
commit 64baa41e64
3 changed files with 47 additions and 9 deletions

View File

@ -11,6 +11,7 @@ import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/** Responsible for expanding a query, that is creating alternative branches of query execution
* to increase the number of results
@ -23,7 +24,8 @@ public class QueryExpansion {
private final List<ExpansionStrategy> expansionStrategies = List.of(
this::joinDashes,
this::splitWordNum,
this::joinTerms
this::joinTerms,
this::ngramAll
);
@Inject
@ -63,6 +65,22 @@ public class QueryExpansion {
}
/** Adds one variant that covers the whole query: every word in the graph for which
 * neither isBeg() nor isEnd() holds, joined with underscores.
 * <p>
 * The variant spans from the first such word to the last one. Nothing is added
 * when fewer than two such words are found.
 */
public void ngramAll(QWordGraph graph) {
    QWord spanStart = null;
    QWord spanEnd = null;
    int wordCount = 0;
    StringJoiner joined = new StringJoiner("_");

    for (QWord candidate : graph) {
        if (!candidate.isBeg() && !candidate.isEnd()) {
            if (wordCount == 0) {
                spanStart = candidate;
            }
            spanEnd = candidate;
            joined.add(candidate.word());
            wordCount++;
        }
    }

    // A single word has nothing to be joined with, so only act on two or more
    if (wordCount >= 2) {
        graph.addVariantForSpan(spanStart, spanEnd, joined.toString());
    }
}
// Turn 'MP3' into 'MP-3'
public void splitWordNum(QWordGraph graph) {
for (var qw : graph) {

View File

@ -2,6 +2,8 @@ package nu.marginalia.functions.searchquery.query_parser.model;
import ca.rmen.porterstemmer.PorterStemmer;
import java.util.Objects;
public record QWord(
int ord,
boolean variant,
@ -48,4 +50,22 @@ public record QWord(
public String toString() {
return STR."q{\{word}}";
}
/** Compares variant, word, stemmed and isOriginal(); the ord component is not compared. */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    // QWord is a record and therefore final, so instanceof accepts exactly the
    // same objects as an exact getClass() comparison (and rejects null).
    if (!(o instanceof QWord other)) {
        return false;
    }
    if (variant != other.variant) {
        return false;
    }
    if (!Objects.equals(word, other.word)) {
        return false;
    }
    if (!Objects.equals(stemmed, other.stemmed)) {
        return false;
    }
    return Objects.equals(isOriginal(), other.isOriginal());
}
/** Hashes variant, stemmed, word and isOriginal(), the same values that equals() compares. */
@Override
public int hashCode() {
    // Standard 31-multiplier fold, seeded with the hash of the variant flag
    return 31 * (31 * (31 * Boolean.hashCode(variant)
                + Objects.hashCode(stemmed))
            + Objects.hashCode(word))
        + Objects.hashCode(isOriginal());
}
}

View File

@ -19,8 +19,8 @@ public class QWordGraph implements Iterable<QWord> {
/** A link in the graph, leading from one word to another */
public record QWordGraphLink(QWord from, QWord to) {}

// Every link added via addLink, in insertion order
private final List<QWordGraphLink> links = new ArrayList<>();

// Adjacency lists keyed by QWord.ord() rather than by the QWord itself:
// QWord.equals() does not compare ord, so words at different positions can be
// equal and would otherwise share a single map entry.
private final Map<Integer, List<QWord>> fromTo = new HashMap<>();
private final Map<Integer, List<QWord>> toFrom = new HashMap<>();

private int wordId = 0;
@ -79,8 +79,8 @@ public class QWordGraph implements Iterable<QWord> {
/** Records a link from one word to another.
 * <p>
 * The link is appended to the link list and registered in both adjacency maps,
 * each keyed by the ord() of the word being looked up.
 *
 * @param from the word the link leaves
 * @param to the word the link arrives at
 */
public void addLink(QWord from, QWord to) {
    links.add(new QWordGraphLink(from, to));

    // Key by ord(): the maps are Integer-keyed, and each link is registered exactly once per direction
    fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to);
    toFrom.computeIfAbsent(to.ord(), k -> new ArrayList<>()).add(from);
}
public List<QWordGraphLink> links() {
@ -103,20 +103,20 @@ public class QWordGraph implements Iterable<QWord> {
}
/** Returns the words that the given word links to.
 *
 * @param word the word whose outgoing links are looked up, by its ord()
 * @return the linked words, or an empty list when none are registered
 */
public List<QWord> getNext(QWord word) {
    // The map is keyed by ord(); looking up with the QWord itself would never match
    return fromTo.getOrDefault(word.ord(), List.of());
}
public List<QWord> getNextOriginal(QWord word) {
return fromTo.getOrDefault(word, List.of())
return fromTo.getOrDefault(word.ord(), List.of())
.stream()
.filter(QWord::isOriginal)
.toList();
}
/** Returns the words that link to the given word.
 *
 * @param word the word whose incoming links are looked up, by its ord()
 * @return the linking words, or an empty list when none are registered
 */
public List<QWord> getPrev(QWord word) {
    // The map is keyed by ord(); looking up with the QWord itself would never match
    return toFrom.getOrDefault(word.ord(), List.of());
}
public List<QWord> getPrevOriginal(QWord word) {
return toFrom.getOrDefault(word, List.of())
return toFrom.getOrDefault(word.ord(), List.of())
.stream()
.filter(QWord::isOriginal)
.toList();