Better handling of quote terms, fix bug in handling of longer queries.

... where some terms may previously have been ignored. The latter bug was due to the handling of QueryHeads with AnyOf-style predicates interacting poorly with alreadyConsideredTerms in SearchIndex.java
This commit is contained in:
Viktor Lofgren 2023-04-10 13:11:40 +02:00
parent 810515c08d
commit fe419b12b4
12 changed files with 146 additions and 72 deletions

View File

@ -10,7 +10,7 @@ import java.util.stream.Collectors;
@AllArgsConstructor
public class SearchSubquery {
/** These terms must be present in the document */
/** These terms must be present in the document and are used in ranking*/
public final List<String> searchTermsInclude;
/** These terms must be absent from the document */
@ -21,18 +21,22 @@ public class SearchSubquery {
/** If these optional terms are present in the document, rank it highly */
public final List<String> searchTermsPriority;
/** Terms that we require to be in the same sentence */
public final List<List<String>> searchTermCoherences;
private double value = 0;
public SearchSubquery(List<String> searchTermsInclude,
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
List<String> searchTermsPriority
) {
List<String> searchTermsPriority,
List<List<String>> searchTermCoherences) {
this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude;
this.searchTermsAdvice = searchTermsAdvice;
this.searchTermsPriority = searchTermsPriority;
this.searchTermCoherences = searchTermCoherences;
}
public SearchSubquery setValue(double value) {
@ -51,6 +55,7 @@ public class SearchSubquery {
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
return sb.toString();
}

View File

@ -6,9 +6,7 @@ import static java.lang.Boolean.compare;
import static java.lang.Double.compare;
public record SearchResultPreliminaryScore(
boolean anyAllSynthetic,
int minNumberOfFlagsSet,
int minPositionsSet,
boolean disqualified,
boolean hasPriorityTerm,
double searchRankingScore)
implements Comparable<SearchResultPreliminaryScore>
@ -27,16 +25,7 @@ public record SearchResultPreliminaryScore(
return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore);
}
public boolean isEmpty() {
if (minNumberOfFlagsSet > 0)
return false;
if (anyAllSynthetic)
return false;
if (minPositionsSet > 0)
return false;
return true;
public boolean isDisqualified() {
return disqualified;
}
}

View File

@ -110,11 +110,12 @@ public class SearchIndex {
// that contain pairs of two search terms
if (orderedIncludes.length > 1) {
for (int i = 0; i + 1 < orderedIncludes.length; i++) {
var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length);
var entrySource = indexReader
.findPriorityWord(orderedIncludes[i])
.alsoPrioAnyOf(remainingWords);
queryHeads.add(entrySource);
for (int j = i + 1; j < orderedIncludes.length; j++) {
var entrySource = indexReader
.findPriorityWord(orderedIncludes[i])
.alsoPrio(orderedIncludes[j]);
queryHeads.add(entrySource);
}
}
}

View File

@ -5,9 +5,18 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
public record SearchIndexSearchTerms(IntList includes, IntList excludes, IntList priority) {
import java.util.Collections;
import java.util.List;
public record SearchIndexSearchTerms(
IntList includes,
IntList excludes,
IntList priority,
List<IntList> coherences
)
{
public SearchIndexSearchTerms() {
this(IntList.of(), IntList.of(), IntList.of());
this(IntList.of(), IntList.of(), IntList.of(), Collections.emptyList());
}
public boolean isEmpty() {

View File

@ -8,8 +8,10 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.svc.SearchTermsService;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.ResultValuator;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;
@ -74,7 +76,27 @@ public class IndexMetadataService {
}
}
return new QuerySearchTerms(termToId, termIdsList.toIntArray());
return new QuerySearchTerms(termToId,
termIdsList.toIntArray(),
getTermCoherences(searchTermVariants));
}
private TermCoherences getTermCoherences(List<SearchSubquery> searchTermVariants) {
List<int[]> coherences = new ArrayList<>();
for (var subquery : searchTermVariants) {
for (var coh : subquery.searchTermCoherences) {
int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray();
coherences.add(ids);
}
// It's assumed each subquery has identical coherences
break;
}
return new TermCoherences(coherences);
}
public TLongHashSet getResultsWithPriorityTerms(List<SearchSubquery> subqueries, long[] resultsArray) {
@ -116,15 +138,32 @@ public class IndexMetadataService {
return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0);
}
public boolean testCoherence(long docId, TermCoherences coherences) {
for (var coherenceSet : coherences.words()) {
long overlap = 0xFF_FFFF_FFFF_FFFFL;
for (var word : coherenceSet) {
overlap &= WordMetadata.decodePositions(getTermMetadata(word, docId));
}
if (overlap == 0L) {
return false;
}
}
return true;
}
}
public static class QuerySearchTerms {
private final TObjectIntHashMap<String> termToId;
public final int[] termIdsAll;
public QuerySearchTerms(TObjectIntHashMap<String> termToId, int[] termIdsAll) {
public final TermCoherences coherences;
public QuerySearchTerms(TObjectIntHashMap<String> termToId, int[] termIdsAll, TermCoherences coherences) {
this.termToId = termToId;
this.termIdsAll = termIdsAll;
this.coherences = coherences;
}
public int get(String searchTerm) {
@ -132,6 +171,8 @@ public class IndexMetadataService {
}
}
public record TermCoherences(List<int[]> words) {}
private static long termdocKey(int termId, long docId) {
return (docId << 32) | termId;
}

View File

@ -117,10 +117,15 @@ public class IndexResultValuator {
double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext);
boolean disqualified = false;
if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences))
disqualified = true;
else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0)
disqualified = true;
searchResult.setScore(new SearchResultPreliminaryScore(
anyAllSynthetic,
maxFlagsCount,
maxPositionsSet,
disqualified,
hasPriorityTerm,
score
));
@ -140,6 +145,7 @@ public class IndexResultValuator {
return false;
}
}
return true;
}

View File

@ -155,6 +155,7 @@ public class IndexQueryService {
outer:
// These queries are various term combinations
for (var subquery : params.subqueries) {
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);
if (searchTerms.isEmpty()) {
@ -195,16 +196,20 @@ public class IndexQueryService {
}
var includes = subquery.searchTermsInclude;
var advice = subquery.searchTermsAdvice;
var excludes = subquery.searchTermsExclude;
var priority = subquery.searchTermsPriority;
for (int i = 0; i < subquery.searchTermsInclude.size(); i++) {
for (int i = 0; i < includes.size(); i++) {
logger.info(queryMarker, "{} -> {} I", includes.get(i), searchTerms.includes().getInt(i));
}
for (int i = 0; i < subquery.searchTermsExclude.size(); i++) {
for (int i = 0; i < advice.size(); i++) {
logger.info(queryMarker, "{} -> {} A", advice.get(i), searchTerms.includes().getInt(includes.size() + i));
}
for (int i = 0; i < excludes.size(); i++) {
logger.info(queryMarker, "{} -> {} E", excludes.get(i), searchTerms.excludes().getInt(i));
}
for (int i = 0; i < subquery.searchTermsPriority.size(); i++) {
for (int i = 0; i < priority.size(); i++) {
logger.info(queryMarker, "{} -> {} P", priority.get(i), searchTerms.priority().getInt(i));
}
}
@ -247,7 +252,7 @@ public class IndexQueryService {
return Arrays.stream(resultIds.toArray())
.parallel()
.mapToObj(evaluator::calculatePreliminaryScore)
.filter(score -> !score.getScore().isEmpty())
.filter(score -> !score.getScore().isDisqualified())
.collect(Collectors.toList());
}

View File

@ -11,10 +11,7 @@ import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import java.util.*;
@Singleton
public class SearchTermsService {
@ -30,34 +27,49 @@ public class SearchTermsService {
final IntList excludes = new IntArrayList();
final IntList includes = new IntArrayList();
final IntList priority = new IntArrayList();
final List<IntList> coherences = new ArrayList<>();
for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + include);
if (!addEachTerm(includes, request.searchTermsInclude)) {
return new SearchIndexSearchTerms();
}
// This looks like a bug, but it's not
// v--- ----v
if (!addEachTerm(includes, request.searchTermsAdvice)) {
return new SearchIndexSearchTerms();
}
for (var coherence : request.searchTermCoherences) {
IntList parts = new IntArrayList(coherence.size());
if (!addEachTerm(parts, coherence)) {
return new SearchIndexSearchTerms();
}
includes.add(word.getAsInt());
coherences.add(parts);
}
// we don't care if we can't find these:
addEachTerm(excludes, request.searchTermsExclude);
addEachTerm(priority, request.searchTermsPriority);
for (var advice : request.searchTermsAdvice) {
var word = lookUpWord(advice);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + advice);
return new SearchIndexSearchTerms();
return new SearchIndexSearchTerms(includes, excludes, priority, coherences);
}
private boolean addEachTerm(IntList ret, List<String> words) {
boolean success = true;
for (var exclude : words) {
var word = lookUpWord(exclude);
if (word.isPresent()) {
lookUpWord(exclude).ifPresent(ret::add);
}
else {
success = false;
}
includes.add(word.getAsInt());
}
for (var exclude : request.searchTermsExclude) {
lookUpWord(exclude).ifPresent(excludes::add);
}
for (var exclude : request.searchTermsPriority) {
lookUpWord(exclude).ifPresent(priority::add);
}
return new SearchIndexSearchTerms(includes, excludes, priority);
return success;
}

View File

@ -28,11 +28,9 @@ import spark.Spark;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@Execution(SAME_THREAD)
@ -91,8 +89,8 @@ public class IndexQueryServiceIntegrationTest {
.domains(new ArrayList<>())
.searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
))).build());
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))).build());
Assertions.assertArrayEquals(
new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
@ -123,8 +121,8 @@ public class IndexQueryServiceIntegrationTest {
.queryStrategy(QueryStrategy.SENTENCE)
.domains(List.of(2))
.subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
))).build());
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))).build());
Assertions.assertArrayEquals(
new int[] { 210, 270 },
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
@ -149,8 +147,8 @@ public class IndexQueryServiceIntegrationTest {
.searchSetIdentifier(SearchSetIdentifier.NONE)
.rankingParams(ResultRankingParameters.sensibleDefaults())
.subqueries(List.of(new SearchSubquery(
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()
))
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))
).build());
@ -167,8 +165,6 @@ public class IndexQueryServiceIntegrationTest {
}
public void loadData(int id) {
int[] factors = IntStream
.rangeClosed(1, id)

View File

@ -83,8 +83,8 @@ public class QueryFactory {
Arrays.asList(termsInclude),
Collections.emptyList(),
Collections.emptyList(),
Collections.emptyList()
));
Collections.emptyList(),
Collections.emptyList()));
var specs = SearchSpecification.builder()
.subqueries(sqs)

View File

@ -9,17 +9,19 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/** @see SearchSubquery */
public class QuerySearchTermsAccumulator implements TokenVisitor {
public List<String> searchTermsExclude = new ArrayList<>();
public List<String> searchTermsInclude = new ArrayList<>();
public List<String> searchTermsAdvice = new ArrayList<>();
public List<String> searchTermsPriority = new ArrayList<>();
public List<List<String>> searchTermCoherences = new ArrayList<>();
public String near;
public String domain;
public SearchSubquery createSubquery() {
return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
}
public QuerySearchTermsAccumulator(SearchProfile profile, List<Token> parts) {
@ -45,11 +47,19 @@ public class QuerySearchTermsAccumulator implements TokenVisitor {
public void onQuotTerm(Token token) {
String[] parts = token.str.split("_");
if (parts.length > 1) {
// Prefer that the actual n-gram is present
searchTermsAdvice.add(token.str);
// Require that the terms appear in the same sentence
searchTermCoherences.add(Arrays.asList(parts));
// Require that each term exists in the document
// (needed for ranking)
searchTermsInclude.addAll(Arrays.asList(parts));
}
else {
searchTermsInclude.add(token.str);
}
}

View File

@ -29,7 +29,7 @@ public class LoadTestMain {
for (int i = 0; i < 10000; i++) {
String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted(
Strings.join(pickNCommonWords(2), '+')
Strings.join(pickNCommonWords(4), '+')
);
HttpRequest req = HttpRequest.newBuilder(new URI(uri))