Better handling of quote terms, fix bug in handling of longer queries.

... where some terms may previously have been ignored. The latter bug was due to the handling of QueryHeads with AnyOf-style predicates interacting poorly with alreadyConsideredTerms in SearchIndex.java
This commit is contained in:
Viktor Lofgren 2023-04-10 13:11:40 +02:00
parent 810515c08d
commit fe419b12b4
12 changed files with 146 additions and 72 deletions

View File

@ -10,7 +10,7 @@ import java.util.stream.Collectors;
@AllArgsConstructor @AllArgsConstructor
public class SearchSubquery { public class SearchSubquery {
/** These terms must be present in the document */ /** These terms must be present in the document and are used in ranking*/
public final List<String> searchTermsInclude; public final List<String> searchTermsInclude;
/** These terms must be absent from the document */ /** These terms must be absent from the document */
@ -22,17 +22,21 @@ public class SearchSubquery {
/** If these optional terms are present in the document, rank it highly */ /** If these optional terms are present in the document, rank it highly */
public final List<String> searchTermsPriority; public final List<String> searchTermsPriority;
/** Terms that we require to be in the same sentence */
public final List<List<String>> searchTermCoherences;
private double value = 0; private double value = 0;
public SearchSubquery(List<String> searchTermsInclude, public SearchSubquery(List<String> searchTermsInclude,
List<String> searchTermsExclude, List<String> searchTermsExclude,
List<String> searchTermsAdvice, List<String> searchTermsAdvice,
List<String> searchTermsPriority List<String> searchTermsPriority,
) { List<List<String>> searchTermCoherences) {
this.searchTermsInclude = searchTermsInclude; this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude; this.searchTermsExclude = searchTermsExclude;
this.searchTermsAdvice = searchTermsAdvice; this.searchTermsAdvice = searchTermsAdvice;
this.searchTermsPriority = searchTermsPriority; this.searchTermsPriority = searchTermsPriority;
this.searchTermCoherences = searchTermCoherences;
} }
public SearchSubquery setValue(double value) { public SearchSubquery setValue(double value) {
@ -51,6 +55,7 @@ public class SearchSubquery {
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
return sb.toString(); return sb.toString();
} }

View File

@ -6,9 +6,7 @@ import static java.lang.Boolean.compare;
import static java.lang.Double.compare; import static java.lang.Double.compare;
public record SearchResultPreliminaryScore( public record SearchResultPreliminaryScore(
boolean anyAllSynthetic, boolean disqualified,
int minNumberOfFlagsSet,
int minPositionsSet,
boolean hasPriorityTerm, boolean hasPriorityTerm,
double searchRankingScore) double searchRankingScore)
implements Comparable<SearchResultPreliminaryScore> implements Comparable<SearchResultPreliminaryScore>
@ -27,16 +25,7 @@ public record SearchResultPreliminaryScore(
return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore); return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore);
} }
public boolean isEmpty() { public boolean isDisqualified() {
if (minNumberOfFlagsSet > 0) return disqualified;
return false;
if (anyAllSynthetic)
return false;
if (minPositionsSet > 0)
return false;
return true;
} }
} }

View File

@ -110,13 +110,14 @@ public class SearchIndex {
// that contain pairs of two search terms // that contain pairs of two search terms
if (orderedIncludes.length > 1) { if (orderedIncludes.length > 1) {
for (int i = 0; i + 1 < orderedIncludes.length; i++) { for (int i = 0; i + 1 < orderedIncludes.length; i++) {
var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length); for (int j = i + 1; j < orderedIncludes.length; j++) {
var entrySource = indexReader var entrySource = indexReader
.findPriorityWord(orderedIncludes[i]) .findPriorityWord(orderedIncludes[i])
.alsoPrioAnyOf(remainingWords); .alsoPrio(orderedIncludes[j]);
queryHeads.add(entrySource); queryHeads.add(entrySource);
} }
} }
}
// Next consider entries that appear only once in the priority index // Next consider entries that appear only once in the priority index
for (var wordId : orderedIncludes) { for (var wordId : orderedIncludes) {

View File

@ -5,9 +5,18 @@ import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
public record SearchIndexSearchTerms(IntList includes, IntList excludes, IntList priority) { import java.util.Collections;
import java.util.List;
public record SearchIndexSearchTerms(
IntList includes,
IntList excludes,
IntList priority,
List<IntList> coherences
)
{
public SearchIndexSearchTerms() { public SearchIndexSearchTerms() {
this(IntList.of(), IntList.of(), IntList.of()); this(IntList.of(), IntList.of(), IntList.of(), Collections.emptyList());
} }
public boolean isEmpty() { public boolean isEmpty() {

View File

@ -8,8 +8,10 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.svc.SearchTermsService; import nu.marginalia.index.svc.SearchTermsService;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.ResultValuator; import nu.marginalia.ranking.ResultValuator;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.OptionalInt; import java.util.OptionalInt;
@ -74,7 +76,27 @@ public class IndexMetadataService {
} }
} }
return new QuerySearchTerms(termToId, termIdsList.toIntArray());
return new QuerySearchTerms(termToId,
termIdsList.toIntArray(),
getTermCoherences(searchTermVariants));
}
private TermCoherences getTermCoherences(List<SearchSubquery> searchTermVariants) {
List<int[]> coherences = new ArrayList<>();
for (var subquery : searchTermVariants) {
for (var coh : subquery.searchTermCoherences) {
int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray();
coherences.add(ids);
}
// It's assumed each subquery has identical coherences
break;
}
return new TermCoherences(coherences);
} }
public TLongHashSet getResultsWithPriorityTerms(List<SearchSubquery> subqueries, long[] resultsArray) { public TLongHashSet getResultsWithPriorityTerms(List<SearchSubquery> subqueries, long[] resultsArray) {
@ -116,15 +138,32 @@ public class IndexMetadataService {
return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0); return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0);
} }
/**
 * Tests a document against every coherence set.
 *
 * For each set, the decoded position masks of all its words in the document
 * are ANDed together, starting from a mask with the low 56 bits set.  The
 * document fails as soon as one set ends up with no bits in common.
 *
 * @param docId      the document to test
 * @param coherences the coherence sets, each an array of word ids
 * @return false if any coherence set has an empty overlap, true otherwise
 */
public boolean testCoherence(long docId, TermCoherences coherences) {
    for (int[] termIds : coherences.words()) {
        long sharedPositions = 0xFF_FFFF_FFFF_FFFFL;

        for (int i = 0; i < termIds.length; i++) {
            sharedPositions &= WordMetadata.decodePositions(getTermMetadata(termIds[i], docId));
        }

        if (sharedPositions != 0L) {
            continue;
        }
        return false;
    }

    return true;
}
} }
public static class QuerySearchTerms { public static class QuerySearchTerms {
private final TObjectIntHashMap<String> termToId; private final TObjectIntHashMap<String> termToId;
public final int[] termIdsAll; public final int[] termIdsAll;
public QuerySearchTerms(TObjectIntHashMap<String> termToId, int[] termIdsAll) { public final TermCoherences coherences;
public QuerySearchTerms(TObjectIntHashMap<String> termToId, int[] termIdsAll, TermCoherences coherences) {
this.termToId = termToId; this.termToId = termToId;
this.termIdsAll = termIdsAll; this.termIdsAll = termIdsAll;
this.coherences = coherences;
} }
public int get(String searchTerm) { public int get(String searchTerm) {
@ -132,6 +171,8 @@ public class IndexMetadataService {
} }
} }
public record TermCoherences(List<int[]> words) {}
private static long termdocKey(int termId, long docId) { private static long termdocKey(int termId, long docId) {
return (docId << 32) | termId; return (docId << 32) | termId;
} }

View File

@ -117,10 +117,15 @@ public class IndexResultValuator {
double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext); double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext);
boolean disqualified = false;
if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences))
disqualified = true;
else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0)
disqualified = true;
searchResult.setScore(new SearchResultPreliminaryScore( searchResult.setScore(new SearchResultPreliminaryScore(
anyAllSynthetic, disqualified,
maxFlagsCount,
maxPositionsSet,
hasPriorityTerm, hasPriorityTerm,
score score
)); ));
@ -140,6 +145,7 @@ public class IndexResultValuator {
return false; return false;
} }
} }
return true; return true;
} }

View File

@ -155,6 +155,7 @@ public class IndexQueryService {
outer: outer:
// These queries are various term combinations // These queries are various term combinations
for (var subquery : params.subqueries) { for (var subquery : params.subqueries) {
final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery); final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery);
if (searchTerms.isEmpty()) { if (searchTerms.isEmpty()) {
@ -195,16 +196,20 @@ public class IndexQueryService {
} }
var includes = subquery.searchTermsInclude; var includes = subquery.searchTermsInclude;
var advice = subquery.searchTermsAdvice;
var excludes = subquery.searchTermsExclude; var excludes = subquery.searchTermsExclude;
var priority = subquery.searchTermsPriority; var priority = subquery.searchTermsPriority;
for (int i = 0; i < subquery.searchTermsInclude.size(); i++) { for (int i = 0; i < includes.size(); i++) {
logger.info(queryMarker, "{} -> {} I", includes.get(i), searchTerms.includes().getInt(i)); logger.info(queryMarker, "{} -> {} I", includes.get(i), searchTerms.includes().getInt(i));
} }
for (int i = 0; i < subquery.searchTermsExclude.size(); i++) { for (int i = 0; i < advice.size(); i++) {
logger.info(queryMarker, "{} -> {} A", advice.get(i), searchTerms.includes().getInt(includes.size() + i));
}
for (int i = 0; i < excludes.size(); i++) {
logger.info(queryMarker, "{} -> {} E", excludes.get(i), searchTerms.excludes().getInt(i)); logger.info(queryMarker, "{} -> {} E", excludes.get(i), searchTerms.excludes().getInt(i));
} }
for (int i = 0; i < subquery.searchTermsPriority.size(); i++) { for (int i = 0; i < priority.size(); i++) {
logger.info(queryMarker, "{} -> {} P", priority.get(i), searchTerms.priority().getInt(i)); logger.info(queryMarker, "{} -> {} P", priority.get(i), searchTerms.priority().getInt(i));
} }
} }
@ -247,7 +252,7 @@ public class IndexQueryService {
return Arrays.stream(resultIds.toArray()) return Arrays.stream(resultIds.toArray())
.parallel() .parallel()
.mapToObj(evaluator::calculatePreliminaryScore) .mapToObj(evaluator::calculatePreliminaryScore)
.filter(score -> !score.getScore().isEmpty()) .filter(score -> !score.getScore().isDisqualified())
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -11,10 +11,7 @@ import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.util.HashMap; import java.util.*;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
@Singleton @Singleton
public class SearchTermsService { public class SearchTermsService {
@ -30,34 +27,49 @@ public class SearchTermsService {
final IntList excludes = new IntArrayList(); final IntList excludes = new IntArrayList();
final IntList includes = new IntArrayList(); final IntList includes = new IntArrayList();
final IntList priority = new IntArrayList(); final IntList priority = new IntArrayList();
final List<IntList> coherences = new ArrayList<>();
for (var include : request.searchTermsInclude) { if (!addEachTerm(includes, request.searchTermsInclude)) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + include);
return new SearchIndexSearchTerms(); return new SearchIndexSearchTerms();
} }
includes.add(word.getAsInt());
}
// This looks like a bug, but it's not
for (var advice : request.searchTermsAdvice) { // v--- ----v
var word = lookUpWord(advice); if (!addEachTerm(includes, request.searchTermsAdvice)) {
if (word.isEmpty()) {
logger.debug("Unknown search term: " + advice);
return new SearchIndexSearchTerms(); return new SearchIndexSearchTerms();
} }
includes.add(word.getAsInt());
for (var coherence : request.searchTermCoherences) {
IntList parts = new IntArrayList(coherence.size());
if (!addEachTerm(parts, coherence)) {
return new SearchIndexSearchTerms();
} }
for (var exclude : request.searchTermsExclude) { coherences.add(parts);
lookUpWord(exclude).ifPresent(excludes::add);
}
for (var exclude : request.searchTermsPriority) {
lookUpWord(exclude).ifPresent(priority::add);
} }
return new SearchIndexSearchTerms(includes, excludes, priority); // we don't care if we can't find these:
addEachTerm(excludes, request.searchTermsExclude);
addEachTerm(priority, request.searchTermsPriority);
return new SearchIndexSearchTerms(includes, excludes, priority, coherences);
}
private boolean addEachTerm(IntList ret, List<String> words) {
boolean success = true;
for (var exclude : words) {
var word = lookUpWord(exclude);
if (word.isPresent()) {
lookUpWord(exclude).ifPresent(ret::add);
}
else {
success = false;
}
}
return success;
} }

View File

@ -28,11 +28,9 @@ import spark.Spark;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream; import java.util.stream.IntStream;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@Execution(SAME_THREAD) @Execution(SAME_THREAD)
@ -91,8 +89,8 @@ public class IndexQueryServiceIntegrationTest {
.domains(new ArrayList<>()) .domains(new ArrayList<>())
.searchSetIdentifier(SearchSetIdentifier.NONE) .searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new SearchSubquery( .subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList() List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
))).build()); Collections.emptyList()))).build());
Assertions.assertArrayEquals( Assertions.assertArrayEquals(
new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 }, new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
@ -123,8 +121,8 @@ public class IndexQueryServiceIntegrationTest {
.queryStrategy(QueryStrategy.SENTENCE) .queryStrategy(QueryStrategy.SENTENCE)
.domains(List.of(2)) .domains(List.of(2))
.subqueries(List.of(new SearchSubquery( .subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList() List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
))).build()); Collections.emptyList()))).build());
Assertions.assertArrayEquals( Assertions.assertArrayEquals(
new int[] { 210, 270 }, new int[] { 210, 270 },
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray()); rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
@ -149,8 +147,8 @@ public class IndexQueryServiceIntegrationTest {
.searchSetIdentifier(SearchSetIdentifier.NONE) .searchSetIdentifier(SearchSetIdentifier.NONE)
.rankingParams(ResultRankingParameters.sensibleDefaults()) .rankingParams(ResultRankingParameters.sensibleDefaults())
.subqueries(List.of(new SearchSubquery( .subqueries(List.of(new SearchSubquery(
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList() List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
)) Collections.emptyList()))
).build()); ).build());
@ -167,8 +165,6 @@ public class IndexQueryServiceIntegrationTest {
} }
public void loadData(int id) { public void loadData(int id) {
int[] factors = IntStream int[] factors = IntStream
.rangeClosed(1, id) .rangeClosed(1, id)

View File

@ -83,8 +83,8 @@ public class QueryFactory {
Arrays.asList(termsInclude), Arrays.asList(termsInclude),
Collections.emptyList(), Collections.emptyList(),
Collections.emptyList(), Collections.emptyList(),
Collections.emptyList() Collections.emptyList(),
)); Collections.emptyList()));
var specs = SearchSpecification.builder() var specs = SearchSpecification.builder()
.subqueries(sqs) .subqueries(sqs)

View File

@ -9,17 +9,19 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
/** @see SearchSubquery */
public class QuerySearchTermsAccumulator implements TokenVisitor { public class QuerySearchTermsAccumulator implements TokenVisitor {
public List<String> searchTermsExclude = new ArrayList<>(); public List<String> searchTermsExclude = new ArrayList<>();
public List<String> searchTermsInclude = new ArrayList<>(); public List<String> searchTermsInclude = new ArrayList<>();
public List<String> searchTermsAdvice = new ArrayList<>(); public List<String> searchTermsAdvice = new ArrayList<>();
public List<String> searchTermsPriority = new ArrayList<>(); public List<String> searchTermsPriority = new ArrayList<>();
public List<List<String>> searchTermCoherences = new ArrayList<>();
public String near; public String near;
public String domain; public String domain;
public SearchSubquery createSubquery() { public SearchSubquery createSubquery() {
return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority); return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
} }
public QuerySearchTermsAccumulator(SearchProfile profile, List<Token> parts) { public QuerySearchTermsAccumulator(SearchProfile profile, List<Token> parts) {
@ -45,11 +47,19 @@ public class QuerySearchTermsAccumulator implements TokenVisitor {
public void onQuotTerm(Token token) { public void onQuotTerm(Token token) {
String[] parts = token.str.split("_"); String[] parts = token.str.split("_");
if (parts.length > 1) { if (parts.length > 1) {
// Prefer that the actual n-gram is present
searchTermsAdvice.add(token.str); searchTermsAdvice.add(token.str);
// Require that the terms appear in the same sentence
searchTermCoherences.add(Arrays.asList(parts));
// Require that each term exists in the document
// (needed for ranking)
searchTermsInclude.addAll(Arrays.asList(parts)); searchTermsInclude.addAll(Arrays.asList(parts));
} }
else { else {
searchTermsInclude.add(token.str); searchTermsInclude.add(token.str);
} }
} }

View File

@ -29,7 +29,7 @@ public class LoadTestMain {
for (int i = 0; i < 10000; i++) { for (int i = 0; i < 10000; i++) {
String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted( String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted(
Strings.join(pickNCommonWords(2), '+') Strings.join(pickNCommonWords(4), '+')
); );
HttpRequest req = HttpRequest.newBuilder(new URI(uri)) HttpRequest req = HttpRequest.newBuilder(new URI(uri))