(index) Implement working optional TermCoherences

This commit is contained in:
Viktor Lofgren 2024-06-26 12:22:06 +02:00
parent 8ee64c0771
commit 95b9af92a0
8 changed files with 91 additions and 29 deletions

View File

@ -23,6 +23,7 @@ dependencies {
implementation project(':code:common:config')
implementation project(':code:common:service')
implementation project(':code:index:query')
implementation project(':code:libraries:language-processing')
implementation libs.bundles.slf4j

View File

@ -86,7 +86,8 @@ public class IndexProtobufCodec {
for (var coherences : searchQuery.searchTermCoherences) {
subqueryBuilder.addCoherencesBuilder()
.addAllCoherences(coherences.terms())
.setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL);
.setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL)
.build();
}
return subqueryBuilder.build();

View File

@ -1,23 +1,71 @@
package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.language.WordPatterns;
import java.util.ArrayList;
import java.util.List;
public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
/**
 * Builds a mandatory coherence constraint from a varargs sequence of terms.
 *
 * @param terms the terms, in order; they are stored as an unmodifiable list
 * @return a constraint whose {@code mandatory} flag is {@code true}
 */
public static SearchCoherenceConstraint mandatory(String... terms) {
    final List<String> termList = List.of(terms);
    return new SearchCoherenceConstraint(true, termList);
}
/**
 * Builds a mandatory coherence constraint from a list of terms.
 *
 * @param terms the terms, in order; an unmodifiable copy is stored
 * @return a constraint whose {@code mandatory} flag is {@code true}
 */
public static SearchCoherenceConstraint mandatory(List<String> terms) {
    final List<String> snapshot = List.copyOf(terms);
    return new SearchCoherenceConstraint(true, snapshot);
}
/**
 * Builds an optional (non-mandatory) coherence constraint from a varargs sequence of terms.
 *
 * @param terms the terms, in order; they are stored as an unmodifiable list
 * @return a constraint whose {@code mandatory} flag is {@code false}
 */
public static SearchCoherenceConstraint optional(String... terms) {
    final List<String> termList = List.of(terms);
    return new SearchCoherenceConstraint(false, termList);
}
/**
 * Builds an optional (non-mandatory) coherence constraint from a list of terms.
 *
 * @param terms the terms, in order; an unmodifiable copy is stored
 * @return a constraint whose {@code mandatory} flag is {@code false}
 */
public static SearchCoherenceConstraint optional(List<String> terms) {
    final List<String> snapshot = List.copyOf(terms);
    return new SearchCoherenceConstraint(false, snapshot);
}
/**
 * Reports how many term slots this constraint holds.
 *
 * @return the number of entries in {@code terms}
 */
public int size() {
    final List<String> held = terms;
    return held.size();
}
/**
 * Creates a mandatory SearchCoherenceConstraint from a varargs sequence of terms.
 * <p>
 * The terms are first passed through {@code trimStopWords(String...)}: stop words
 * are replaced with empty strings, and empty entries at the start and end of the
 * sequence are dropped.
 *
 * @param terms the terms, in order
 * @return a constraint whose {@code mandatory} flag is {@code true}
 */
public static SearchCoherenceConstraint mandatory(String... terms) {
    final List<String> cleaned = trimStopWords(terms);
    return new SearchCoherenceConstraint(true, cleaned);
}
/**
 * Creates a mandatory SearchCoherenceConstraint from a list of terms.
 * <p>
 * The terms are first passed through {@code trimStopWords(List)}, which
 * substitutes an empty string for every stop word.
 *
 * @param terms the terms, in order
 * @return a constraint whose {@code mandatory} flag is {@code true}
 */
public static SearchCoherenceConstraint mandatory(List<String> terms) {
    final List<String> cleaned = trimStopWords(terms);
    return new SearchCoherenceConstraint(true, cleaned);
}
/**
 * Creates an optional (non-mandatory) SearchCoherenceConstraint from a varargs
 * sequence of terms.
 * <p>
 * The terms are first passed through {@code trimStopWords(String...)}: stop words
 * are replaced with empty strings, and empty entries at the start and end of the
 * sequence are dropped.
 *
 * @param terms the terms, in order
 * @return a constraint whose {@code mandatory} flag is {@code false}
 */
public static SearchCoherenceConstraint optional(String... terms) {
    final List<String> cleaned = trimStopWords(terms);
    return new SearchCoherenceConstraint(false, cleaned);
}
/**
 * Creates an optional (non-mandatory) SearchCoherenceConstraint from a list of terms.
 * <p>
 * The terms are first passed through {@code trimStopWords(List)}, which
 * substitutes an empty string for every stop word.
 *
 * @param terms the terms, in order
 * @return a constraint whose {@code mandatory} flag is {@code false}
 */
public static SearchCoherenceConstraint optional(List<String> terms) {
    final List<String> cleaned = trimStopWords(terms);
    return new SearchCoherenceConstraint(false, cleaned);
}
/**
 * Blanks out stop words in a list of terms and strips the resulting blanks
 * from both ends of the sequence.
 * <p>
 * Delegates to the {@code String...} overload so that constraints built from a
 * list and constraints built from varargs are normalised the same way. Before
 * this change, only the varargs overload removed leading and trailing blanks.
 * Blanks in the interior are kept.
 *
 * @param terms the terms, in order
 * @return an unmodifiable list of the normalised terms
 */
private static List<String> trimStopWords(List<String> terms) {
    // A String[] argument binds to the String... overload, so this does not recurse.
    return trimStopWords(terms.toArray(String[]::new));
}
/**
 * Blanks out stop words in a varargs sequence of terms and strips the resulting
 * blanks from both ends.
 * <p>
 * Each term for which {@code WordPatterns.isStopWord} returns true is replaced by
 * an empty string. Empty entries at the start and at the end are then dropped;
 * empty entries in the interior are kept.
 *
 * @param terms the terms, in order
 * @return an unmodifiable list of the normalised terms
 */
private static List<String> trimStopWords(String... terms) {
    final List<String> blanked = new ArrayList<>(terms.length);
    for (String candidate : terms) {
        blanked.add(WordPatterns.isStopWord(candidate) ? "" : candidate);
    }

    // Narrow a [from, to) window past the blank entries at either end.
    int from = 0;
    int to = blanked.size();
    while (from < to && "".equals(blanked.get(from))) {
        from++;
    }
    while (to > from && "".equals(blanked.get(to - 1))) {
        to--;
    }

    return List.copyOf(blanked.subList(from, to));
}
}

View File

@ -166,6 +166,11 @@ public class QueryExpansion {
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
}
// also create a segmentation that is just the entire query
coherences.add(nodes.stream()
.map(QWord::word)
.collect(Collectors.toList()));
return coherences;
}

View File

@ -75,23 +75,18 @@ public class QueryFactory {
String[] parts = StringUtils.split(str, '_');
// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
// required in the query (which is a problem because they are not indexed). How to do this
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
// improved.
if (parts.length > 1 && !anyPartIsStopWord(parts)) {
// Prefer that the actual n-gram is present
searchTermsAdvice.add(str);
// Require that the terms appear in the same sentence
if (parts.length > 1) {
// Require that the terms appear in sequence
searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));
// Require that each term exists in the document
// (needed for ranking)
// Construct a regular query from the parts in the quoted string
searchTermsInclude.addAll(Arrays.asList(parts));
// Prefer that the actual n-gram is present
searchTermsPriority.add(str);
}
else {
// If the quoted word is a single word, we don't need to do more than include it in the search
searchTermsInclude.add(str);
}
}

View File

@ -229,4 +229,12 @@ public class QueryFactoryTest {
System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery);
}
/**
 * Smoke test: parses a query ending in the word "of" and prints the elapsed
 * time and the resulting specs. Makes no assertions.
 */
@Test
public void testExpansion8() {
    final long startedAt = System.currentTimeMillis();
    var specs = parseAndGetSpecs("success often consists of");
    final long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println("Time: " + elapsedMillis);
    System.out.println(specs);
}
}

View File

@ -74,6 +74,8 @@ public class IndexResultValuationContext {
int htmlFeatures = index.getHtmlFeatures(docId);
int docSize = index.getDocumentSize(docId);
int bestCoherence = searchTerms.coherences.testOptional(positions);
double score = searchResultValuator.calculateSearchResultValue(
wordFlagsQuery,
positionsCountQuery,
@ -81,8 +83,8 @@ public class IndexResultValuationContext {
docMetadata,
htmlFeatures,
docSize,
rankingContext,
null);
bestCoherence,
rankingContext, null);
SearchResultItem searchResult = new SearchResultItem(docId,
docMetadata,

View File

@ -40,6 +40,7 @@ public class ResultValuator {
CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
int features,
int length,
int bestCoherence,
ResultRankingContext ctx,
@Nullable Consumer<ResultRankingDetails> detailsConsumer
)
@ -83,7 +84,8 @@ public class ResultValuator {
+ rankingBonus
+ topologyBonus
+ temporalBias
+ flagsPenalty;
+ flagsPenalty
+ bestCoherence;
// FIXME: need a weighting factor here
double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);