Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)
(index) Implement working optional TermCoherences
parent 8ee64c0771
commit 95b9af92a0
@@ -23,6 +23,7 @@ dependencies {
     implementation project(':code:common:config')
     implementation project(':code:common:service')
     implementation project(':code:index:query')
+    implementation project(':code:libraries:language-processing')
 
     implementation libs.bundles.slf4j
 
@@ -86,7 +86,8 @@ public class IndexProtobufCodec {
         for (var coherences : searchQuery.searchTermCoherences) {
             subqueryBuilder.addCoherencesBuilder()
                     .addAllCoherences(coherences.terms())
-                    .setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL);
+                    .setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL)
+                    .build();
         }
 
         return subqueryBuilder.build();
@@ -1,23 +1,71 @@
 package nu.marginalia.api.searchquery.model.query;
 
+import nu.marginalia.language.WordPatterns;
+
+import java.util.ArrayList;
 import java.util.List;
 
 public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
-    public static SearchCoherenceConstraint mandatory(String... terms) {
-        return new SearchCoherenceConstraint(true, List.of(terms));
-    }
-    public static SearchCoherenceConstraint mandatory(List<String> terms) {
-        return new SearchCoherenceConstraint(true, List.copyOf(terms));
-    }
-
-    public static SearchCoherenceConstraint optional(String... terms) {
-        return new SearchCoherenceConstraint(false, List.of(terms));
-    }
-    public static SearchCoherenceConstraint optional(List<String> terms) {
-        return new SearchCoherenceConstraint(false, List.copyOf(terms));
-    }
 
     public int size() {
         return terms.size();
     }
 
+    /** Create a new SearchCoherenceConstraint with the given terms, and the given mandatory flag.
+     * Stop words are replaced with empty strings.
+     */
+    public static SearchCoherenceConstraint mandatory(String... terms) {
+        return new SearchCoherenceConstraint(true, trimStopWords(terms));
+    }
+    /** Create a new SearchCoherenceConstraint with the given terms, and the given mandatory flag.
+     * Stop words are replaced with empty strings.
+     */
+    public static SearchCoherenceConstraint mandatory(List<String> terms) {
+        return new SearchCoherenceConstraint(true, trimStopWords(terms));
+    }
+    /** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
+     * Stop words are replaced with empty strings.
+     */
+    public static SearchCoherenceConstraint optional(String... terms) {
+        return new SearchCoherenceConstraint(false, trimStopWords(terms));
+    }
+    /** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
+     * Stop words are replaced with empty strings.
+     */
+    public static SearchCoherenceConstraint optional(List<String> terms) {
+        return new SearchCoherenceConstraint(false, trimStopWords(terms));
+    }
+
+    private static List<String> trimStopWords(List<String> terms) {
+        List<String> ret = new ArrayList<>(terms.size());
+        for (var term : terms) {
+            if (WordPatterns.isStopWord(term)) {
+                ret.add("");
+            } else {
+                ret.add(term);
+            }
+        }
+        return List.copyOf(ret);
+    }
+
+    private static List<String> trimStopWords(String... terms) {
+        List<String> ret = new ArrayList<>(terms.length);
+        for (var term : terms) {
+            if (WordPatterns.isStopWord(term)) {
+                ret.add("");
+            } else {
+                ret.add(term);
+            }
+        }
+
+        while (!ret.isEmpty() && "".equals(ret.getFirst())) {
+            ret.removeFirst();
+        }
+        while (!ret.isEmpty() && "".equals(ret.getLast())) {
+            ret.removeLast();
+        }
+
+        return List.copyOf(ret);
+    }
+
 }
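A minimal usage sketch of the new factories (not part of the commit): it assumes the class above is on the classpath and that WordPatterns classifies "the" and "of" as stop words. Per the diff, the varargs factories blank stop words and then trim blanked entries off the edges of the phrase, while the List factories only blank them in place, preserving the term positions.

    import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
    import java.util.List;

    class CoherenceConstraintSketch {
        public static void main(String[] args) {
            // Varargs factory: "the" is blanked, then trimmed off the leading edge
            var c1 = SearchCoherenceConstraint.mandatory("the", "red", "car");
            System.out.println(c1.mandatory() + " " + c1.terms()); // true [red, car]  (assumed output)

            // List factory: the interior stop word is blanked but its slot is kept,
            // so the constraint still spans three positions
            var c2 = SearchCoherenceConstraint.optional(List.of("war", "of", "worlds"));
            System.out.println(c2.size() + " " + c2.terms());      // 3 [war, , worlds]  (assumed output)
        }
    }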
@@ -166,6 +166,11 @@ public class QueryExpansion {
             graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
         }
 
+        // also create a segmentation that is just the entire query
+        coherences.add(nodes.stream()
+                .map(QWord::word)
+                .collect(Collectors.toList()));
+
         return coherences;
     }
 
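The hunk above appends one extra coherence covering every word of the query, so the ranking stage can reward documents that contain the whole phrase in order even when the segmentation step found no smaller spans. A rough illustration of the resulting list for a hypothetical three-word query where segmentation detected one two-word span (names and values are illustrative, not taken from the commit):

    import java.util.List;

    class WholeQueryCoherenceSketch {
        public static void main(String[] args) {
            // e.g. query "plato republic summary", with "plato republic" found as a segment
            List<List<String>> coherences = List.of(
                    List.of("plato", "republic"),            // from a detected segment
                    List.of("plato", "republic", "summary")  // the whole-query entry added above
            );
            coherences.forEach(System.out::println);
        }
    }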
@@ -75,23 +75,18 @@ public class QueryFactory {
 
         String[] parts = StringUtils.split(str, '_');
 
-        // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
-        // required in the query (which is a problem because they are not indexed). How to do this
-        // in a clean way is a bit of an open problem that may not get resolved until query-parsing is
-        // improved.
-
-        if (parts.length > 1 && !anyPartIsStopWord(parts)) {
-            // Prefer that the actual n-gram is present
-            searchTermsAdvice.add(str);
-
-            // Require that the terms appear in the same sentence
+        if (parts.length > 1) {
+            // Require that the terms appear in sequence
             searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));
 
-            // Require that each term exists in the document
-            // (needed for ranking)
+            // Construct a regular query from the parts in the quoted string
             searchTermsInclude.addAll(Arrays.asList(parts));
+
+            // Prefer that the actual n-gram is present
+            searchTermsPriority.add(str);
         }
         else {
             // If the quoted word is a single word, we don't need to do more than include it in the search
             searchTermsInclude.add(str);
         }
     }
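The rewritten branch above no longer rejects quoted phrases that contain stop words; it always records a mandatory in-sequence coherence, includes every part as an ordinary term, and demotes the joined n-gram from a required term to a preferred one. A standalone sketch of that flow, with plain lists standing in for QueryFactory's fields and a made-up input string (the commit itself uses Commons Lang StringUtils.split rather than String.split):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    class QuotedPhraseSketch {
        public static void main(String[] args) {
            String str = "success_often_consists_of";
            String[] parts = str.split("_");

            List<String[]> searchTermCoherences = new ArrayList<>();
            List<String> searchTermsInclude = new ArrayList<>();
            List<String> searchTermsPriority = new ArrayList<>();

            if (parts.length > 1) {
                // the real code wraps this in SearchCoherenceConstraint.mandatory(parts),
                // which also blanks stop words such as "of"
                searchTermCoherences.add(parts);
                // every part becomes an ordinary include term
                searchTermsInclude.addAll(Arrays.asList(parts));
                // the joined n-gram is only preferred, no longer required
                searchTermsPriority.add(str);
            }
            else {
                searchTermsInclude.add(str);
            }

            System.out.println(searchTermsInclude);  // [success, often, consists, of]
            System.out.println(searchTermsPriority); // [success_often_consists_of]
        }
    }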
@@ -229,4 +229,12 @@ public class QueryFactoryTest {
         System.out.println("Time: " + (System.currentTimeMillis() - start));
         System.out.println(subquery);
     }
+
+    @Test
+    public void testExpansion8() {
+        long start = System.currentTimeMillis();
+        var subquery = parseAndGetSpecs("success often consists of");
+        System.out.println("Time: " + (System.currentTimeMillis() - start));
+        System.out.println(subquery);
+    }
 }
@@ -74,6 +74,8 @@ public class IndexResultValuationContext {
         int htmlFeatures = index.getHtmlFeatures(docId);
         int docSize = index.getDocumentSize(docId);
 
+        int bestCoherence = searchTerms.coherences.testOptional(positions);
+
         double score = searchResultValuator.calculateSearchResultValue(
                 wordFlagsQuery,
                 positionsCountQuery,
@@ -81,8 +83,8 @@
                 docMetadata,
                 htmlFeatures,
                 docSize,
-                rankingContext,
-                null);
+                bestCoherence,
+                rankingContext, null);
 
         SearchResultItem searchResult = new SearchResultItem(docId,
                 docMetadata,
@@ -40,6 +40,7 @@ public class ResultValuator {
                                CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
                                int features,
                                int length,
+                               int bestCoherence,
                                ResultRankingContext ctx,
                                @Nullable Consumer<ResultRankingDetails> detailsConsumer
                                )
@@ -83,7 +84,8 @@ public class ResultValuator {
                 + rankingBonus
                 + topologyBonus
                 + temporalBias
-                + flagsPenalty;
+                + flagsPenalty
+                + bestCoherence;
 
         // FIXME: need a weighting factor here
         double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);
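The coherence result is folded into the score as a plain additive bonus; as the FIXME above notes, there is no weighting factor yet, so whatever testOptional returns moves the score one-for-one. A toy illustration with made-up values (the actual semantics and scale of testOptional are not shown in this excerpt):

    class CoherenceBonusSketch {
        public static void main(String[] args) {
            // entirely made-up bonus values, chosen only to show the arithmetic
            double rankingBonus = 1.5, topologyBonus = 0.5, temporalBias = 0.25, flagsPenalty = -2.0;
            int bestCoherence = 3; // hypothetical value from testOptional

            double partialScore = rankingBonus + topologyBonus + temporalBias
                    + flagsPenalty
                    + bestCoherence;

            System.out.println(partialScore); // 3.25 -- the coherence term is currently unweighted
        }
    }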