mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Implement working optional TermCoherences
This commit is contained in:
parent
8ee64c0771
commit
95b9af92a0
@ -23,6 +23,7 @@ dependencies {
|
|||||||
implementation project(':code:common:config')
|
implementation project(':code:common:config')
|
||||||
implementation project(':code:common:service')
|
implementation project(':code:common:service')
|
||||||
implementation project(':code:index:query')
|
implementation project(':code:index:query')
|
||||||
|
implementation project(':code:libraries:language-processing')
|
||||||
|
|
||||||
implementation libs.bundles.slf4j
|
implementation libs.bundles.slf4j
|
||||||
|
|
||||||
|
@ -86,7 +86,8 @@ public class IndexProtobufCodec {
|
|||||||
for (var coherences : searchQuery.searchTermCoherences) {
|
for (var coherences : searchQuery.searchTermCoherences) {
|
||||||
subqueryBuilder.addCoherencesBuilder()
|
subqueryBuilder.addCoherencesBuilder()
|
||||||
.addAllCoherences(coherences.terms())
|
.addAllCoherences(coherences.terms())
|
||||||
.setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL);
|
.setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL)
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
return subqueryBuilder.build();
|
return subqueryBuilder.build();
|
||||||
|
@ -1,23 +1,71 @@
|
|||||||
package nu.marginalia.api.searchquery.model.query;
|
package nu.marginalia.api.searchquery.model.query;
|
||||||
|
|
||||||
|
import nu.marginalia.language.WordPatterns;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
|
public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
|
||||||
public static SearchCoherenceConstraint mandatory(String... terms) {
|
|
||||||
return new SearchCoherenceConstraint(true, List.of(terms));
|
|
||||||
}
|
|
||||||
public static SearchCoherenceConstraint mandatory(List<String> terms) {
|
|
||||||
return new SearchCoherenceConstraint(true, List.copyOf(terms));
|
|
||||||
}
|
|
||||||
|
|
||||||
public static SearchCoherenceConstraint optional(String... terms) {
|
|
||||||
return new SearchCoherenceConstraint(false, List.of(terms));
|
|
||||||
}
|
|
||||||
public static SearchCoherenceConstraint optional(List<String> terms) {
|
|
||||||
return new SearchCoherenceConstraint(false, List.copyOf(terms));
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
public int size() {
|
||||||
return terms.size();
|
return terms.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Create a new SearchCoherenceConstraint with the given terms, and the given mandatory flag.
|
||||||
|
* Stop words are replaced with empty strings.
|
||||||
|
*/
|
||||||
|
public static SearchCoherenceConstraint mandatory(String... terms) {
|
||||||
|
return new SearchCoherenceConstraint(true, trimStopWords(terms));
|
||||||
|
}
|
||||||
|
/** Create a new SearchCoherenceConstraint with the given terms, and the given mandatory flag.
|
||||||
|
* Stop words are replaced with empty strings.
|
||||||
|
*/
|
||||||
|
public static SearchCoherenceConstraint mandatory(List<String> terms) {
|
||||||
|
return new SearchCoherenceConstraint(true, trimStopWords(terms));
|
||||||
|
}
|
||||||
|
/** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
|
||||||
|
* Stop words are replaced with empty strings.
|
||||||
|
*/
|
||||||
|
public static SearchCoherenceConstraint optional(String... terms) {
|
||||||
|
return new SearchCoherenceConstraint(false, trimStopWords(terms));
|
||||||
|
}
|
||||||
|
/** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
|
||||||
|
* Stop words are replaced with empty strings.
|
||||||
|
*/
|
||||||
|
public static SearchCoherenceConstraint optional(List<String> terms) {
|
||||||
|
return new SearchCoherenceConstraint(false, trimStopWords(terms));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<String> trimStopWords(List<String> terms) {
|
||||||
|
List<String> ret = new ArrayList<>(terms.size());
|
||||||
|
for (var term : terms) {
|
||||||
|
if (WordPatterns.isStopWord(term)) {
|
||||||
|
ret.add("");
|
||||||
|
} else {
|
||||||
|
ret.add(term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return List.copyOf(ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<String> trimStopWords(String... terms) {
|
||||||
|
List<String> ret = new ArrayList<>(terms.length);
|
||||||
|
for (var term : terms) {
|
||||||
|
if (WordPatterns.isStopWord(term)) {
|
||||||
|
ret.add("");
|
||||||
|
} else {
|
||||||
|
ret.add(term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (!ret.isEmpty() && "".equals(ret.getFirst())) {
|
||||||
|
ret.removeFirst();
|
||||||
|
}
|
||||||
|
while (!ret.isEmpty() && "".equals(ret.getLast())) {
|
||||||
|
ret.removeLast();
|
||||||
|
}
|
||||||
|
|
||||||
|
return List.copyOf(ret);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -166,6 +166,11 @@ public class QueryExpansion {
|
|||||||
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
|
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// also create a segmentation that is just the entire query
|
||||||
|
coherences.add(nodes.stream()
|
||||||
|
.map(QWord::word)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
return coherences;
|
return coherences;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,23 +75,18 @@ public class QueryFactory {
|
|||||||
|
|
||||||
String[] parts = StringUtils.split(str, '_');
|
String[] parts = StringUtils.split(str, '_');
|
||||||
|
|
||||||
// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
|
if (parts.length > 1) {
|
||||||
// required in the query (which is a problem because they are not indexed). How to do this
|
// Require that the terms appear in sequence
|
||||||
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
|
|
||||||
// improved.
|
|
||||||
|
|
||||||
if (parts.length > 1 && !anyPartIsStopWord(parts)) {
|
|
||||||
// Prefer that the actual n-gram is present
|
|
||||||
searchTermsAdvice.add(str);
|
|
||||||
|
|
||||||
// Require that the terms appear in the same sentence
|
|
||||||
searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));
|
searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));
|
||||||
|
|
||||||
// Require that each term exists in the document
|
// Construct a regular query from the parts in the quoted string
|
||||||
// (needed for ranking)
|
|
||||||
searchTermsInclude.addAll(Arrays.asList(parts));
|
searchTermsInclude.addAll(Arrays.asList(parts));
|
||||||
|
|
||||||
|
// Prefer that the actual n-gram is present
|
||||||
|
searchTermsPriority.add(str);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
// If the quoted word is a single word, we don't need to do more than include it in the search
|
||||||
searchTermsInclude.add(str);
|
searchTermsInclude.add(str);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -229,4 +229,12 @@ public class QueryFactoryTest {
|
|||||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||||
System.out.println(subquery);
|
System.out.println(subquery);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExpansion8() {
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
var subquery = parseAndGetSpecs("success often consists of");
|
||||||
|
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||||
|
System.out.println(subquery);
|
||||||
|
}
|
||||||
}
|
}
|
@ -74,6 +74,8 @@ public class IndexResultValuationContext {
|
|||||||
int htmlFeatures = index.getHtmlFeatures(docId);
|
int htmlFeatures = index.getHtmlFeatures(docId);
|
||||||
int docSize = index.getDocumentSize(docId);
|
int docSize = index.getDocumentSize(docId);
|
||||||
|
|
||||||
|
int bestCoherence = searchTerms.coherences.testOptional(positions);
|
||||||
|
|
||||||
double score = searchResultValuator.calculateSearchResultValue(
|
double score = searchResultValuator.calculateSearchResultValue(
|
||||||
wordFlagsQuery,
|
wordFlagsQuery,
|
||||||
positionsCountQuery,
|
positionsCountQuery,
|
||||||
@ -81,8 +83,8 @@ public class IndexResultValuationContext {
|
|||||||
docMetadata,
|
docMetadata,
|
||||||
htmlFeatures,
|
htmlFeatures,
|
||||||
docSize,
|
docSize,
|
||||||
rankingContext,
|
bestCoherence,
|
||||||
null);
|
rankingContext, null);
|
||||||
|
|
||||||
SearchResultItem searchResult = new SearchResultItem(docId,
|
SearchResultItem searchResult = new SearchResultItem(docId,
|
||||||
docMetadata,
|
docMetadata,
|
||||||
|
@ -40,6 +40,7 @@ public class ResultValuator {
|
|||||||
CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
|
CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
|
||||||
int features,
|
int features,
|
||||||
int length,
|
int length,
|
||||||
|
int bestCoherence,
|
||||||
ResultRankingContext ctx,
|
ResultRankingContext ctx,
|
||||||
@Nullable Consumer<ResultRankingDetails> detailsConsumer
|
@Nullable Consumer<ResultRankingDetails> detailsConsumer
|
||||||
)
|
)
|
||||||
@ -83,7 +84,8 @@ public class ResultValuator {
|
|||||||
+ rankingBonus
|
+ rankingBonus
|
||||||
+ topologyBonus
|
+ topologyBonus
|
||||||
+ temporalBias
|
+ temporalBias
|
||||||
+ flagsPenalty;
|
+ flagsPenalty
|
||||||
|
+ bestCoherence;
|
||||||
|
|
||||||
// FIXME: need a weighting factor here
|
// FIXME: need a weighting factor here
|
||||||
double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);
|
double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);
|
||||||
|
Loading…
Reference in New Issue
Block a user