diff --git a/code/functions/search-query/api/build.gradle b/code/functions/search-query/api/build.gradle
index b85497cc..a589f52f 100644
--- a/code/functions/search-query/api/build.gradle
+++ b/code/functions/search-query/api/build.gradle
@@ -23,6 +23,7 @@ dependencies {
     implementation project(':code:common:config')
     implementation project(':code:common:service')
     implementation project(':code:index:query')
+    implementation project(':code:libraries:language-processing')
 
     implementation libs.bundles.slf4j
 
diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java
index 099dc573..3a57cfe6 100644
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java
@@ -86,7 +86,8 @@ public class IndexProtobufCodec {
         for (var coherences : searchQuery.searchTermCoherences) {
             subqueryBuilder.addCoherencesBuilder()
                     .addAllCoherences(coherences.terms())
-                    .setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL);
+                    .setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL)
+                    .build();
         }
 
         return subqueryBuilder.build();
diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchCoherenceConstraint.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchCoherenceConstraint.java
index 0089cc3a..ce1e2e55 100644
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchCoherenceConstraint.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchCoherenceConstraint.java
@@ -1,23 +1,74 @@
 package nu.marginalia.api.searchquery.model.query;
 
+import nu.marginalia.language.WordPatterns;
+
+import java.util.ArrayList;
 import java.util.List;
 
 public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
-    public static SearchCoherenceConstraint mandatory(String... terms) {
-        return new SearchCoherenceConstraint(true, List.of(terms));
-    }
-    public static SearchCoherenceConstraint mandatory(List<String> terms) {
-        return new SearchCoherenceConstraint(true, List.copyOf(terms));
-    }
-
-    public static SearchCoherenceConstraint optional(String... terms) {
-        return new SearchCoherenceConstraint(false, List.of(terms));
-    }
-    public static SearchCoherenceConstraint optional(List<String> terms) {
-        return new SearchCoherenceConstraint(false, List.copyOf(terms));
-    }
 
     public int size() {
         return terms.size();
     }
+
+    /** Create a new SearchCoherenceConstraint with the given terms and the mandatory flag set.
+     * Stop words are replaced with empty strings, and empty strings at the
+     * start or end of the sequence are dropped.
+     */
+    public static SearchCoherenceConstraint mandatory(String... terms) {
+        return new SearchCoherenceConstraint(true, trimStopWords(terms));
+    }
+    /** Create a new SearchCoherenceConstraint with the given terms and the mandatory flag set.
+     * Stop words are replaced with empty strings, and empty strings at the
+     * start or end of the sequence are dropped.
+     */
+    public static SearchCoherenceConstraint mandatory(List<String> terms) {
+        return new SearchCoherenceConstraint(true, trimStopWords(terms));
+    }
+    /** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
+     * Stop words are replaced with empty strings, and empty strings at the
+     * start or end of the sequence are dropped.
+     */
+    public static SearchCoherenceConstraint optional(String... terms) {
+        return new SearchCoherenceConstraint(false, trimStopWords(terms));
+    }
+    /** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
+     * Stop words are replaced with empty strings, and empty strings at the
+     * start or end of the sequence are dropped.
+     */
+    public static SearchCoherenceConstraint optional(List<String> terms) {
+        return new SearchCoherenceConstraint(false, trimStopWords(terms));
+    }
+
+    /** Replace each term that WordPatterns.isStopWord() accepts with an empty string,
+     * so the other terms keep their offsets relative to each other, then drop the
+     * empty strings at the start and end.  The result is an unmodifiable copy.
+     */
+    private static List<String> trimStopWords(List<String> terms) {
+        List<String> ret = new ArrayList<>(terms.size());
+        for (var term : terms) {
+            if (WordPatterns.isStopWord(term)) {
+                ret.add("");
+            } else {
+                ret.add(term);
+            }
+        }
+
+        while (!ret.isEmpty() && "".equals(ret.getFirst())) {
+            ret.removeFirst();
+        }
+        while (!ret.isEmpty() && "".equals(ret.getLast())) {
+            ret.removeLast();
+        }
+
+        return List.copyOf(ret);
+    }
+
+    /** Varargs form of trimStopWords; delegates to the list form so that both
+     * overloads produce the same result for the same sequence of terms.
+     */
+    private static List<String> trimStopWords(String... terms) {
+        return trimStopWords(List.of(terms));
+    }
+
 }
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
index 2af0b586..5287c7d3 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
@@ -166,6 +166,11 @@ public class QueryExpansion {
             graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
         }
 
+        // also create a segmentation that is just the entire query
+        coherences.add(nodes.stream()
+                .map(QWord::word)
+                .collect(Collectors.toList()));
+
         return coherences;
     }
 
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java
index 4b3e02dc..400ba998 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java
@@ -75,23 +75,18 @@ public class QueryFactory {
 
                 String[] parts = StringUtils.split(str, '_');
 
-                // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
-                // required in the query (which is a problem because they are not indexed). How to do this
-                // in a clean way is a bit of an open problem that may not get resolved until query-parsing is
-                // improved.
-
-                if (parts.length > 1 && !anyPartIsStopWord(parts)) {
-                    // Prefer that the actual n-gram is present
-                    searchTermsAdvice.add(str);
-
-                    // Require that the terms appear in the same sentence
+                if (parts.length > 1) {
+                    // Require that the terms appear in sequence
                     searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));
 
-                    // Require that each term exists in the document
-                    // (needed for ranking)
+                    // Construct a regular query from the parts in the quoted string
                     searchTermsInclude.addAll(Arrays.asList(parts));
+
+                    // Prefer that the actual n-gram is present
+                    searchTermsPriority.add(str);
                 }
                 else {
+                    // If the quoted word is a single word, we don't need to do more than include it in the search
                     searchTermsInclude.add(str);
                 }
             }
diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
index a6698dc7..88562307 100644
--- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
+++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
@@ -229,4 +229,12 @@ public class QueryFactoryTest {
         System.out.println("Time: " + (System.currentTimeMillis() - start));
         System.out.println(subquery);
     }
+
+    @Test
+    public void testExpansion8() {
+        long start = System.currentTimeMillis();
+        var subquery = parseAndGetSpecs("success often consists of");
+        System.out.println("Time: " + (System.currentTimeMillis() - start));
+        System.out.println(subquery);
+    }
 }
\ No newline at end of file
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
index f886dc42..2facf59f 100644
--- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
+++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
@@ -74,6 +74,8 @@ public class IndexResultValuationContext {
             int htmlFeatures = index.getHtmlFeatures(docId);
             int docSize = index.getDocumentSize(docId);
 
+            int bestCoherence = searchTerms.coherences.testOptional(positions);
+
             double score = searchResultValuator.calculateSearchResultValue(
                     wordFlagsQuery,
                     positionsCountQuery,
@@ -81,8 +83,8 @@ public class IndexResultValuationContext {
                     docMetadata,
                     htmlFeatures,
                     docSize,
-                    rankingContext,
-                    null);
+                    bestCoherence,
+                    rankingContext, null);
 
             SearchResultItem searchResult = new SearchResultItem(docId,
                     docMetadata,
diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
index ae84a11e..379a1d9d 100644
--- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
+++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
@@ -40,6 +40,7 @@ public class ResultValuator {
                                              CompiledQueryInt positionsCountQuery, CompiledQuery positionsQuery, long documentMetadata,
                                              int features,
                                              int length,
+                                             int bestCoherence,
                                              ResultRankingContext ctx,
                                              @Nullable Consumer detailsConsumer
                                              )
@@ -83,7 +84,8 @@ public class ResultValuator {
                 + rankingBonus
                 + topologyBonus
                 + temporalBias
-                + flagsPenalty;
+                + flagsPenalty
+                + bestCoherence; // FIXME: need a weighting factor here
 
         double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);