From 8290c19e24123c67b0dbc2e328dfcb5feff0986a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 3 Sep 2024 11:21:01 +0200 Subject: [PATCH] (query-parsing) Drop search term elements that aren't indexed by the search engine --- .../searchquery/query_parser/QueryParser.java | 14 ++++++++++++-- .../util/transform_list/TransformList.java | 8 +++++++- .../java/nu/marginalia/language/WordPatterns.java | 5 ----- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java index 2c5eaed1..f77fd1ba 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java @@ -61,10 +61,20 @@ public class QueryParser { if (str.isBlank()) return; - if (str.endsWith(":") || str.endsWith(".")) { + // Remove trailing punctuation + int lastChar = str.charAt(str.length() - 1); + if (":.,!?$".indexOf(lastChar) >= 0) entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr())); - } + // Remove term elements that aren't indexed by the search engine + if (str.endsWith("()")) + entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr())); + if (str.startsWith("$")) + entity.replace(new QueryToken.LiteralTerm(str.substring(1), lt.displayStr())); + + if (entity.isBlank()) { + entity.remove(); + } } private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { diff --git a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java index 62dd2e0a..a0dc6d7f 100644 --- a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java +++ b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java @@ -1,5 +1,7 @@ package nu.marginalia.util.transform_list; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; + import java.util.List; import java.util.function.BiConsumer; import java.util.function.Consumer; @@ -30,7 +32,7 @@ import java.util.function.Predicate; * * */ -public class TransformList { +public class TransformList { private final List backingList; public TransformList(List backingList) { @@ -138,6 +140,10 @@ public class TransformList { value = newValue; } + public boolean isBlank() { + return value == null || value.str().isBlank(); + } + public void remove() { action = Action.REMOVE; } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java b/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java index 9f137ddc..c0990f22 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java @@ -1,11 +1,6 @@ package nu.marginalia.language; /** Logic for deciding which words are eligible to be keywords. - *

- * This is in dire need of oversight. Here be towering dragons with names, - * a skull next to their HP bar, and their own Mick Gordon soundtrack just - * for the battle. - * */ public class WordPatterns { public static final int MIN_WORD_LENGTH = 1;