diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java
index 5c726644..2bb46f9f 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java
@@ -3,16 +3,17 @@ package nu.marginalia.functions.searchquery.query_parser;
 import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.language.WordPatterns;
+import nu.marginalia.language.encoding.AsciiFlattener;
 import nu.marginalia.util.transform_list.TransformList;
 
+import java.util.ArrayList;
 import java.util.List;
+import java.util.regex.Pattern;
 
 public class QueryParser {
 
-    private final QueryTokenizer tokenizer = new QueryTokenizer();
-
     public List parse(String query) {
-        List basicTokens = tokenizer.tokenizeQuery(query);
+        List basicTokens = tokenizeQuery(query);
 
         TransformList list = new TransformList<>(basicTokens);
 
@@ -27,6 +28,112 @@ public class QueryParser {
         return list.getBackingList();
     }
 
+    private static final Pattern noisePattern = Pattern.compile("[,\\s]");
+
+    /**
+     * Splits a raw query string into basic query tokens.
+     *
+     * The query is passed through AsciiFlattener.flattenUnicode, then commas and
+     * whitespace are replaced with plain spaces.  At the start of a token, an
+     * unescaped parenthesis, double quote, minus or question mark yields its own
+     * token type; any other non-space character starts a literal term that runs
+     * to the next space or to a closing parenthesis that ends the term.
+     *
+     * @param rawQuery the query string to tokenize
+     * @return the tokens, in the order they appear in the query
+     */
+    public List tokenizeQuery(String rawQuery) {
+        List tokens = new ArrayList<>();
+
+        String query = AsciiFlattener.flattenUnicode(rawQuery);
+        query = noisePattern.matcher(query).replaceAll(" ");
+
+        int chr = -1;
+        int prevChr = -1;
+        for (int i = 0; i < query.length(); i++) {
+            prevChr = chr;
+            chr = query.charAt(i);
+
+            boolean escape = prevChr == '\\';
+
+            if (!escape && '(' == chr) {
+                tokens.add(new QueryToken.LParen());
+            }
+            else if (!escape && ')' == chr) {
+                tokens.add(new QueryToken.RParen());
+            }
+            else if (!escape && '"' == chr) {
+                int end = query.indexOf('"', i+1);
+
+                if (end == -1) {
+                    end = query.length();
+                }
+
+                tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase()));
+
+                i = end;
+            }
+            else if (!escape && '-' == chr) {
+                tokens.add(new QueryToken.Minus());
+            }
+            else if (!escape && '?' == chr) {
+                tokens.add(new QueryToken.QMark());
+            }
+            else if (!Character.isSpaceChar(chr)) {
+
+                int end = i+1;
+                for (; end < query.length(); end++) {
+                    char c = query.charAt(end);
+                    char before = query.charAt(end - 1);
+
+                    if (c == ' ')
+                        break;
+
+                    // A ')' ends the term unless it is escaped or closes an empty
+                    // "()" pair, as in queries like "strlen()"
+                    if (c == ')' && before != '(' && before != '\\')
+                        break;
+                }
+
+                String displayStr = query.substring(i, end);
+                String str = trimEscape(displayStr.toLowerCase());
+
+                tokens.add(new QueryToken.LiteralTerm(str, displayStr));
+
+                i = end-1;
+
+                // Track the last consumed character, so the next iteration sees
+                // the real previous character rather than the term's first one
+                chr = query.charAt(i);
+            }
+        }
+        return tokens;
+    }
+
+    /**
+     * Removes backslash escapes from a term: each backslash is dropped and the
+     * character following it is kept as-is.  A trailing backslash is dropped.
+     */
+    private String trimEscape(String str) {
+        if (!str.contains("\\")) {
+            return str;
+        }
+
+        StringBuilder sb = new StringBuilder(str.length());
+        for (int j = 0; j < str.length(); j++) {
+            char c = str.charAt(j);
+            if (c == '\\') {
+                if (j + 1 < str.length()) {
+                    sb.append(str.charAt(j + 1));
+                    j++;
+                }
+            } else {
+                sb.append(c);
+            }
+        }
+        return sb.toString();
+    }
+
     private static void normalizeDomainName(TransformList.Entity entity) {
         var t = entity.value();
 
@@ -63,10 +170,12 @@ public class QueryParser {
 
         // Remove trailing punctuation
         int lastChar = str.charAt(str.length() - 1);
-        if (":.,!?$".indexOf(lastChar) >= 0)
+        if (":.,!?$'".indexOf(lastChar) >= 0)
             entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr()));
 
         // Remove term elements that aren't indexed by the search engine
+        if (str.endsWith("'s"))
+            entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
         if (str.endsWith("()"))
             entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr()));
         if (str.startsWith("$"))
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java
deleted file mode 100644
index 79179524..00000000
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java
+++ /dev/null
@@ -1,91 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser;
-
-import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
-import nu.marginalia.language.encoding.AsciiFlattener;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Objects;
-import java.util.regex.Pattern;
-
-public class QueryTokenizer {
-    private static final Pattern noisePattern = Pattern.compile("[,\\s]");
-
-    public List tokenizeQuery(String rawQuery) {
-        List tokens = new ArrayList<>();
-
-        String query = AsciiFlattener.flattenUnicode(rawQuery);
-        query = noisePattern.matcher(query).replaceAll(" ");
-
-        for (int i = 0; i < query.length(); i++) {
-            int chr = query.charAt(i);
-
-            if ('(' == chr) {
-                tokens.add(new QueryToken.LParen());
-            }
-            else if (')' == chr) {
-                tokens.add(new QueryToken.RParen());
-            }
-            else if ('"' == chr) {
-                int end = query.indexOf('"', i+1);
-
-                if (end == -1) {
-                    end = query.length();
-                }
-
-                tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase()));
-
-                i = end;
-            }
-            else if ('-' == chr) {
-                tokens.add(new QueryToken.Minus());
-            }
-            else if ('?' == chr) {
-                tokens.add(new QueryToken.QMark());
-            }
-            else if (Character.isSpaceChar(chr)) {
-                //
-            }
-            else {
-
-                int end = i+1;
-                for (; end < query.length(); end++) {
-                    if (query.charAt(end) == ' ' || query.charAt(end) == ')')
-                        break;
-                }
-
-                String displayStr = query.substring(i, end);
-                String str = toLowerCaseStripPossessive(displayStr);
-
-                tokens.add(new QueryToken.LiteralTerm(str, displayStr));
-
-                i = end-1;
-            }
-        }
-        return tokens;
-    }
-
-    public static String toLowerCaseStripPossessive(String word) {
-        String val = stripPossessive(word).toLowerCase();
-
-        if (Objects.equals(val, word)) {
-            return word;
-        }
-
-        return val;
-    }
-
-    public static String stripPossessive(String s) {
-        int end = s.length();
-
-        if (s.endsWith("'")) {
-            return s.substring(0, end-1);
-        }
-
-        if (s.endsWith("'s") || s.endsWith("'S")) {
-            return s.substring(0, end-2);
-        }
-
-        return s;
-    }
-}