From 94e1aa0baf138f4cf4bfebc939381a6d535de37d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 5 Jan 2025 20:55:44 +0100 Subject: [PATCH] (search) Tweak token formation to still break apart emails in brackets. --- .../language/sentence/SentenceSegmentSplitter.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index 62f7f772..b8f0087d 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -90,12 +90,16 @@ public class SentenceSegmentSplitter { for (int i = 0; i < ret.size(); i++) { String part = ret.get(i); - if (part.startsWith("'") && part.length() > 1) { + if (part.startsWith("<") && part.endsWith(">") && part.length() > 2) { + ret.set(i, part.substring(1, part.length() - 1)); + } + else if (part.startsWith("'") && part.length() > 1) { ret.set(i, part.substring(1)); } - if (part.endsWith("'") && part.length() > 1) { + else if (part.endsWith("'") && part.length() > 1) { ret.set(i, part.substring(0, part.length()-1)); } + while (part.endsWith(".")) { part = part.substring(0, part.length()-1); ret.set(i, part);