From b62f043910d618e38debd27aa054a21b6c24f197 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 5 Jan 2025 20:50:27 +0100 Subject: [PATCH 1/3] (search) Adjust token formation rules to be more lenient to C++ and PHP code. This addresses Issue #142 --- .../language/sentence/SentenceSegmentSplitter.java | 2 +- .../language/sentence/SentenceExtractorTest.java | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index 531f5189..62f7f772 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -27,7 +27,7 @@ public class SentenceSegmentSplitter { else { // If we flatten unicode, we do this... // FIXME: This can almost definitely be cleaned up and simplified. - wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); + wordBreakPattern = Pattern.compile("([^/<>$:_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); } } diff --git a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java index 38ccbe12..1fcf1015 100644 --- a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java +++ b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java @@ -28,6 +28,20 @@ class SentenceExtractorTest { System.out.println(dld); } + @Test + void testCplusplus() { + var dld = sentenceExtractor.extractSentence("std::vector", EnumSet.noneOf(HtmlTag.class)); + assertEquals(1, dld.length()); + assertEquals("std::vector", dld.wordsLowerCase[0]); + } + + @Test + void testPHP() { + var dld = sentenceExtractor.extractSentence("$_GET", EnumSet.noneOf(HtmlTag.class)); + assertEquals(1, dld.length()); + assertEquals("$_get", dld.wordsLowerCase[0]); + } + @Test void testPolishArtist() { var dld = sentenceExtractor.extractSentence("UklaƄski", EnumSet.noneOf(HtmlTag.class)); From 94e1aa0baf138f4cf4bfebc939381a6d535de37d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 5 Jan 2025 20:55:44 +0100 Subject: [PATCH 2/3] (search) Tweak token formation to still break apart emails in brackets. --- .../language/sentence/SentenceSegmentSplitter.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index 62f7f772..b8f0087d 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -90,12 +90,16 @@ public class SentenceSegmentSplitter { for (int i = 0; i < ret.size(); i++) { String part = ret.get(i); - if (part.startsWith("'") && part.length() > 1) { + if (part.startsWith("<") && part.endsWith(">") && part.length() > 2) { + ret.set(i, part.substring(1, part.length() - 1)); + } + else if (part.startsWith("'") && part.length() > 1) { ret.set(i, part.substring(1)); } - if (part.endsWith("'") && part.length() > 1) { + else if (part.endsWith("'") && part.length() > 1) { ret.set(i, part.substring(0, part.length()-1)); } + while (part.endsWith(".")) { part = part.substring(0, part.length()-1); ret.set(i, part); From dcad0d786328094b4b4de0a98bdf3f702d11caed Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 5 Jan 2025 21:01:09 +0100 Subject: [PATCH 3/3] (search) Tweak token formation. --- .../language/sentence/SentenceSegmentSplitter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index b8f0087d..1612b9db 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -93,10 +93,11 @@ public class SentenceSegmentSplitter { if (part.startsWith("<") && part.endsWith(">") && part.length() > 2) { ret.set(i, part.substring(1, part.length() - 1)); } - else if (part.startsWith("'") && part.length() > 1) { + + if (part.startsWith("'") && part.length() > 1) { ret.set(i, part.substring(1)); } - else if (part.endsWith("'") && part.length() > 1) { + if (part.endsWith("'") && part.length() > 1) { ret.set(i, part.substring(0, part.length()-1)); }