diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java
index 531f5189..62f7f772 100644
--- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java
+++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java
@@ -27,7 +27,7 @@ public class SentenceSegmentSplitter {
         else {
             // If we flatten unicode, we do this...
             // FIXME: This can almost definitely be cleaned up and simplified.
-            wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
+            wordBreakPattern = Pattern.compile("([^/<>$:_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); // adds < > $ : to the negated character class, so the first alternative no longer matches them
         }
     }
 
diff --git a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java
index 38ccbe12..1fcf1015 100644
--- a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java
+++ b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java
@@ -28,6 +28,20 @@ class SentenceExtractorTest {
         System.out.println(dld);
     }
 
+    @Test
+    void testCplusplus() { // expects "std::vector" to yield length() == 1, with the "::" kept inside wordsLowerCase[0]
+        var dld = sentenceExtractor.extractSentence("std::vector", EnumSet.noneOf(HtmlTag.class));
+        assertEquals(1, dld.length());
+        assertEquals("std::vector", dld.wordsLowerCase[0]);
+    }
+
+    @Test
+    void testPHP() { // expects "$_GET" to yield length() == 1, with the leading "$" kept and the text lowercased to "$_get"
+        var dld = sentenceExtractor.extractSentence("$_GET", EnumSet.noneOf(HtmlTag.class));
+        assertEquals(1, dld.length());
+        assertEquals("$_get", dld.wordsLowerCase[0]);
+    }
+
     @Test
     void testPolishArtist() {
         var dld = sentenceExtractor.extractSentence("UklaƄski", EnumSet.noneOf(HtmlTag.class));