Don't consider slash to be a sentence separator.

This commit is contained in:
Viktor Lofgren 2023-05-31 16:54:30 +02:00
parent e332faa07e
commit d82a858491
2 changed files with 9 additions and 1 deletion

View File

@ -22,6 +22,8 @@ import java.nio.file.Path;
import java.util.*;
import java.util.regex.Pattern;
import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("slow")
class SentenceExtractorTest {
SentenceExtractor newSe;
@ -123,6 +125,12 @@ class SentenceExtractorTest {
}
// A token containing a slash ("AC/DC") must come back as a single first word.
@Test
public void testACDC() {
    final String input = "AC/DC is a rock band.";
    var sentence = newSe.extractSentence(input);
    assertEquals("AC/DC", sentence.words[0]);
}
final Pattern p = Pattern.compile("([, ]+)");
public void seprateExtractor(String sentence) {
var matcher = p.matcher(sentence);

View File

@ -23,7 +23,7 @@ public class SentenceSegmentSplitter {
}
// Matcher over the characters '/', '*' and '-' (presumably Guava's CharMatcher; the import and the use site are outside this view).
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
// NOTE(review): the two declarations below share one name; they look like the removed and the added side of this diff hunk (old first, new second) -- confirm against the commit.
// Older form: '/' is not listed in the negated character class, so a slash falls into a word-break run.
private static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
// Newer form: '/' is listed in the negated character class, so a slash no longer matches as a word break.
// A match is a run of characters outside that class, a literal '|', or a '.' followed by whitespace or end of input.
private static final Pattern wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
public static SeparatedSentence splitSegment(String segment) {
String flatSegment = AsciiFlattener.flattenUnicode(segment);