mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Don't consider slash to be a sentence separator.
This commit is contained in:
parent
e332faa07e
commit
d82a858491
@ -22,6 +22,8 @@ import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@Tag("slow")
|
||||
class SentenceExtractorTest {
|
||||
SentenceExtractor newSe;
|
||||
@ -123,6 +125,12 @@ class SentenceExtractorTest {
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testACDC() {
|
||||
var ret = newSe.extractSentence("AC/DC is a rock band.");
|
||||
assertEquals("AC/DC", ret.words[0]);
|
||||
}
|
||||
|
||||
final Pattern p = Pattern.compile("([, ]+)");
|
||||
public void seprateExtractor(String sentence) {
|
||||
var matcher = p.matcher(sentence);
|
||||
|
@ -23,7 +23,7 @@ public class SentenceSegmentSplitter {
|
||||
}
|
||||
|
||||
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
|
||||
private static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
||||
private static final Pattern wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
||||
|
||||
public static SeparatedSentence splitSegment(String segment) {
|
||||
String flatSegment = AsciiFlattener.flattenUnicode(segment);
|
||||
|
Loading…
Reference in New Issue
Block a user