mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Don't consider slash to be a sentence separator.
This commit is contained in:
parent
e332faa07e
commit
d82a858491
@ -22,6 +22,8 @@ import java.nio.file.Path;
|
|||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
@Tag("slow")
|
@Tag("slow")
|
||||||
class SentenceExtractorTest {
|
class SentenceExtractorTest {
|
||||||
SentenceExtractor newSe;
|
SentenceExtractor newSe;
|
||||||
@ -123,6 +125,12 @@ class SentenceExtractorTest {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testACDC() {
|
||||||
|
var ret = newSe.extractSentence("AC/DC is a rock band.");
|
||||||
|
assertEquals("AC/DC", ret.words[0]);
|
||||||
|
}
|
||||||
|
|
||||||
final Pattern p = Pattern.compile("([, ]+)");
|
final Pattern p = Pattern.compile("([, ]+)");
|
||||||
public void seprateExtractor(String sentence) {
|
public void seprateExtractor(String sentence) {
|
||||||
var matcher = p.matcher(sentence);
|
var matcher = p.matcher(sentence);
|
||||||
|
@ -23,7 +23,7 @@ public class SentenceSegmentSplitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
|
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
|
||||||
private static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
private static final Pattern wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
||||||
|
|
||||||
public static SeparatedSentence splitSegment(String segment) {
|
public static SeparatedSentence splitSegment(String segment) {
|
||||||
String flatSegment = AsciiFlattener.flattenUnicode(segment);
|
String flatSegment = AsciiFlattener.flattenUnicode(segment);
|
||||||
|
Loading…
Reference in New Issue
Block a user