mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
Merge branch 'master' into serp-redesign
This commit is contained in:
commit
9ec9d1b338
@ -27,7 +27,7 @@ public class SentenceSegmentSplitter {
|
||||
else {
|
||||
// If we flatten unicode, we do this...
|
||||
// FIXME: This can almost definitely be cleaned up and simplified.
|
||||
wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
||||
wordBreakPattern = Pattern.compile("([^/<>$:_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
||||
}
|
||||
}
|
||||
|
||||
@ -90,12 +90,17 @@ public class SentenceSegmentSplitter {
|
||||
for (int i = 0; i < ret.size(); i++) {
|
||||
String part = ret.get(i);
|
||||
|
||||
if (part.startsWith("<") && part.endsWith(">") && part.length() > 2) {
|
||||
ret.set(i, part.substring(1, part.length() - 1));
|
||||
}
|
||||
|
||||
if (part.startsWith("'") && part.length() > 1) {
|
||||
ret.set(i, part.substring(1));
|
||||
}
|
||||
if (part.endsWith("'") && part.length() > 1) {
|
||||
ret.set(i, part.substring(0, part.length()-1));
|
||||
}
|
||||
|
||||
while (part.endsWith(".")) {
|
||||
part = part.substring(0, part.length()-1);
|
||||
ret.set(i, part);
|
||||
|
@ -28,6 +28,20 @@ class SentenceExtractorTest {
|
||||
System.out.println(dld);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testCplusplus() {
|
||||
var dld = sentenceExtractor.extractSentence("std::vector", EnumSet.noneOf(HtmlTag.class));
|
||||
assertEquals(1, dld.length());
|
||||
assertEquals("std::vector", dld.wordsLowerCase[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPHP() {
|
||||
var dld = sentenceExtractor.extractSentence("$_GET", EnumSet.noneOf(HtmlTag.class));
|
||||
assertEquals(1, dld.length());
|
||||
assertEquals("$_get", dld.wordsLowerCase[0]);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPolishArtist() {
|
||||
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));
|
||||
|
Loading…
Reference in New Issue
Block a user