Merge branch 'master' into serp-redesign

This commit is contained in:
Viktor Lofgren 2025-01-05 21:10:20 +01:00
commit 9ec9d1b338
2 changed files with 20 additions and 1 deletions

View File

@@ -27,7 +27,7 @@ public class SentenceSegmentSplitter {
else { else {
// If we flatten unicode, we do this... // If we flatten unicode, we do this...
// FIXME: This can almost definitely be cleaned up and simplified. // FIXME: This can almost definitely be cleaned up and simplified.
wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); wordBreakPattern = Pattern.compile("([^/<>$:_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
} }
} }
@@ -90,12 +90,17 @@ public class SentenceSegmentSplitter {
for (int i = 0; i < ret.size(); i++) { for (int i = 0; i < ret.size(); i++) {
String part = ret.get(i); String part = ret.get(i);
if (part.startsWith("<") && part.endsWith(">") && part.length() > 2) {
ret.set(i, part.substring(1, part.length() - 1));
}
if (part.startsWith("'") && part.length() > 1) { if (part.startsWith("'") && part.length() > 1) {
ret.set(i, part.substring(1)); ret.set(i, part.substring(1));
} }
if (part.endsWith("'") && part.length() > 1) { if (part.endsWith("'") && part.length() > 1) {
ret.set(i, part.substring(0, part.length()-1)); ret.set(i, part.substring(0, part.length()-1));
} }
while (part.endsWith(".")) { while (part.endsWith(".")) {
part = part.substring(0, part.length()-1); part = part.substring(0, part.length()-1);
ret.set(i, part); ret.set(i, part);

View File

@@ -28,6 +28,20 @@ class SentenceExtractorTest {
System.out.println(dld); System.out.println(dld);
} }
// Checks that a C++ scoped name survives extraction as one token: for the input
// "std::vector" the test expects a result of length 1 whose first lower-cased word
// is still "std::vector" (the "::" must not split it).
// NOTE(review): length() is presumably the word count of the extracted sentence -- confirm.
@Test
void testCplusplus() {
var dld = sentenceExtractor.extractSentence("std::vector", EnumSet.noneOf(HtmlTag.class));
assertEquals(1, dld.length());
assertEquals("std::vector", dld.wordsLowerCase[0]);
}
// Checks that a PHP superglobal name survives extraction as one token: for the input
// "$_GET" the test expects a result of length 1 whose first lower-cased word is "$_get"
// (the leading "$" and the "_" are kept; only the case changes).
// NOTE(review): length() is presumably the word count of the extracted sentence -- confirm.
@Test
void testPHP() {
var dld = sentenceExtractor.extractSentence("$_GET", EnumSet.noneOf(HtmlTag.class));
assertEquals(1, dld.length());
assertEquals("$_get", dld.wordsLowerCase[0]);
}
@Test @Test
void testPolishArtist() { void testPolishArtist() {
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class)); var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));