mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(search) Adjust token formation rules to be more lenient to C++ and PHP code.
This addresses Issue #142.
This commit is contained in:
parent
9b2ceaf37c
commit
b62f043910
@@ -27,7 +27,7 @@ public class SentenceSegmentSplitter {
|
|||||||
else {
|
else {
|
||||||
// If we flatten unicode, we do this...
|
// If we flatten unicode, we do this...
|
||||||
// FIXME: This can almost definitely be cleaned up and simplified.
|
// FIXME: This can almost definitely be cleaned up and simplified.
|
||||||
wordBreakPattern = Pattern.compile("([^/_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
wordBreakPattern = Pattern.compile("([^/<>$:_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -28,6 +28,20 @@ class SentenceExtractorTest {
|
|||||||
System.out.println(dld);
|
System.out.println(dld);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testCplusplus() {
|
||||||
|
var dld = sentenceExtractor.extractSentence("std::vector", EnumSet.noneOf(HtmlTag.class));
|
||||||
|
assertEquals(1, dld.length());
|
||||||
|
assertEquals("std::vector", dld.wordsLowerCase[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testPHP() {
|
||||||
|
var dld = sentenceExtractor.extractSentence("$_GET", EnumSet.noneOf(HtmlTag.class));
|
||||||
|
assertEquals(1, dld.length());
|
||||||
|
assertEquals("$_get", dld.wordsLowerCase[0]);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void testPolishArtist() {
|
void testPolishArtist() {
|
||||||
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));
|
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));
|
||||||
|
Loading…
Reference in New Issue
Block a user