diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java index 0644cf76..ce7fb3b4 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentPositionMapper.java @@ -152,7 +152,10 @@ public class DocumentPositionMapper { } boolean matchesWordPattern(String s) { - // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4} + if (s.length() > 48) + return false; + + // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,7} String wordPartSeparator = ".-_/:+*"; @@ -169,7 +172,7 @@ public class DocumentPositionMapper { if (i == 0) return false; - for (int j = 0; j < 5; j++) { + for (int j = 0; j < 8; j++) { if (i == s.length()) return true; if (wordPartSeparator.indexOf(s.charAt(i)) < 0) { diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java index a00dd3ae..533b0993 100644 --- a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java +++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentPositionMapperTest.java @@ -30,9 +30,11 @@ class DocumentPositionMapperTest { Assertions.assertFalse(positionMapper.matchesWordPattern("1234567890abcdef")); Assertions.assertTrue(positionMapper.matchesWordPattern("test-test-test-test-test")); - Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test")); + 
Assertions.assertFalse(positionMapper.matchesWordPattern("test-test-test-test-test-test-test-test-test")); Assertions.assertTrue(positionMapper.matchesWordPattern("192.168.1.100/24")); Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector")); + Assertions.assertTrue(positionMapper.matchesWordPattern("std::vector::push_back")); + Assertions.assertTrue(positionMapper.matchesWordPattern("c++")); Assertions.assertTrue(positionMapper.matchesWordPattern("m*a*s*h")); Assertions.assertFalse(positionMapper.matchesWordPattern("Stulpnagelstrasse"));