diff --git a/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/HeuristicTextUtil.java b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/HeuristicTextUtil.java
new file mode 100644
index 00000000..47d2dc6f
--- /dev/null
+++ b/code/features-convert/summary-extraction/src/main/java/nu/marginalia/summary/heuristic/HeuristicTextUtil.java
@@ -0,0 +1,121 @@
+package nu.marginalia.summary.heuristic;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Collection;
+
+public class HeuristicTextUtil {
+
+    /** Return the number of words from the set of words that occur in the text.
+     *
+     * Each word is counted at most once, however many times it appears in the text.
+     * The words must be all lower case, the text may be in any case. To count as a match,
+     * the word must not be directly preceded or followed by a letter or a digit.
+     *
+     * @param text the text to search, in any case
+     * @param wordsLc the lower case words to look for
+     * @return the number of words in wordsLc that were found in the text
+     */
+    public static int countOccurrencesOfAnyWord(String text, Collection<String> wordsLc) {
+        // isAllLowerCase() only accepts text made up entirely of lower case letters, so this
+        // shortcut is skipped as soon as the text contains a space, a digit or punctuation.
+        if (StringUtils.isAllLowerCase(text)) {
+            return countOccurrencesOfAnyWordLowerCase(text, wordsLc);
+        }
+
+        int cnt = 0;
+        for (String word : wordsLc) {
+            if (containsWordInAnyCase(text, word)) {
+                cnt++;
+            }
+        }
+        return cnt;
+    }
+
+    /** Check whether the word occurs in the text as a whole word, ignoring case.
+     *
+     * @param text the text to search, in any case
+     * @param wordLowerCase the lower case word to look for
+     * @return true if some occurrence of the word has no letter or digit directly
+     *         before or after it; always false for an empty word
+     */
+    public static boolean containsWordInAnyCase(String text, String wordLowerCase) {
+        // An empty word matches at every index, which can walk the search past the end of the text.
+        if (wordLowerCase.isEmpty()) {
+            return false;
+        }
+
+        int wl = wordLowerCase.length();
+        int pos = StringUtils.indexOfIgnoreCase(text, wordLowerCase);
+
+        while (pos >= 0) {
+            if (isWholeWordAt(text, pos, wl)) {
+                return true;
+            }
+            // Rejected: a letter or digit touches this occurrence, try the next one.
+            pos = StringUtils.indexOfIgnoreCase(text, wordLowerCase, pos + 1);
+        }
+        return false;
+    }
+
+    /** Return the number of words from the set of words that occur in the lower case text.
+     *
+     * Works like countOccurrencesOfAnyWord, but the search is case sensitive, so both
+     * the text and the words are expected to be lower case.
+     *
+     * @param textLc the lower case text to search
+     * @param wordsLc the lower case words to look for
+     * @return the number of words in wordsLc that were found in the text
+     */
+    public static int countOccurrencesOfAnyWordLowerCase(String textLc, Collection<String> wordsLc) {
+        int cnt = 0;
+        for (String word : wordsLc) {
+            if (containsWordAllLowerCase(textLc, word)) {
+                cnt++;
+            }
+        }
+        return cnt;
+    }
+
+    /** Check whether the word occurs in the text as a whole word, using a case sensitive search.
+     *
+     * @param text the text to search, expected to be lower case
+     * @param wordLowerCase the lower case word to look for
+     * @return true if some occurrence of the word has no letter or digit directly
+     *         before or after it; always false for an empty word
+     */
+    public static boolean containsWordAllLowerCase(String text, String wordLowerCase) {
+        // indexOf() never returns -1 for an empty word, so the loop below might never end.
+        if (wordLowerCase.isEmpty()) {
+            return false;
+        }
+
+        int wl = wordLowerCase.length();
+        int pos = text.indexOf(wordLowerCase);
+
+        while (pos >= 0) {
+            if (isWholeWordAt(text, pos, wl)) {
+                return true;
+            }
+            // Rejected: a letter or digit touches this occurrence, try the next one.
+            pos = text.indexOf(wordLowerCase, pos + 1);
+        }
+        return false;
+    }
+
+    /** Check that the span of the given length starting at pos has no letter or digit next to it. */
+    private static boolean isWholeWordAt(String text, int pos, int length) {
+        if (pos > 0 && isWordCharacter(text.charAt(pos - 1))) {
+            return false;
+        }
+
+        int end = pos + length;
+        return end >= text.length() || !isWordCharacter(text.charAt(end));
+    }
+
+    /** Check whether the character is a letter or a digit. */
+    private static boolean isWordCharacter(char c) {
+        return Character.isAlphabetic(c) || Character.isDigit(c);
+    }
+
+}
diff --git a/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/heuristic/HeuristicTextUtilTest.java b/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/heuristic/HeuristicTextUtilTest.java
new file mode 100644
index 00000000..a97c1a5d
--- /dev/null
+++ b/code/features-convert/summary-extraction/src/test/java/nu/marginalia/summary/heuristic/HeuristicTextUtilTest.java
@@ -0,0 +1,45 @@
+package nu.marginalia.summary.heuristic;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.Set;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class HeuristicTextUtilTest {
+
+    @Test
+    void countOccurrencesOfAnyWord() {
+        String sentence = "B A Baracus was an expert with the Abacus";
+        assertEquals(4, HeuristicTextUtil.countOccurrencesOfAnyWord(sentence, Set.of("b", "a", "baracus", "abacus")));
+    }
+
+    @Test
+    void containsWordInAnyCase() {
+        String sentence = "B A Baracus was an expert with the Abacus";
+
+        assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "b"));
+        assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "a"));
+        assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "baracus"));
+        assertTrue(HeuristicTextUtil.containsWordInAnyCase(sentence, "abacus"));
+        assertFalse(HeuristicTextUtil.containsWordInAnyCase(sentence, "cus"));
+    }
+
+    @Test
+    void containsWordAllLowerCase() {
+        String sentence = "b a baracus was an expert with the abacus";
+
+        assertTrue(HeuristicTextUtil.containsWordAllLowerCase(sentence, "b"));
+        assertTrue(HeuristicTextUtil.containsWordAllLowerCase(sentence, "a"));
+        assertTrue(HeuristicTextUtil.containsWordAllLowerCase(sentence, "baracus"));
+        assertTrue(HeuristicTextUtil.containsWordAllLowerCase(sentence, "abacus"));
+        assertFalse(HeuristicTextUtil.containsWordAllLowerCase(sentence, "cus"));
+    }
+
+    @Test
+    void emptyWordIsNeverFound() {
+        assertFalse(HeuristicTextUtil.containsWordInAnyCase("a", ""));
+        assertFalse(HeuristicTextUtil.containsWordAllLowerCase("a", ""));
+    }
+
+}
\ No newline at end of file