From 22b35d5d91379da8038fcea0afad4da900b38691 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 18 Jul 2024 15:57:48 +0200 Subject: [PATCH] (sentence-extractor) Add tag information to document language data Decorates DocumentSentences with information about which HTML tags they are nested in, and removes some redundant data on this rather memory-hungry object. Separator information is encoded as a bit set instead of an array of integers. The change also cleans up the SentenceExtractor class a fair bit. It no longer extracts ngrams, and a significant number of redundant operations were removed as well. This is still a pretty unpleasant class to work in, but this is the first step in making it a little bit better. --- .../marginalia/atags/AnchorTextKeywords.java | 3 +- .../extractor/TermFrequencyExporter.java | 5 - .../keyword/DocumentKeywordExtractor.java | 9 - .../marginalia/keyword/KeywordExtractor.java | 39 +- .../keyword/extractors/NameLikeKeywords.java | 7 +- .../extractors/SubjectLikeKeywords.java | 4 +- .../keyword/extractors/TitleKeywords.java | 7 +- .../keyword/SentenceExtractorTest.java | 10 +- .../keyword/extractors/TitleKeywordsTest.java | 4 +- .../query_parser/QueryTokenizer.java | 26 +- .../language/model/DocumentLanguageData.java | 38 +- .../language/model/DocumentSentence.java | 70 ++-- .../language/model/WordSeparator.java | 6 - .../language/sentence/SentenceExtractor.java | 388 +++++++++--------- .../SentenceExtractorHtmlTagCleaner.java | 40 -- .../SentenceExtractorStringUtils.java | 93 ----- .../language/sentence/SentencePreCleaner.java | 6 +- .../sentence/SentenceSegmentSplitter.java | 36 +- .../sentence/tag/HtmlStringTagger.java | 122 ++++++ .../language/sentence/tag/HtmlTag.java | 21 + .../sentence/tag/HtmlTaggedString.java | 33 ++ .../SentenceExtractorHtmlTagCleanerTest.java | 28 -- .../sentence/SentenceExtractorTest.java | 23 +- .../sentence/tag/HtmlStringTaggerTest.java | 29 ++ .../processor/logic/TitleExtractor.java | 4 - 25 files
changed, 551 insertions(+), 500 deletions(-) delete mode 100644 code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java delete mode 100644 code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java delete mode 100644 code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java create mode 100644 code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java create mode 100644 code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java create mode 100644 code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java delete mode 100644 code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java create mode 100644 code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java index 95e37836..4b9ce5fb 100644 --- a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java +++ b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.model.EdgeUrl; import java.io.BufferedReader; @@ -55,7 +56,7 @@ public class AnchorTextKeywords { if (stopList.contains(keyword.text().toLowerCase())) continue; - var sentence = sentenceExtractor.extractSentence(keyword.text()); + var sentence = sentenceExtractor.extractSentence(keyword.text(), 
EnumSet.noneOf(HtmlTag.class)); for (var wordSpan : keywordExtractor.getKeywordsFromSentence(sentence)) { wordsWithCount.merge(sentence.constructWordFromSpan(wordSpan), 1, Integer::sum); } diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index 3bcc9cf2..998e94a4 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -27,7 +27,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.nio.file.attribute.PosixFilePermissions; -import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -124,10 +123,6 @@ public class TermFrequencyExporter implements ExporterIf { for (var word : sent) { words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); } - - for (var ngram : sent.ngramStemmed) { - words.add(longHash(ngram.getBytes())); - } } synchronized (counts) { diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index facb601f..ebaa76f5 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -134,15 +134,6 @@ public class DocumentKeywordExtractor { wordsBuilder.addMeta(rep.word, meta); } - for (int i = 0; i < sent.ngrams.length; i++) { - var ngram = sent.ngrams[i]; - var ngramStemmed = sent.ngramStemmed[i]; - - long meta = metadata.getMetadataForWord(ngramStemmed); - - wordsBuilder.addMeta(ngram, meta); - } - } } diff 
--git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java index e1990618..babd44d7 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java @@ -3,7 +3,6 @@ package nu.marginalia.keyword; import nu.marginalia.language.WordPatterns; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordSpan; -import nu.marginalia.language.model.WordSeparator; import java.lang.ref.SoftReference; import java.util.ArrayList; @@ -20,15 +19,15 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isProperNoun(i, sentence) && isProperNoun(i-1, sentence)) spans.add(new WordSpan(i-1, i+1)); } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } + if (sentence.isSeparatorComma(i-1)) { i++; continue; } if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) @@ -37,9 +36,9 @@ public class KeywordExtractor { } for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { if (isProperNoun(i - 1, 
sentence) && isProperNoun(i - 2, sentence)) @@ -66,7 +65,7 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isNoun(i, sentence) && (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) { @@ -75,8 +74,8 @@ public class KeywordExtractor { } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } + if (sentence.isSeparatorComma(i-1)) { i++; continue; } if ((isNoun(i, sentence)) && (isJoiner(sentence, i-1) || isNoun(i-1, sentence)) @@ -85,9 +84,9 @@ public class KeywordExtractor { } for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) { if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence)) @@ -119,7 +118,7 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isName(i, sentence)) { if (isName(i - 1, sentence) || isTopAdj(i-1, sentence)) @@ -131,8 +130,8 @@ public class KeywordExtractor { } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } + if 
(sentence.isSeparatorComma(i-1)) { i++; continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } if (isName(i, sentence)) { if ((isName(i-1, sentence) || isTopAdj(i-1, sentence)) @@ -149,9 +148,9 @@ public class KeywordExtractor { } for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } if (isName(i, sentence) && (isName(i-1, sentence) || isTopAdj(i-1, sentence)) && @@ -217,7 +216,7 @@ public class KeywordExtractor { private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) { for (int i = w.start; i < w.end-1; i++) { - if (sentence.separators[i] == WordSeparator.COMMA) { + if (sentence.isSeparatorComma(i)) { return false; } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java index c033bdc1..9b2d8b85 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java @@ -1,13 +1,12 @@ package nu.marginalia.keyword.extractors; -import com.google.common.base.CharMatcher; import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordRep; -import 
nu.marginalia.keyword.KeywordExtractor; import java.util.*; import java.util.stream.Collectors; @@ -21,13 +20,11 @@ public class NameLikeKeywords implements WordReps { Object2IntOpenHashMap counts = new Object2IntOpenHashMap<>(1000); HashMap> instances = new HashMap<>(1000); - final var isUpperCase = CharMatcher.forPredicate(Character::isUpperCase); - for (int i = 0; i < dld.sentences.length; i++) { DocumentSentence sent = dld.sentences[i]; var keywords = keywordExtractor.getProperNames(sent); for (var span : keywords) { - if (span.size() <= 1 && isUpperCase.matchesAllOf(sent.words[span.start])) + if (span.size() <= 1 && sent.isAllCaps(span.start)) continue; var stemmed = sent.constructStemmedWordFromSpan(span); diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java index d4a6e428..95dbf5bc 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java @@ -6,7 +6,6 @@ import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordSpan; -import nu.marginalia.language.model.WordSeparator; import org.apache.commons.lang3.StringUtils; import java.util.*; @@ -36,8 +35,7 @@ public class SubjectLikeKeywords implements WordReps { if (kw.end + 2 >= sentence.length()) { continue; } - if (sentence.separators[kw.end] == WordSeparator.COMMA - || sentence.separators[kw.end + 1] == WordSeparator.COMMA) + if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1)) continue; String nextTag = sentence.posTags[kw.end]; diff --git 
a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java index e1c7eceb..846225c2 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java @@ -1,11 +1,11 @@ package nu.marginalia.keyword.extractors; -import nu.marginalia.keyword.WordReps; import nu.marginalia.keyword.KeywordExtractor; +import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.sentence.tag.HtmlTag; -import java.util.Arrays; import java.util.Collection; import java.util.Set; import java.util.stream.Collectors; @@ -16,7 +16,8 @@ public class TitleKeywords implements WordReps { private final Set stemmed; public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) { - titleKeywords = Arrays.stream(documentLanguageData.titleSentences).flatMap(sent -> + titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream() + .flatMap(sent -> keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w))) .limit(100) .collect(Collectors.toSet()); diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java index 34b1b7af..fe868e68 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java @@ -2,11 +2,11 @@ package nu.marginalia.keyword; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; -import 
nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.segmentation.NgramLexicon; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.WmsaHome; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.test.util.TestLanguageModels; import org.jsoup.Jsoup; import org.junit.jupiter.api.Tag; @@ -59,8 +59,8 @@ class SentenceExtractorTest { @Test public void testACDC() { - var ret = se.extractSentence("AC/DC is a rock band."); - assertEquals("AC/DC", ret.words[0]); + var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class)); + assertEquals("ac/dc", ret.wordsLowerCase[0]); } final Pattern p = Pattern.compile("([, ]+)"); diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java index cac29c73..49a555de 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java @@ -190,7 +190,9 @@ class TitleKeywordsTest { public void extractTitleWords() { var se = new SentenceExtractor(TestLanguageModels.getLanguageModels()); - var reps = new TitleKeywords(new KeywordExtractor(), se.extractSentences(Jsoup.parse(document))).getReps(); + var dld = se.extractSentences(Jsoup.parse(document)); + + var reps = new TitleKeywords(new KeywordExtractor(), dld).getReps(); var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet()); Set expected = Set.of( diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java index 80f05808..79179524 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java @@ -2,10 +2,10 @@ package nu.marginalia.functions.searchquery.query_parser; import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.encoding.AsciiFlattener; -import nu.marginalia.language.sentence.SentenceExtractorStringUtils; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import java.util.regex.Pattern; public class QueryTokenizer { @@ -55,7 +55,7 @@ public class QueryTokenizer { } String displayStr = query.substring(i, end); - String str = SentenceExtractorStringUtils.toLowerCaseStripPossessive(displayStr); + String str = toLowerCaseStripPossessive(displayStr); tokens.add(new QueryToken.LiteralTerm(str, displayStr)); @@ -65,5 +65,27 @@ public class QueryTokenizer { return tokens; } + public static String toLowerCaseStripPossessive(String word) { + String val = stripPossessive(word).toLowerCase(); + if (Objects.equals(val, word)) { + return word; + } + + return val; + } + + public static String stripPossessive(String s) { + int end = s.length(); + + if (s.endsWith("'")) { + return s.substring(0, end-1); + } + + if (s.endsWith("'s") || s.endsWith("'S")) { + return s.substring(0, end-2); + } + + return s; + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java index 2ad53f7a..99cdadeb 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java 
@@ -1,49 +1,41 @@ package nu.marginalia.language.model; -import gnu.trove.map.hash.TObjectIntHashMap; -import lombok.AllArgsConstructor; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.lsh.EasyLSH; import java.util.Arrays; -import java.util.stream.Stream; +import java.util.List; -/** +/** Holds the sentences and text of a document, decorated with + * HTML tags, POS tags, and other information. + * * @see SentenceExtractor */ -@AllArgsConstructor public class DocumentLanguageData { public final DocumentSentence[] sentences; - public final DocumentSentence[] titleSentences; - public final TObjectIntHashMap wordCount; public final String text; - /** for test convenience */ - public static DocumentLanguageData empty() { - return new DocumentLanguageData( - new DocumentSentence[0], - new DocumentSentence[0], - new TObjectIntHashMap<>(), - "" - ); + public DocumentLanguageData(List sentences, + String text) { + this.sentences = sentences.toArray(DocumentSentence[]::new); + this.text = text; + } + + public List findSentencesForTag(HtmlTag tag) { + return Arrays.stream(sentences).filter(s -> s.htmlTags.contains(tag)).toList(); } public int totalNumWords() { int ret = 0; + for (int i = 0; i < sentences.length; i++) { ret += sentences[i].length(); } + return ret; } - public Stream streamLowerCase() { - return Arrays.stream(sentences).map(sent -> sent.wordsLowerCase).flatMap(Arrays::stream); - } - - public Stream stream() { - return Arrays.stream(sentences).map(sent -> sent.words).flatMap(Arrays::stream); - } - public long localitySensitiveHashCode() { var hash = new EasyLSH(); diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java index b9b4abce..4bd6ae1b 100644 --- 
a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java @@ -2,52 +2,55 @@ package nu.marginalia.language.model; import nu.marginalia.language.WordPatterns; +import nu.marginalia.language.sentence.tag.HtmlTag; import org.jetbrains.annotations.NotNull; import java.lang.ref.SoftReference; import java.util.BitSet; +import java.util.EnumSet; import java.util.Iterator; import java.util.StringJoiner; public class DocumentSentence implements Iterable{ - public final String originalSentence; - public final String[] words; - public final int[] separators; + + /** A span of words in a sentence */ + public final String[] wordsLowerCase; public final String[] posTags; public final String[] stemmedWords; - public final String[] ngrams; - public final String[] ngramStemmed; + + public final EnumSet htmlTags; private final BitSet isStopWord; + private final BitSet separators; + private final BitSet isCapitalized; + private final BitSet isAllCaps; + public SoftReference keywords; - public DocumentSentence(String originalSentence, - String[] words, - int[] separators, + public DocumentSentence(BitSet separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords, - String[] ngrams, - String[] ngramsStemmed + EnumSet htmlTags, + BitSet isCapitalized, + BitSet isAllCaps ) { - this.originalSentence = originalSentence; - this.words = words; this.separators = separators; this.wordsLowerCase = wordsLowerCase; this.posTags = posTags; this.stemmedWords = stemmedWords; + this.htmlTags = htmlTags; + this.isCapitalized = isCapitalized; + this.isAllCaps = isAllCaps; - isStopWord = new BitSet(words.length); + isStopWord = new BitSet(wordsLowerCase.length); - this.ngrams = ngrams; - this.ngramStemmed = ngramsStemmed; - - for (int i = 0; i < words.length; i++) { - if (WordPatterns.isStopWord(words[i])) + for (int i = 0; i < wordsLowerCase.length; i++) 
{ + if (WordPatterns.isStopWord(wordsLowerCase[i])) isStopWord.set(i); } } @@ -55,14 +58,22 @@ public class DocumentSentence implements Iterable{ public boolean isStopWord(int idx) { return isStopWord.get(idx); } - public void setIsStopWord(int idx, boolean val) { - if (val) - isStopWord.set(idx); - else - isStopWord.clear(); - } + public int length() { - return words.length; + return wordsLowerCase.length; + } + + public boolean isCapitalized(int i) { + return isCapitalized.get(i); + } + public boolean isAllCaps(int i) { + return isAllCaps.get(i); + } + public boolean isSeparatorSpace(int i) { + return separators.get(i); + } + public boolean isSeparatorComma(int i) { + return !separators.get(i); } public String constructWordFromSpan(WordSpan span) { @@ -140,9 +151,9 @@ public class DocumentSentence implements Iterable{ @Override public String toString() { StringBuilder sb = new StringBuilder(); - for (int i = 0; i < words.length; i++) { - sb.append(words[i]).append('[').append(posTags[i]).append(']'); - if (separators[i] == WordSeparator.COMMA) { + for (int i = 0; i < wordsLowerCase.length; i++) { + sb.append(wordsLowerCase[i]).append('[').append(posTags[i]).append(']'); + if (isSeparatorComma(i)) { sb.append(','); } else { @@ -176,11 +187,10 @@ public class DocumentSentence implements Iterable{ this.pos = pos; } - public String word() { return words[pos]; } + public String word() { return wordsLowerCase[pos]; } public String wordLowerCase() { return wordsLowerCase[pos]; } public String posTag() { return posTags[pos]; } public String stemmed() { return stemmedWords[pos]; } - public int separator() { return separators[pos]; } public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); } public WordRep rep() { diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java b/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java deleted file mode 100644 index 3476073f..00000000 --- 
a/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.language.model; - -public final class WordSeparator { - public static final int COMMA = 0; - public static final int SPACE = 1; -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java index 8dd818a3..48d709f3 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -1,23 +1,23 @@ package nu.marginalia.language.sentence; import com.github.datquocnguyen.RDRPOSTagger; -import gnu.trove.map.hash.TObjectIntHashMap; +import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; -import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.sentence.tag.HtmlStringTagger; +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.language.sentence.tag.HtmlTaggedString; +import nu.marginalia.segmentation.NgramLexicon; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; import org.apache.commons.lang3.StringUtils; -import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.inject.Inject; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; @@ -38,14 +38,13 @@ public class SentenceExtractor { private final PorterStemmer porterStemmer = new PorterStemmer(); private static final Logger logger = 
LoggerFactory.getLogger(SentenceExtractor.class); - private static final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner(); private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner(); /* Truncate sentences longer than this. This is mostly a defense measure against malformed data * that might otherwise use an undue amount of processing power. 250 words is about 10X longer than * this comment. */ - private static final int MAX_SENTENCE_LENGTH = 250; - private static final int MAX_TEXT_LENGTH = 65536; + static final int MAX_SENTENCE_LENGTH = 250; + static final int MAX_SENTENCE_COUNT = 1000; @SneakyThrows @Inject public SentenceExtractor(LanguageModels models) @@ -75,219 +74,224 @@ public class SentenceExtractor { } + + public DocumentLanguageData extractSentences(Document doc) { - var clone = doc.clone(); - tagCleaner.clean(clone); - final String text = asText(clone); - final DocumentSentence[] textSentences = extractSentencesFromString(text); + final List taggedStrings = HtmlStringTagger.tagDocumentStrings(doc); + final List textSentences = new ArrayList<>(); - String title = getTitle(clone, textSentences); + final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum(); + final StringBuilder documentText = new StringBuilder(totalTextLength + taggedStrings.size()); - TObjectIntHashMap counts = calculateWordCounts(textSentences); - var titleSentences = extractSentencesFromString(title.toLowerCase()); - return new DocumentLanguageData(textSentences, titleSentences, counts, text); + for (var taggedString : taggedStrings) { + String text = taggedString.string(); + + textSentences.addAll( + extractSentencesFromString(text, taggedString.tags()) + ); + + if (documentText.isEmpty()) { + documentText.append(text); + } + else { + documentText.append(' ').append(text); + } + } + + return new DocumentLanguageData(textSentences, documentText.toString()); } public DocumentLanguageData 
extractSentences(String text, String title) { - final DocumentSentence[] textSentences = extractSentencesFromString(text); + var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class)); + var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE)); - TObjectIntHashMap counts = calculateWordCounts(textSentences); - var titleSentences = extractSentencesFromString(title.toLowerCase()); - return new DocumentLanguageData(textSentences, titleSentences, counts, text); + List combined = new ArrayList<>(textSentences.size() + titleSentences.size()); + combined.addAll(titleSentences); + combined.addAll(textSentences); + + return new DocumentLanguageData( + combined, + text); } - private String getTitle(Document doc, DocumentSentence[] textSentences) { - String title = doc.getElementsByTag("title").text() + " . " + - Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse(""); + public DocumentSentence extractSentence(String text, EnumSet htmlTags) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH); - if (title.trim().length() < 3) { - title = doc.getElementsByTag("h2").text(); - } + String[] words = wordsAndSeps.words(); + BitSet seps = wordsAndSeps.separators(); + String[] lc = new String[words.length]; + String[] stemmed = new String[words.length]; - if (title.trim().length() < 3) { - for (DocumentSentence textSentence : textSentences) { - if (textSentence.length() > 0) { - title = textSentence.originalSentence.toLowerCase(); - break; - } + BitSet isCapitalized = new BitSet(words.length); + BitSet isAllCaps = new BitSet(words.length); + + for (int i = 0; i < words.length; i++) { + lc[i] = stripPossessive(words[i].toLowerCase()); + + if (words[i].length() > 0 && Character.isUpperCase(words[i].charAt(0))) { + isCapitalized.set(i); } - } - - return title; - } - - - @NotNull - private TObjectIntHashMap calculateWordCounts(DocumentSentence[] textSentences) { 
- TObjectIntHashMap counts = new TObjectIntHashMap<>(textSentences.length*10, 0.5f, 0); - - for (var sent : textSentences) { - for (var word : sent.stemmedWords) { - counts.adjustOrPutValue(word, 1, 1); - } - } - return counts; - } - - public DocumentSentence extractSentence(String text) { - var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text); - - var words = wordsAndSeps.words; - var seps = wordsAndSeps.separators; - var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words); - - List ngrams = ngramLexicon.findSegmentsStrings(2, 12, words); - - String[] ngramsWords = new String[ngrams.size()]; - String[] ngramsStemmedWords = new String[ngrams.size()]; - for (int i = 0; i < ngrams.size(); i++) { - String[] ngram = ngrams.get(i); - - StringJoiner ngramJoiner = new StringJoiner("_"); - StringJoiner stemmedJoiner = new StringJoiner("_"); - for (String s : ngram) { - ngramJoiner.add(s); - stemmedJoiner.add(porterStemmer.stem(s)); + if (StringUtils.isAllUpperCase(words[i])) { + isAllCaps.set(i); } - ngramsWords[i] = ngramJoiner.toString(); - ngramsStemmedWords[i] = stemmedJoiner.toString(); - } - - - return new DocumentSentence( - SentenceExtractorStringUtils.sanitizeString(text), - words, - seps, - lc, - rdrposTagger.tagsForEnSentence(words), - stemSentence(lc), - ngramsWords, - ngramsStemmedWords - ); - } - - public DocumentSentence[] extractSentencesFromString(String text) { - String[] sentences; - - String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text); - - try { - sentences = sentenceDetector.sentDetect(textNormalizedSpaces); - } - catch (Exception ex) { - // shitty fallback logic - sentences = StringUtils.split(textNormalizedSpaces, '.'); - } - - sentences = sentencePrecleaner.clean(sentences); - - final String[][] tokens = new String[sentences.length][]; - final int[][] separators = new int[sentences.length][]; - final String[][] posTags = new String[sentences.length][]; - final String[][] tokensLc = 
new String[sentences.length][]; - final String[][] stemmedWords = new String[sentences.length][]; - - for (int i = 0; i < tokens.length; i++) { - - var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]); - tokens[i] = wordsAndSeps.words; - separators[i] = wordsAndSeps.separators; - - if (tokens[i].length > MAX_SENTENCE_LENGTH) { - tokens[i] = Arrays.copyOf(tokens[i], MAX_SENTENCE_LENGTH); - separators[i] = Arrays.copyOf(separators[i], MAX_SENTENCE_LENGTH); - } - - for (int j = 0; j < tokens[i].length; j++) { - while (tokens[i][j].endsWith(".")) { - tokens[i][j] = StringUtils.removeEnd(tokens[i][j], "."); - } - } - } - - for (int i = 0; i < tokens.length; i++) { - posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]); - } - - for (int i = 0; i < tokens.length; i++) { - tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]); - } - - for (int i = 0; i < tokens.length; i++) { - stemmedWords[i] = stemSentence(tokensLc[i]); - } - - DocumentSentence[] ret = new DocumentSentence[sentences.length]; - for (int i = 0; i < ret.length; i++) { - String fullString; - - if (i == 0) { - fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]); - } - else { - fullString = ""; - } - - List ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]); - - String[] ngramsWords = new String[ngrams.size()]; - String[] ngramsStemmedWords = new String[ngrams.size()]; - - for (int j = 0; j < ngrams.size(); j++) { - String[] ngram = ngrams.get(j); - - StringJoiner ngramJoiner = new StringJoiner("_"); - StringJoiner stemmedJoiner = new StringJoiner("_"); - for (String s : ngram) { - ngramJoiner.add(s); - stemmedJoiner.add(porterStemmer.stem(s)); - } - - ngramsWords[j] = ngramJoiner.toString(); - ngramsStemmedWords[j] = stemmedJoiner.toString(); - } - - - ret[i] = new DocumentSentence(fullString, - tokens[i], - separators[i], - tokensLc[i], - posTags[i], - stemmedWords[i], - ngramsWords, - ngramsStemmedWords - ); - } - return ret; - } - - 
private String[] stemSentence(String[] strings) { - String[] stemmed = new String[strings.length]; - for (int i = 0; i < stemmed.length; i++) { - var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]); try { - stemmed[i] = porterStemmer.stem(sent); + stemmed[i] = porterStemmer.stem(lc[i]); } catch (Exception ex) { stemmed[i] = "NN"; // ??? } } - return stemmed; + + return new DocumentSentence( + seps, + lc, + rdrposTagger.tagsForEnSentence(words), + stemmed, + htmlTags, + isCapitalized, + isAllCaps + ); } - public String asText(Document dc) { - String text = dc.getElementsByTag("body").text(); + public List extractSentencesFromString(String text, EnumSet htmlTags) { + String[] sentences; - if (text.length() > MAX_TEXT_LENGTH) { - return text.substring(0, MAX_TEXT_LENGTH); + // Normalize spaces + + text = normalizeSpaces(text); + + // Split into sentences + + try { + sentences = sentenceDetector.sentDetect(text); + } + catch (Exception ex) { + // shitty fallback logic + sentences = StringUtils.split(text, '.'); + } + + sentences = sentencePrecleaner.clean(sentences); + + // Truncate the number of sentences if it exceeds the maximum, to avoid + // excessive processing time on malformed data + + if (sentences.length > MAX_SENTENCE_COUNT) { + sentences = Arrays.copyOf(sentences, MAX_SENTENCE_COUNT); + } + + final boolean isNaturalLanguage = htmlTags.stream().noneMatch(tag -> tag.nonLanguage); + + List ret = new ArrayList<>(sentences.length); + + if (isNaturalLanguage) { + // Natural language text; do POS tagging and stemming + + for (String sent : sentences) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH); + var tokens = wordsAndSeps.words(); + var separators = wordsAndSeps.separators(); + var posTags = rdrposTagger.tagsForEnSentence(tokens); + var tokensLc = new String[tokens.length]; + var stemmed = new String[tokens.length]; + + BitSet isCapitalized = new BitSet(tokens.length); + BitSet isAllCaps = new 
BitSet(tokens.length); + + for (int i = 0; i < tokens.length; i++) { + if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) { + isCapitalized.set(i); + } + if (StringUtils.isAllUpperCase(tokens[i])) { + isAllCaps.set(i); + } + + var originalVal = tokens[i]; + var newVal = stripPossessive(originalVal.toLowerCase()); + + if (Objects.equals(originalVal, newVal)) { + tokensLc[i] = originalVal; + } else { + tokensLc[i] = newVal; + } + + try { + stemmed[i] = porterStemmer.stem(tokens[i]); + } + catch (Exception ex) { + stemmed[i] = "NN"; // ??? + } + } + ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps)); + } } else { - return text.substring(0, (int) (text.length() * 0.95)); + // non-language text, e.g. program code; don't bother with POS tagging or stemming + // as this is not likely to be useful + + for (String sent : sentences) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH); + var tokens = wordsAndSeps.words(); + var separators = wordsAndSeps.separators(); + var posTags = new String[tokens.length]; + Arrays.fill(posTags, "X"); // Placeholder POS tag + var tokensLc = new String[tokens.length]; + var stemmed = new String[tokens.length]; + + BitSet isCapitalized = new BitSet(tokens.length); + BitSet isAllCaps = new BitSet(tokens.length); + + for (int i = 0; i < tokensLc.length; i++) { + var originalVal = tokens[i]; + + if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) { + isCapitalized.set(i); + } + if (StringUtils.isAllUpperCase(tokens[i])) { + isAllCaps.set(i); + } + + if (StringUtils.isAllLowerCase(originalVal)) { + tokensLc[i] = originalVal; + } else { + tokensLc[i] = originalVal.toLowerCase(); + } + stemmed[i] = tokensLc[i]; // we don't stem non-language words + } + + ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps)); + } + } + + + return ret; } + public static String
normalizeSpaces(String s) { + if (s.indexOf('\t') >= 0) { + s = s.replace('\t', ' '); + } + if (s.indexOf('\n') >= 0) { + s = s.replace('\n', ' '); + } + return s; + } + + public static String stripPossessive(String s) { + int end = s.length(); + + if (s.endsWith("'")) { + return s.substring(0, end-1); + } + + if (s.endsWith("'s") || s.endsWith("'S")) { + return s.substring(0, end-2); + } + + return s; + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java deleted file mode 100644 index 63cd12e7..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java +++ /dev/null @@ -1,40 +0,0 @@ -package nu.marginalia.language.sentence; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.TextNode; - -import java.util.regex.Pattern; - -public class SentenceExtractorHtmlTagCleaner { - public final int MAX_CODE_TAG_LENGTH = 32; - public final Pattern codeTagJunkPattern = Pattern.compile("(\\.|<|>|<|>|\\([^)]*\\)[;]?$)"); - - public void clean(Document doc) { - cleanCodeTags(doc); - - doc.select("nav,form,input,code,body>title").remove(); - - // Create "sentences" out of elements that sometimes lack a period at the end to help - // NLP work better - doc.select("li,h1,h2,h3,h4,h5,h6,td,th,p,div,title").forEach(e -> e.appendText(". ")); - doc.select("br,hr").forEach(e -> e.prependText(". 
")); - } - - private void cleanCodeTags(Document doc) { - for (var codeTag : doc.getElementsByTag("code")) { - var text = codeTag.text(); - - if (text.length() <= MAX_CODE_TAG_LENGTH) { - codeTag.replaceWith(new TextNode(trimCodeTagContents(text))); - } - else { - codeTag.remove(); - } - - } - } - - private String trimCodeTagContents(String text) { - return codeTagJunkPattern.matcher(text).replaceAll(" "); - } -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java deleted file mode 100644 index 41f27c24..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.language.sentence; - -import java.util.Arrays; -import java.util.Objects; - -public class SentenceExtractorStringUtils { - - public static String sanitizeString(String s) { - char[] newChars = new char[s.length()]; - int pi = 0; - boolean changed = false; - for (int i = 0; i < newChars.length; i++) { - char c = s.charAt(i); - if (!isBadChar(c)) { - newChars[pi++] = c; - } - else { - changed = true; - newChars[pi++] = ' '; - } - } - - if (changed) { - s = new String(newChars, 0, pi); - } - - if (s.startsWith(".")) { - s = s.substring(1); - } - - if (s.isBlank()) { - return ""; - } - - return s; - - } - - private static boolean isBadChar(char c) { - if (c >= 'a' && c <= 'z') return false; - if (c >= 'A' && c <= 'Z') return false; - if (c >= '0' && c <= '9') return false; - if ("_#@.".indexOf(c) >= 0) return false; - if (c >= '\u00C0' && c <= '\u00D6') return false; - if (c >= '\u00D8' && c <= '\u00F6') return false; - if (c >= '\u00F8' && c <= '\u00FF') return false; - - return true; - } - - public static String normalizeSpaces(String s) { - if (s.indexOf('\t') >= 0) { - s = s.replace('\t', ' '); - } - if (s.indexOf('\n') 
>= 0) { - s = s.replace('\n', ' '); - } - return s; - } - - - public static String toLowerCaseStripPossessive(String word) { - String val = stripPossessive(word).toLowerCase(); - - if (Objects.equals(val, word)) { - return word; - } - - return val; - } - - public static String[] toLowerCaseStripPossessive(String[] words) { - String[] lc = new String[words.length]; - Arrays.setAll(lc, i ->SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i])); - return lc; - } - - public static String stripPossessive(String s) { - int end = s.length(); - - if (s.endsWith("'")) { - return s.substring(0, end-1); - } - - if (s.endsWith("'s") || s.endsWith("'S")) { - return s.substring(0, end-2); - } - - return s; - } - - -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java index c8d7ec39..4fbcd061 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java @@ -7,12 +7,9 @@ import java.util.regex.Pattern; public class SentencePreCleaner { private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)"); - private final int maxSentenceCount = 250; - private final int maxTotalLength = 20 * maxSentenceCount; public String[] clean(String[] sentences) { - int totalLength = 0; int sentenceCount = 0; List sentenceList = new ArrayList<>(); @@ -20,10 +17,9 @@ public class SentencePreCleaner { if (s.isBlank()) continue; - totalLength+=s.length(); sentenceCount++; - if (totalLength > maxTotalLength && sentenceCount++ > maxSentenceCount) { + if (sentenceCount++ > SentenceExtractor.MAX_SENTENCE_COUNT) { break; } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java 
b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index 7a0b49be..531f5189 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -2,25 +2,18 @@ package nu.marginalia.language.sentence; import com.google.common.base.CharMatcher; import gnu.trove.list.array.TIntArrayList; -import lombok.AllArgsConstructor; -import lombok.Getter; import nu.marginalia.language.encoding.AsciiFlattener; -import nu.marginalia.language.model.WordSeparator; import java.util.ArrayList; +import java.util.BitSet; import java.util.List; import java.util.regex.Pattern; -import static nu.marginalia.language.WordPatterns.*; +import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH; public class SentenceSegmentSplitter { - @AllArgsConstructor - @Getter - public static class SeparatedSentence { - String[] words; - int[] separators; - } + public record SeparatedSentence(String[] words, BitSet separators) { } private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-"); @@ -43,7 +36,7 @@ public class SentenceSegmentSplitter { * @param segment The sentence to split * @return A list of words and separators */ - public static SeparatedSentence splitSegment(String segment) { + public static SeparatedSentence splitSegment(String segment, int maxLength) { String flatSegment = AsciiFlattener.flattenUnicode(segment); var matcher = wordBreakPattern.matcher(flatSegment); @@ -77,7 +70,7 @@ public class SentenceSegmentSplitter { } List ret = new ArrayList<>(words.size()); - TIntArrayList seps = new TIntArrayList(words.size()); + BitSet seps = new BitSet(separators.size()); String[] parts = words.toArray(String[]::new); for (int i = 0; i < parts.length; i++) { @@ -89,7 +82,9 @@ public class SentenceSegmentSplitter { continue; ret.add(parts[i]); - 
seps.add(separators.getQuick(i)); + if (separators.getQuick(i) > 0) { + seps.set(i); + } } for (int i = 0; i < ret.size(); i++) { @@ -101,13 +96,26 @@ public class SentenceSegmentSplitter { if (part.endsWith("'") && part.length() > 1) { ret.set(i, part.substring(0, part.length()-1)); } + while (part.endsWith(".")) { + part = part.substring(0, part.length()-1); + ret.set(i, part); + } + } + + if (ret.size() > maxLength) { + ret.subList(maxLength, ret.size()).clear(); + seps = seps.get(0, maxLength); } return new SeparatedSentence( ret.toArray(String[]::new), - seps.toArray() + seps ); } + public static final class WordSeparator { + public static final int COMMA = 0; + public static final int SPACE = 1; + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java new file mode 100644 index 00000000..2454e889 --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java @@ -0,0 +1,122 @@ +package nu.marginalia.language.sentence.tag; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeVisitor; + +import java.util.*; + +/** A class that tags strings in an HTML document with the HTML tags that are active at that point in the document. 
*/ +public class HtmlStringTagger implements NodeVisitor { + private List tagStack = new ArrayList<>(8); + private Set stackTags = new HashSet<>(8); + private StringBuilder currentString = new StringBuilder(256); + + HtmlStringTagger() {} + + public static List tagDocumentStrings(Document document) { + var tagger = new HtmlStringTagger(); + document.traverse(tagger); + return tagger.getOutput(); + } + + private List output = new ArrayList<>(); + + public List getOutput() { + List compactedOutput = new ArrayList<>(output.size()); + + for (var ts : output) { + if (compactedOutput.isEmpty()) { + compactedOutput.add(ts); + } + else { + var last = compactedOutput.getLast(); + if (last.tags().equals(ts.tags())) { + last.append(ts.string()); + } + else { + compactedOutput.add(ts); + } + } + } + + return compactedOutput; + } + + + @Override + public void head(Node node, int i) { + if (node instanceof Element el) { + String tagName = el.tagName(); + switch (tagName) { + case "script" -> pushTag(HtmlTag.SCRIPT, el); + case "style" -> pushTag(HtmlTag.STYLE, el); + case "code" -> pushTag(HtmlTag.CODE, el); + case "title" -> pushTag(HtmlTag.TITLE, el); + case "nav" -> pushTag(HtmlTag.NAV, el); + case "header" -> pushTag(HtmlTag.HEADER, el); + case "footer" -> pushTag(HtmlTag.FOOTER, el); + case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el); + } + } + else if (node instanceof TextNode tn) { + if (shouldProcess()) { + String tnText = tn.text(); + if (!tnText.isBlank()) { + currentString = currentString.append(' ').append(tnText.trim()); + } + } + } + } + + @Override + public void tail(Node node, int i) { + if (!(node instanceof Element el)) + return; + + if (stackTags.remove(el)) { + output.add(new HtmlTaggedString(currentString, EnumSet.copyOf(tagStack))); + tagStack.removeLast(); + currentString = new StringBuilder(); + } + else if ("#root".equals(el.tagName())) { + closeOngoingTag(); + } + } + + private void pushTag(HtmlTag tag, Element el) { + closeOngoingTag(); +
+ tagStack.add(tag); + stackTags.add(el); + } + + private void closeOngoingTag() { + if (currentString.isEmpty()) { + return; + } + + EnumSet tags; + if (tagStack.isEmpty()) { + tags = EnumSet.noneOf(HtmlTag.class); + } + else { + tags = EnumSet.copyOf(tagStack); + } + + output.add(new HtmlTaggedString(currentString, tags)); + currentString = new StringBuilder(); + } + + public boolean shouldProcess() { + for (var tag : tagStack) { + if (tag.exclude) { + return false; + } + } + return true; + } + +} \ No newline at end of file diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java new file mode 100644 index 00000000..bc26e93e --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java @@ -0,0 +1,21 @@ +package nu.marginalia.language.sentence.tag; + +public enum HtmlTag { + SCRIPT(true, false), + STYLE(true, false), + CODE(false, true), + PRE(false, true), + TITLE(false, false), + HEADING(false, false), + NAV(false, false), + HEADER(false, false), + FOOTER(false, false); + + public boolean exclude; + public boolean nonLanguage; + + HtmlTag(boolean exclude, boolean nonLanguage) { + this.exclude = exclude; + this.nonLanguage = nonLanguage; + } +} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java new file mode 100644 index 00000000..80e8f4ee --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java @@ -0,0 +1,33 @@ +package nu.marginalia.language.sentence.tag; + +import java.util.EnumSet; + +public class HtmlTaggedString { + private StringBuilder string; + private final EnumSet tags; + + public HtmlTaggedString(StringBuilder string, EnumSet tags) { + this.tags = tags; 
+ this.string = string; + } + + public String string() { + return string.toString(); + } + + public EnumSet tags() { + return tags; + } + + public void append(String s) { + string.append(' ').append(s); + } + + public String toString() { + return "[" + tags.toString() + ":" + string.toString() + "]"; + } + + public int length() { + return string.length(); + } +} diff --git a/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java b/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java deleted file mode 100644 index dc21d379..00000000 --- a/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.language.encoding; - -import nu.marginalia.language.sentence.SentenceExtractorHtmlTagCleaner; -import org.jsoup.Jsoup; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -class SentenceExtractorHtmlTagCleanerTest { - - final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner(); - - public String cleanTag(String text) { - var doc = Jsoup.parse(text); - tagCleaner.clean(doc); - return doc.text(); - } - - @Test - public void testBriefCodeTag() { - assertEquals("hello", cleanTag("hello")); - assertEquals("System out println", cleanTag("System.out.println")); - assertEquals("hello", cleanTag("hello()")); - assertEquals("hello", cleanTag("<hello>")); - assertEquals("hello", cleanTag("hello(p,q)")); - assertEquals("hello", cleanTag("hello(p,q);")); - } -} \ No newline at end of file diff --git a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java index e4679db7..b6918eee 100644 --- 
a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java +++ b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java @@ -1,14 +1,17 @@ package nu.marginalia.language.sentence; import nu.marginalia.WmsaHome; +import nu.marginalia.language.sentence.tag.HtmlTag; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.IOException; +import java.util.EnumSet; import java.util.Objects; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; class SentenceExtractorTest { private static SentenceExtractor sentenceExtractor; @@ -20,26 +23,25 @@ class SentenceExtractorTest { @Test void testParen() { - var dld = sentenceExtractor.extractSentence("I am (very) tall"); + var dld = sentenceExtractor.extractSentence("I am (very) tall", EnumSet.noneOf(HtmlTag.class)); System.out.println(dld); } @Test void testPolishArtist() { - var dld = sentenceExtractor.extractSentence("Uklański"); + var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class)); - assertEquals(1, dld.words.length); - assertEquals("Uklanski", dld.words[0]); + assertEquals(1, dld.wordsLowerCase.length); assertEquals("uklanski", dld.wordsLowerCase[0]); } @Test void testJava() { - var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API"); + var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class)); - assertEquals(4, dld.words.length); - assertArrayEquals(new String[] {"Foreign", "Function", "Memory", "API"}, dld.words); + assertEquals(4, dld.wordsLowerCase.length); + assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase); } @Test @@ -77,10 +79,9 @@ class SentenceExtractorTest { } @Test void testApostrophe() { - var dld = 
sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun"); - assertEquals(7, dld.words.length); + var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class)); + assertEquals(7, dld.wordsLowerCase.length); - assertArrayEquals(new String[] { "duke", "nuke", "em's", "big", "ol", "big", "gun"}, dld.words); assertArrayEquals(new String[] { "duke", "nuke", "em", "big", "ol", "big", "gun"}, dld.wordsLowerCase); } } \ No newline at end of file diff --git a/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java b/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java new file mode 100644 index 00000000..d550ee1e --- /dev/null +++ b/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java @@ -0,0 +1,29 @@ +package nu.marginalia.language.sentence.tag; + +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +class HtmlStringTaggerTest { + @Test + public void test() { + String html = """ + + + + T Example + + +

H1 Example

+

This is an example.

+

Here is more text.

+

And more text with a link and more text.

+ #include <stdlib.h> +

Good bye

+ + """; + var visitor = new HtmlStringTagger(); + Jsoup.parse(html).traverse(visitor); + + visitor.getOutput().forEach(ts -> System.out.println(ts.string() + " " + ts.tags())); + } +} \ No newline at end of file diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java index 920da41c..b5570b86 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java @@ -39,10 +39,6 @@ public class TitleExtractor { title = getFirstTagText(doc, "h5"); if (title != null) return title; - if (dld.sentences.length > 0) { - return dld.sentences[0].originalSentence; - } - return url; }