diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java index 95e37836..4b9ce5fb 100644 --- a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java +++ b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.model.EdgeUrl; import java.io.BufferedReader; @@ -55,7 +56,7 @@ public class AnchorTextKeywords { if (stopList.contains(keyword.text().toLowerCase())) continue; - var sentence = sentenceExtractor.extractSentence(keyword.text()); + var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.noneOf(HtmlTag.class)); for (var wordSpan : keywordExtractor.getKeywordsFromSentence(sentence)) { wordsWithCount.merge(sentence.constructWordFromSpan(wordSpan), 1, Integer::sum); } diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index 3bcc9cf2..998e94a4 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -27,7 +27,6 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.nio.file.attribute.PosixFilePermissions; -import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -124,10 +123,6 @@ public class TermFrequencyExporter implements ExporterIf { for (var word : 
sent) { words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); } - - for (var ngram : sent.ngramStemmed) { - words.add(longHash(ngram.getBytes())); - } } synchronized (counts) { diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index facb601f..ebaa76f5 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -134,15 +134,6 @@ public class DocumentKeywordExtractor { wordsBuilder.addMeta(rep.word, meta); } - for (int i = 0; i < sent.ngrams.length; i++) { - var ngram = sent.ngrams[i]; - var ngramStemmed = sent.ngramStemmed[i]; - - long meta = metadata.getMetadataForWord(ngramStemmed); - - wordsBuilder.addMeta(ngram, meta); - } - } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java index e1990618..babd44d7 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java @@ -3,7 +3,6 @@ package nu.marginalia.keyword; import nu.marginalia.language.WordPatterns; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordSpan; -import nu.marginalia.language.model.WordSeparator; import java.lang.ref.SoftReference; import java.util.ArrayList; @@ -20,15 +19,15 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isProperNoun(i, sentence) && isProperNoun(i-1, sentence)) spans.add(new 
WordSpan(i-1, i+1)); } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } + if (sentence.isSeparatorComma(i-1)) { i++; continue; } if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) @@ -37,9 +36,9 @@ public class KeywordExtractor { } for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) @@ -66,7 +65,7 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isNoun(i, sentence) && (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) { @@ -75,8 +74,8 @@ public class KeywordExtractor { } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } + if (sentence.isSeparatorComma(i-1)) { i++; continue; } if ((isNoun(i, sentence)) && (isJoiner(sentence, i-1) || isNoun(i-1, sentence)) @@ -85,9 +84,9 @@ public class KeywordExtractor { } for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; 
continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) { if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence)) @@ -119,7 +118,7 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isName(i, sentence)) { if (isName(i - 1, sentence) || isTopAdj(i-1, sentence)) @@ -131,8 +130,8 @@ public class KeywordExtractor { } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { i++; continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } if (isName(i, sentence)) { if ((isName(i-1, sentence) || isTopAdj(i-1, sentence)) @@ -149,9 +148,9 @@ public class KeywordExtractor { } for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } if (isName(i, sentence) && (isName(i-1, sentence) || isTopAdj(i-1, sentence)) && @@ -217,7 +216,7 @@ public class KeywordExtractor { private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) { for (int i = w.start; i < w.end-1; i++) { - if (sentence.separators[i] == WordSeparator.COMMA) { + if (sentence.isSeparatorComma(i)) { return false; } } diff --git 
a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java index c033bdc1..9b2d8b85 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java @@ -1,13 +1,12 @@ package nu.marginalia.keyword.extractors; -import com.google.common.base.CharMatcher; import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordRep; -import nu.marginalia.keyword.KeywordExtractor; import java.util.*; import java.util.stream.Collectors; @@ -21,13 +20,11 @@ public class NameLikeKeywords implements WordReps { Object2IntOpenHashMap counts = new Object2IntOpenHashMap<>(1000); HashMap> instances = new HashMap<>(1000); - final var isUpperCase = CharMatcher.forPredicate(Character::isUpperCase); - for (int i = 0; i < dld.sentences.length; i++) { DocumentSentence sent = dld.sentences[i]; var keywords = keywordExtractor.getProperNames(sent); for (var span : keywords) { - if (span.size() <= 1 && isUpperCase.matchesAllOf(sent.words[span.start])) + if (span.size() <= 1 && sent.isAllCaps(span.start)) continue; var stemmed = sent.constructStemmedWordFromSpan(span); diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java index d4a6e428..95dbf5bc 100644 --- 
a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java @@ -6,7 +6,6 @@ import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordSpan; -import nu.marginalia.language.model.WordSeparator; import org.apache.commons.lang3.StringUtils; import java.util.*; @@ -36,8 +35,7 @@ public class SubjectLikeKeywords implements WordReps { if (kw.end + 2 >= sentence.length()) { continue; } - if (sentence.separators[kw.end] == WordSeparator.COMMA - || sentence.separators[kw.end + 1] == WordSeparator.COMMA) + if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1)) continue; String nextTag = sentence.posTags[kw.end]; diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java index e1c7eceb..846225c2 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java @@ -1,11 +1,11 @@ package nu.marginalia.keyword.extractors; -import nu.marginalia.keyword.WordReps; import nu.marginalia.keyword.KeywordExtractor; +import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.sentence.tag.HtmlTag; -import java.util.Arrays; import java.util.Collection; import java.util.Set; import java.util.stream.Collectors; @@ -16,7 +16,8 @@ public class TitleKeywords implements WordReps { private final Set stemmed; public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData 
documentLanguageData) { - titleKeywords = Arrays.stream(documentLanguageData.titleSentences).flatMap(sent -> + titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream() + .flatMap(sent -> keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w))) .limit(100) .collect(Collectors.toSet()); diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java index 34b1b7af..fe868e68 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java @@ -2,11 +2,11 @@ package nu.marginalia.keyword; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; -import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.segmentation.NgramLexicon; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.WmsaHome; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.test.util.TestLanguageModels; import org.jsoup.Jsoup; import org.junit.jupiter.api.Tag; @@ -59,8 +59,8 @@ class SentenceExtractorTest { @Test public void testACDC() { - var ret = se.extractSentence("AC/DC is a rock band."); - assertEquals("AC/DC", ret.words[0]); + var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class)); + assertEquals("ac/dc", ret.wordsLowerCase[0]); } final Pattern p = Pattern.compile("([, ]+)"); diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java 
b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java index cac29c73..49a555de 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java @@ -190,7 +190,9 @@ class TitleKeywordsTest { public void extractTitleWords() { var se = new SentenceExtractor(TestLanguageModels.getLanguageModels()); - var reps = new TitleKeywords(new KeywordExtractor(), se.extractSentences(Jsoup.parse(document))).getReps(); + var dld = se.extractSentences(Jsoup.parse(document)); + + var reps = new TitleKeywords(new KeywordExtractor(), dld).getReps(); var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet()); Set expected = Set.of( diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java index 80f05808..79179524 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java @@ -2,10 +2,10 @@ package nu.marginalia.functions.searchquery.query_parser; import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.encoding.AsciiFlattener; -import nu.marginalia.language.sentence.SentenceExtractorStringUtils; import java.util.ArrayList; import java.util.List; +import java.util.Objects; import java.util.regex.Pattern; public class QueryTokenizer { @@ -55,7 +55,7 @@ public class QueryTokenizer { } String displayStr = query.substring(i, end); - String str = SentenceExtractorStringUtils.toLowerCaseStripPossessive(displayStr); + String str = toLowerCaseStripPossessive(displayStr); tokens.add(new 
QueryToken.LiteralTerm(str, displayStr)); @@ -65,5 +65,27 @@ public class QueryTokenizer { return tokens; } + public static String toLowerCaseStripPossessive(String word) { + String val = stripPossessive(word).toLowerCase(); + if (Objects.equals(val, word)) { + return word; + } + + return val; + } + + public static String stripPossessive(String s) { + int end = s.length(); + + if (s.endsWith("'")) { + return s.substring(0, end-1); + } + + if (s.endsWith("'s") || s.endsWith("'S")) { + return s.substring(0, end-2); + } + + return s; + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java index 2ad53f7a..99cdadeb 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java @@ -1,49 +1,41 @@ package nu.marginalia.language.model; -import gnu.trove.map.hash.TObjectIntHashMap; -import lombok.AllArgsConstructor; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.lsh.EasyLSH; import java.util.Arrays; -import java.util.stream.Stream; +import java.util.List; -/** +/** Holds the sentences and text of a document, decorated with + * HTML tags, POS tags, and other information. 
+ * * @see SentenceExtractor */ -@AllArgsConstructor public class DocumentLanguageData { public final DocumentSentence[] sentences; - public final DocumentSentence[] titleSentences; - public final TObjectIntHashMap wordCount; public final String text; - /** for test convenience */ - public static DocumentLanguageData empty() { - return new DocumentLanguageData( - new DocumentSentence[0], - new DocumentSentence[0], - new TObjectIntHashMap<>(), - "" - ); + public DocumentLanguageData(List sentences, + String text) { + this.sentences = sentences.toArray(DocumentSentence[]::new); + this.text = text; + } + + public List findSentencesForTag(HtmlTag tag) { + return Arrays.stream(sentences).filter(s -> s.htmlTags.contains(tag)).toList(); } public int totalNumWords() { int ret = 0; + for (int i = 0; i < sentences.length; i++) { ret += sentences[i].length(); } + return ret; } - public Stream streamLowerCase() { - return Arrays.stream(sentences).map(sent -> sent.wordsLowerCase).flatMap(Arrays::stream); - } - - public Stream stream() { - return Arrays.stream(sentences).map(sent -> sent.words).flatMap(Arrays::stream); - } - public long localitySensitiveHashCode() { var hash = new EasyLSH(); diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java index b9b4abce..4bd6ae1b 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java @@ -2,52 +2,55 @@ package nu.marginalia.language.model; import nu.marginalia.language.WordPatterns; +import nu.marginalia.language.sentence.tag.HtmlTag; import org.jetbrains.annotations.NotNull; import java.lang.ref.SoftReference; import java.util.BitSet; +import java.util.EnumSet; import java.util.Iterator; import java.util.StringJoiner; public class DocumentSentence 
implements Iterable{ - public final String originalSentence; - public final String[] words; - public final int[] separators; + + /** Lowercase words of the sentence */ + public final String[] wordsLowerCase; public final String[] posTags; public final String[] stemmedWords; - public final String[] ngrams; - public final String[] ngramStemmed; + + public final EnumSet htmlTags; private final BitSet isStopWord; + private final BitSet separators; + private final BitSet isCapitalized; + private final BitSet isAllCaps; + public SoftReference keywords; - public DocumentSentence(String originalSentence, - String[] words, - int[] separators, + public DocumentSentence(BitSet separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords, - String[] ngrams, - String[] ngramsStemmed + EnumSet htmlTags, + BitSet isCapitalized, + BitSet isAllCaps ) { - this.originalSentence = originalSentence; - this.words = words; this.separators = separators; this.wordsLowerCase = wordsLowerCase; this.posTags = posTags; this.stemmedWords = stemmedWords; + this.htmlTags = htmlTags; + this.isCapitalized = isCapitalized; + this.isAllCaps = isAllCaps; - isStopWord = new BitSet(words.length); + isStopWord = new BitSet(wordsLowerCase.length); - this.ngrams = ngrams; - this.ngramStemmed = ngramsStemmed; - - for (int i = 0; i < words.length; i++) { - if (WordPatterns.isStopWord(words[i])) + for (int i = 0; i < wordsLowerCase.length; i++) { + if (WordPatterns.isStopWord(wordsLowerCase[i])) isStopWord.set(i); } } @@ -55,14 +58,22 @@ public class DocumentSentence implements Iterable{ public boolean isStopWord(int idx) { return isStopWord.get(idx); } - public void setIsStopWord(int idx, boolean val) { - if (val) - isStopWord.set(idx); - else - isStopWord.clear(); - } + public int length() { - return words.length; + return wordsLowerCase.length; + } + + public boolean isCapitalized(int i) { + return isCapitalized.get(i); + } + public boolean isAllCaps(int i) { + return isAllCaps.get(i); + } + 
public boolean isSeparatorSpace(int i) { + return separators.get(i); + } + public boolean isSeparatorComma(int i) { + return !separators.get(i); } public String constructWordFromSpan(WordSpan span) { @@ -140,9 +151,9 @@ public class DocumentSentence implements Iterable{ @Override public String toString() { StringBuilder sb = new StringBuilder(); - for (int i = 0; i < words.length; i++) { - sb.append(words[i]).append('[').append(posTags[i]).append(']'); - if (separators[i] == WordSeparator.COMMA) { + for (int i = 0; i < wordsLowerCase.length; i++) { + sb.append(wordsLowerCase[i]).append('[').append(posTags[i]).append(']'); + if (isSeparatorComma(i)) { sb.append(','); } else { @@ -176,11 +187,10 @@ public class DocumentSentence implements Iterable{ this.pos = pos; } - public String word() { return words[pos]; } + public String word() { return wordsLowerCase[pos]; } public String wordLowerCase() { return wordsLowerCase[pos]; } public String posTag() { return posTags[pos]; } public String stemmed() { return stemmedWords[pos]; } - public int separator() { return separators[pos]; } public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); } public WordRep rep() { diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java b/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java deleted file mode 100644 index 3476073f..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.language.model; - -public final class WordSeparator { - public static final int COMMA = 0; - public static final int SPACE = 1; -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java index 8dd818a3..48d709f3 100644 --- 
a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -1,23 +1,23 @@ package nu.marginalia.language.sentence; import com.github.datquocnguyen.RDRPOSTagger; -import gnu.trove.map.hash.TObjectIntHashMap; +import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; -import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.sentence.tag.HtmlStringTagger; +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.language.sentence.tag.HtmlTaggedString; +import nu.marginalia.segmentation.NgramLexicon; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; import org.apache.commons.lang3.StringUtils; -import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.inject.Inject; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; @@ -38,14 +38,13 @@ public class SentenceExtractor { private final PorterStemmer porterStemmer = new PorterStemmer(); private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class); - private static final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner(); private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner(); /* Truncate sentences longer than this. This is mostly a defense measure against malformed data * that might otherwise use an undue amount of processing power. 250 words is about 10X longer than * this comment. 
*/ - private static final int MAX_SENTENCE_LENGTH = 250; - private static final int MAX_TEXT_LENGTH = 65536; + static final int MAX_SENTENCE_LENGTH = 250; + static final int MAX_SENTENCE_COUNT = 1000; @SneakyThrows @Inject public SentenceExtractor(LanguageModels models) @@ -75,219 +74,224 @@ public class SentenceExtractor { } + + public DocumentLanguageData extractSentences(Document doc) { - var clone = doc.clone(); - tagCleaner.clean(clone); - final String text = asText(clone); - final DocumentSentence[] textSentences = extractSentencesFromString(text); + final List taggedStrings = HtmlStringTagger.tagDocumentStrings(doc); + final List textSentences = new ArrayList<>(); - String title = getTitle(clone, textSentences); + final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum(); + final StringBuilder documentText = new StringBuilder(totalTextLength + taggedStrings.size()); - TObjectIntHashMap counts = calculateWordCounts(textSentences); - var titleSentences = extractSentencesFromString(title.toLowerCase()); - return new DocumentLanguageData(textSentences, titleSentences, counts, text); + for (var taggedString : taggedStrings) { + String text = taggedString.string(); + + textSentences.addAll( + extractSentencesFromString(text, taggedString.tags()) + ); + + if (documentText.isEmpty()) { + documentText.append(text); + } + else { + documentText.append(' ').append(text); + } + } + + return new DocumentLanguageData(textSentences, documentText.toString()); } public DocumentLanguageData extractSentences(String text, String title) { - final DocumentSentence[] textSentences = extractSentencesFromString(text); + var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class)); + var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE)); - TObjectIntHashMap counts = calculateWordCounts(textSentences); - var titleSentences = extractSentencesFromString(title.toLowerCase()); - return new 
DocumentLanguageData(textSentences, titleSentences, counts, text); + List combined = new ArrayList<>(textSentences.size() + titleSentences.size()); + combined.addAll(titleSentences); + combined.addAll(textSentences); + + return new DocumentLanguageData( + combined, + text); } - private String getTitle(Document doc, DocumentSentence[] textSentences) { - String title = doc.getElementsByTag("title").text() + " . " + - Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse(""); + public DocumentSentence extractSentence(String text, EnumSet htmlTags) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH); - if (title.trim().length() < 3) { - title = doc.getElementsByTag("h2").text(); - } + String[] words = wordsAndSeps.words(); + BitSet seps = wordsAndSeps.separators(); + String[] lc = new String[words.length]; + String[] stemmed = new String[words.length]; - if (title.trim().length() < 3) { - for (DocumentSentence textSentence : textSentences) { - if (textSentence.length() > 0) { - title = textSentence.originalSentence.toLowerCase(); - break; - } + BitSet isCapitalized = new BitSet(words.length); + BitSet isAllCaps = new BitSet(words.length); + + for (int i = 0; i < words.length; i++) { + lc[i] = stripPossessive(words[i].toLowerCase()); + + if (words[i].length() > 0 && Character.isUpperCase(words[i].charAt(0))) { + isCapitalized.set(i); } - } - - return title; - } - - - @NotNull - private TObjectIntHashMap calculateWordCounts(DocumentSentence[] textSentences) { - TObjectIntHashMap counts = new TObjectIntHashMap<>(textSentences.length*10, 0.5f, 0); - - for (var sent : textSentences) { - for (var word : sent.stemmedWords) { - counts.adjustOrPutValue(word, 1, 1); - } - } - return counts; - } - - public DocumentSentence extractSentence(String text) { - var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text); - - var words = wordsAndSeps.words; - var seps = wordsAndSeps.separators; - var lc = 
SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words); - - List ngrams = ngramLexicon.findSegmentsStrings(2, 12, words); - - String[] ngramsWords = new String[ngrams.size()]; - String[] ngramsStemmedWords = new String[ngrams.size()]; - for (int i = 0; i < ngrams.size(); i++) { - String[] ngram = ngrams.get(i); - - StringJoiner ngramJoiner = new StringJoiner("_"); - StringJoiner stemmedJoiner = new StringJoiner("_"); - for (String s : ngram) { - ngramJoiner.add(s); - stemmedJoiner.add(porterStemmer.stem(s)); + if (StringUtils.isAllUpperCase(words[i])) { + isAllCaps.set(i); } - ngramsWords[i] = ngramJoiner.toString(); - ngramsStemmedWords[i] = stemmedJoiner.toString(); - } - - - return new DocumentSentence( - SentenceExtractorStringUtils.sanitizeString(text), - words, - seps, - lc, - rdrposTagger.tagsForEnSentence(words), - stemSentence(lc), - ngramsWords, - ngramsStemmedWords - ); - } - - public DocumentSentence[] extractSentencesFromString(String text) { - String[] sentences; - - String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text); - - try { - sentences = sentenceDetector.sentDetect(textNormalizedSpaces); - } - catch (Exception ex) { - // shitty fallback logic - sentences = StringUtils.split(textNormalizedSpaces, '.'); - } - - sentences = sentencePrecleaner.clean(sentences); - - final String[][] tokens = new String[sentences.length][]; - final int[][] separators = new int[sentences.length][]; - final String[][] posTags = new String[sentences.length][]; - final String[][] tokensLc = new String[sentences.length][]; - final String[][] stemmedWords = new String[sentences.length][]; - - for (int i = 0; i < tokens.length; i++) { - - var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]); - tokens[i] = wordsAndSeps.words; - separators[i] = wordsAndSeps.separators; - - if (tokens[i].length > MAX_SENTENCE_LENGTH) { - tokens[i] = Arrays.copyOf(tokens[i], MAX_SENTENCE_LENGTH); - separators[i] = 
Arrays.copyOf(separators[i], MAX_SENTENCE_LENGTH); - } - - for (int j = 0; j < tokens[i].length; j++) { - while (tokens[i][j].endsWith(".")) { - tokens[i][j] = StringUtils.removeEnd(tokens[i][j], "."); - } - } - } - - for (int i = 0; i < tokens.length; i++) { - posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]); - } - - for (int i = 0; i < tokens.length; i++) { - tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]); - } - - for (int i = 0; i < tokens.length; i++) { - stemmedWords[i] = stemSentence(tokensLc[i]); - } - - DocumentSentence[] ret = new DocumentSentence[sentences.length]; - for (int i = 0; i < ret.length; i++) { - String fullString; - - if (i == 0) { - fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]); - } - else { - fullString = ""; - } - - List ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]); - - String[] ngramsWords = new String[ngrams.size()]; - String[] ngramsStemmedWords = new String[ngrams.size()]; - - for (int j = 0; j < ngrams.size(); j++) { - String[] ngram = ngrams.get(j); - - StringJoiner ngramJoiner = new StringJoiner("_"); - StringJoiner stemmedJoiner = new StringJoiner("_"); - for (String s : ngram) { - ngramJoiner.add(s); - stemmedJoiner.add(porterStemmer.stem(s)); - } - - ngramsWords[j] = ngramJoiner.toString(); - ngramsStemmedWords[j] = stemmedJoiner.toString(); - } - - - ret[i] = new DocumentSentence(fullString, - tokens[i], - separators[i], - tokensLc[i], - posTags[i], - stemmedWords[i], - ngramsWords, - ngramsStemmedWords - ); - } - return ret; - } - - private String[] stemSentence(String[] strings) { - String[] stemmed = new String[strings.length]; - for (int i = 0; i < stemmed.length; i++) { - var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]); try { - stemmed[i] = porterStemmer.stem(sent); + stemmed[i] = porterStemmer.stem(lc[i]); } catch (Exception ex) { stemmed[i] = "NN"; // ??? 
} } - return stemmed; + + return new DocumentSentence( + seps, + lc, + rdrposTagger.tagsForEnSentence(words), + stemmed, + htmlTags, + isCapitalized, + isAllCaps + ); } - public String asText(Document dc) { - String text = dc.getElementsByTag("body").text(); + public List extractSentencesFromString(String text, EnumSet htmlTags) { + String[] sentences; - if (text.length() > MAX_TEXT_LENGTH) { - return text.substring(0, MAX_TEXT_LENGTH); + // Normalize spaces + + text = normalizeSpaces(text); + + // Split into sentences + + try { + sentences = sentenceDetector.sentDetect(text); + } + catch (Exception ex) { + // shitty fallback logic + sentences = StringUtils.split(text, '.'); + } + + sentences = sentencePrecleaner.clean(sentences); + + // Truncate the number of sentences if it exceeds the maximum, to avoid + // excessive processing time on malformed data + + if (sentences.length > MAX_SENTENCE_COUNT) { + sentences = Arrays.copyOf(sentences, MAX_SENTENCE_COUNT); + } + + final boolean isNaturalLanguage = htmlTags.stream().noneMatch(tag -> tag.nonLanguage); + + List ret = new ArrayList<>(sentences.length); + + if (isNaturalLanguage) { + // Natural language text; do POS tagging and stemming + + for (String sent : sentences) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH); + var tokens = wordsAndSeps.words(); + var separators = wordsAndSeps.separators(); + var posTags = rdrposTagger.tagsForEnSentence(tokens); + var tokensLc = new String[tokens.length]; + var stemmed = new String[tokens.length]; + + BitSet isCapitalized = new BitSet(tokens.length); + BitSet isAllCaps = new BitSet(tokens.length); + + for (int i = 0; i < tokens.length; i++) { + if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) { + isCapitalized.set(i); + } + if (StringUtils.isAllUpperCase(tokens[i])) { + isAllCaps.set(i); + } + + var originalVal = tokens[i]; + var newVal = stripPossessive(originalVal.toLowerCase()); + + if 
(Objects.equals(originalVal, newVal)) { + tokensLc[i] = originalVal; + } else { + tokensLc[i] = newVal; + } + + try { + stemmed[i] = porterStemmer.stem(tokensLc[i]); + } + catch (Exception ex) { + stemmed[i] = "NN"; // ??? + } + } + ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps)); + } } else { - return text.substring(0, (int) (text.length() * 0.95)); + // non-language text, e.g. program code; don't bother with POS tagging or stemming + // as this is not likely to be useful + + for (String sent : sentences) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH); + var tokens = wordsAndSeps.words(); + var separators = wordsAndSeps.separators(); + var posTags = new String[tokens.length]; + Arrays.fill(posTags, "X"); // Placeholder POS tag + var tokensLc = new String[tokens.length]; + var stemmed = new String[tokens.length]; + + BitSet isCapitalized = new BitSet(tokens.length); + BitSet isAllCaps = new BitSet(tokens.length); + + for (int i = 0; i < tokensLc.length; i++) { + var originalVal = tokens[i]; + + if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) { + isCapitalized.set(i); + } + if (StringUtils.isAllUpperCase(tokens[i])) { + isAllCaps.set(i); + } + + if (StringUtils.isAllLowerCase(originalVal)) { + tokensLc[i] = originalVal; + } else { + tokensLc[i] = originalVal.toLowerCase(); + } + stemmed[i] = tokensLc[i]; // we don't stem non-language words + } + + ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps)); + } + } + + + return ret; } + public static String normalizeSpaces(String s) { + if (s.indexOf('\t') >= 0) { + s = s.replace('\t', ' '); + } + if (s.indexOf('\n') >= 0) { + s = s.replace('\n', ' '); + } + return s; + } + + public static String stripPossessive(String s) { + int end = s.length(); + + if (s.endsWith("'")) { + return s.substring(0, end-1); + } + + if (s.endsWith("'s") || 
s.endsWith("'S")) { + return s.substring(0, end-2); + } + + return s; + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java deleted file mode 100644 index 63cd12e7..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java +++ /dev/null @@ -1,40 +0,0 @@ -package nu.marginalia.language.sentence; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.TextNode; - -import java.util.regex.Pattern; - -public class SentenceExtractorHtmlTagCleaner { - public final int MAX_CODE_TAG_LENGTH = 32; - public final Pattern codeTagJunkPattern = Pattern.compile("(\\.|<|>|<|>|\\([^)]*\\)[;]?$)"); - - public void clean(Document doc) { - cleanCodeTags(doc); - - doc.select("nav,form,input,code,body>title").remove(); - - // Create "sentences" out of elements that sometimes lack a period at the end to help - // NLP work better - doc.select("li,h1,h2,h3,h4,h5,h6,td,th,p,div,title").forEach(e -> e.appendText(". ")); - doc.select("br,hr").forEach(e -> e.prependText(". 
")); - } - - private void cleanCodeTags(Document doc) { - for (var codeTag : doc.getElementsByTag("code")) { - var text = codeTag.text(); - - if (text.length() <= MAX_CODE_TAG_LENGTH) { - codeTag.replaceWith(new TextNode(trimCodeTagContents(text))); - } - else { - codeTag.remove(); - } - - } - } - - private String trimCodeTagContents(String text) { - return codeTagJunkPattern.matcher(text).replaceAll(" "); - } -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java deleted file mode 100644 index 41f27c24..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.language.sentence; - -import java.util.Arrays; -import java.util.Objects; - -public class SentenceExtractorStringUtils { - - public static String sanitizeString(String s) { - char[] newChars = new char[s.length()]; - int pi = 0; - boolean changed = false; - for (int i = 0; i < newChars.length; i++) { - char c = s.charAt(i); - if (!isBadChar(c)) { - newChars[pi++] = c; - } - else { - changed = true; - newChars[pi++] = ' '; - } - } - - if (changed) { - s = new String(newChars, 0, pi); - } - - if (s.startsWith(".")) { - s = s.substring(1); - } - - if (s.isBlank()) { - return ""; - } - - return s; - - } - - private static boolean isBadChar(char c) { - if (c >= 'a' && c <= 'z') return false; - if (c >= 'A' && c <= 'Z') return false; - if (c >= '0' && c <= '9') return false; - if ("_#@.".indexOf(c) >= 0) return false; - if (c >= '\u00C0' && c <= '\u00D6') return false; - if (c >= '\u00D8' && c <= '\u00F6') return false; - if (c >= '\u00F8' && c <= '\u00FF') return false; - - return true; - } - - public static String normalizeSpaces(String s) { - if (s.indexOf('\t') >= 0) { - s = s.replace('\t', ' '); - } - if (s.indexOf('\n') 
>= 0) { - s = s.replace('\n', ' '); - } - return s; - } - - - public static String toLowerCaseStripPossessive(String word) { - String val = stripPossessive(word).toLowerCase(); - - if (Objects.equals(val, word)) { - return word; - } - - return val; - } - - public static String[] toLowerCaseStripPossessive(String[] words) { - String[] lc = new String[words.length]; - Arrays.setAll(lc, i ->SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i])); - return lc; - } - - public static String stripPossessive(String s) { - int end = s.length(); - - if (s.endsWith("'")) { - return s.substring(0, end-1); - } - - if (s.endsWith("'s") || s.endsWith("'S")) { - return s.substring(0, end-2); - } - - return s; - } - - -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java index c8d7ec39..4fbcd061 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java @@ -7,12 +7,9 @@ import java.util.regex.Pattern; public class SentencePreCleaner { private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)"); - private final int maxSentenceCount = 250; - private final int maxTotalLength = 20 * maxSentenceCount; public String[] clean(String[] sentences) { - int totalLength = 0; int sentenceCount = 0; List sentenceList = new ArrayList<>(); @@ -20,10 +17,9 @@ public class SentencePreCleaner { if (s.isBlank()) continue; - totalLength+=s.length(); sentenceCount++; - if (totalLength > maxTotalLength && sentenceCount++ > maxSentenceCount) { + if (sentenceCount++ > SentenceExtractor.MAX_SENTENCE_COUNT) { break; } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java 
b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index 7a0b49be..531f5189 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -2,25 +2,18 @@ package nu.marginalia.language.sentence; import com.google.common.base.CharMatcher; import gnu.trove.list.array.TIntArrayList; -import lombok.AllArgsConstructor; -import lombok.Getter; import nu.marginalia.language.encoding.AsciiFlattener; -import nu.marginalia.language.model.WordSeparator; import java.util.ArrayList; +import java.util.BitSet; import java.util.List; import java.util.regex.Pattern; -import static nu.marginalia.language.WordPatterns.*; +import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH; public class SentenceSegmentSplitter { - @AllArgsConstructor - @Getter - public static class SeparatedSentence { - String[] words; - int[] separators; - } + public record SeparatedSentence(String[] words, BitSet separators) { } private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-"); @@ -43,7 +36,7 @@ public class SentenceSegmentSplitter { * @param segment The sentence to split * @return A list of words and separators */ - public static SeparatedSentence splitSegment(String segment) { + public static SeparatedSentence splitSegment(String segment, int maxLength) { String flatSegment = AsciiFlattener.flattenUnicode(segment); var matcher = wordBreakPattern.matcher(flatSegment); @@ -77,7 +70,7 @@ public class SentenceSegmentSplitter { } List ret = new ArrayList<>(words.size()); - TIntArrayList seps = new TIntArrayList(words.size()); + BitSet seps = new BitSet(separators.size()); String[] parts = words.toArray(String[]::new); for (int i = 0; i < parts.length; i++) { @@ -89,7 +82,9 @@ public class SentenceSegmentSplitter { continue; ret.add(parts[i]); - 
seps.add(separators.getQuick(i)); + if (separators.getQuick(i) > 0) { + seps.set(i); + } } for (int i = 0; i < ret.size(); i++) { @@ -101,13 +96,26 @@ public class SentenceSegmentSplitter { if (part.endsWith("'") && part.length() > 1) { ret.set(i, part.substring(0, part.length()-1)); } + while (part.endsWith(".")) { + part = part.substring(0, part.length()-1); + ret.set(i, part); + } + } + + if (ret.size() > maxLength) { + ret.subList(maxLength, ret.size()).clear(); + seps = seps.get(0, maxLength); } return new SeparatedSentence( ret.toArray(String[]::new), - seps.toArray() + seps ); } + public static final class WordSeparator { + public static final int COMMA = 0; + public static final int SPACE = 1; + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java new file mode 100644 index 00000000..2454e889 --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java @@ -0,0 +1,122 @@ +package nu.marginalia.language.sentence.tag; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeVisitor; + +import java.util.*; + +/** A class that tags strings in an HTML document with the HTML tags that are active at that point in the document. 
*/ +public class HtmlStringTagger implements NodeVisitor { + private List<HtmlTag> tagStack = new ArrayList<>(8); + private Set<Element> stackTags = new HashSet<>(8); + private StringBuilder currentString = new StringBuilder(256); + + HtmlStringTagger() {} + + public static List<HtmlTaggedString> tagDocumentStrings(Document document) { + var tagger = new HtmlStringTagger(); + document.traverse(tagger); + return tagger.getOutput(); + } + + private List<HtmlTaggedString> output = new ArrayList<>(); + + public List<HtmlTaggedString> getOutput() { + List<HtmlTaggedString> compactedOutput = new ArrayList<>(output.size()); + + for (var ts : output) { + if (compactedOutput.isEmpty()) { + compactedOutput.add(ts); + } + else { + var last = compactedOutput.getLast(); + if (last.tags().equals(ts.tags())) { + last.append(ts.string()); + } + else { + compactedOutput.add(ts); + } + } + } + + return compactedOutput; + } + + + @Override + public void head(Node node, int i) { + if (node instanceof Element el) { + String tagName = el.tagName(); + switch (tagName) { + case "script" -> pushTag(HtmlTag.SCRIPT, el); + case "style" -> pushTag(HtmlTag.STYLE, el); + case "code" -> pushTag(HtmlTag.CODE, el); + case "title" -> pushTag(HtmlTag.TITLE, el); + case "nav" -> pushTag(HtmlTag.NAV, el); + case "header" -> pushTag(HtmlTag.HEADER, el); + case "footer" -> pushTag(HtmlTag.FOOTER, el); + case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el); + } + } + else if (node instanceof TextNode tn) { + if (shouldProcess()) { + String tnText = tn.text(); + if (!tnText.isBlank()) { + currentString = currentString.append(' ').append(tnText.trim()); + } + } + } + } + + @Override + public void tail(Node node, int i) { + if (!(node instanceof Element el)) + return; + + if (stackTags.remove(el)) { + output.add(new HtmlTaggedString(currentString, EnumSet.copyOf(tagStack))); + tagStack.removeLast(); + currentString = new StringBuilder(); + } + else if ("#root".equals(el.tagName())) { + closeOngoingTag(); + } + } + + private void pushTag(HtmlTag tag, Element el) { + closeOngoingTag(); + 
+ tagStack.add(tag); + stackTags.add(el); + } + + private void closeOngoingTag() { + if (currentString.isEmpty()) { + return; + } + + EnumSet tags; + if (tagStack.isEmpty()) { + tags = EnumSet.noneOf(HtmlTag.class); + } + else { + tags = EnumSet.copyOf(tagStack); + } + + output.add(new HtmlTaggedString(currentString, tags)); + currentString = new StringBuilder(); + } + + public boolean shouldProcess() { + for (var tag : tagStack) { + if (tag.exclude) { + return false; + } + } + return true; + } + +} \ No newline at end of file diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java new file mode 100644 index 00000000..bc26e93e --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java @@ -0,0 +1,21 @@ +package nu.marginalia.language.sentence.tag; + +public enum HtmlTag { + SCRIPT(true, false), + STYLE(true, false), + CODE(false, true), + PRE(false, true), + TITLE(false, false), + HEADING(false, false), + NAV(false, false), + HEADER(false, false), + FOOTER(false, false); + + public boolean exclude; + public boolean nonLanguage; + + HtmlTag(boolean exclude, boolean nonLanguage) { + this.exclude = exclude; + this.nonLanguage = nonLanguage; + } +} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java new file mode 100644 index 00000000..80e8f4ee --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java @@ -0,0 +1,33 @@ +package nu.marginalia.language.sentence.tag; + +import java.util.EnumSet; + +public class HtmlTaggedString { + private StringBuilder string; + private final EnumSet tags; + + public HtmlTaggedString(StringBuilder string, EnumSet tags) { + this.tags = tags; 
+ this.string = string; + } + + public String string() { + return string.toString(); + } + + public EnumSet tags() { + return tags; + } + + public void append(String s) { + string.append(' ').append(s); + } + + public String toString() { + return "[" + tags.toString() + ":" + string.toString() + "]"; + } + + public int length() { + return string.length(); + } +} diff --git a/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java b/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java deleted file mode 100644 index dc21d379..00000000 --- a/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.language.encoding; - -import nu.marginalia.language.sentence.SentenceExtractorHtmlTagCleaner; -import org.jsoup.Jsoup; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -class SentenceExtractorHtmlTagCleanerTest { - - final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner(); - - public String cleanTag(String text) { - var doc = Jsoup.parse(text); - tagCleaner.clean(doc); - return doc.text(); - } - - @Test - public void testBriefCodeTag() { - assertEquals("hello", cleanTag("hello")); - assertEquals("System out println", cleanTag("System.out.println")); - assertEquals("hello", cleanTag("hello()")); - assertEquals("hello", cleanTag("<hello>")); - assertEquals("hello", cleanTag("hello(p,q)")); - assertEquals("hello", cleanTag("hello(p,q);")); - } -} \ No newline at end of file diff --git a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java index e4679db7..b6918eee 100644 --- 
a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java +++ b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java @@ -1,14 +1,17 @@ package nu.marginalia.language.sentence; import nu.marginalia.WmsaHome; +import nu.marginalia.language.sentence.tag.HtmlTag; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.IOException; +import java.util.EnumSet; import java.util.Objects; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; class SentenceExtractorTest { private static SentenceExtractor sentenceExtractor; @@ -20,26 +23,25 @@ class SentenceExtractorTest { @Test void testParen() { - var dld = sentenceExtractor.extractSentence("I am (very) tall"); + var dld = sentenceExtractor.extractSentence("I am (very) tall", EnumSet.noneOf(HtmlTag.class)); System.out.println(dld); } @Test void testPolishArtist() { - var dld = sentenceExtractor.extractSentence("Uklański"); + var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class)); - assertEquals(1, dld.words.length); - assertEquals("Uklanski", dld.words[0]); + assertEquals(1, dld.wordsLowerCase.length); assertEquals("uklanski", dld.wordsLowerCase[0]); } @Test void testJava() { - var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API"); + var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class)); - assertEquals(4, dld.words.length); - assertArrayEquals(new String[] {"Foreign", "Function", "Memory", "API"}, dld.words); + assertEquals(4, dld.wordsLowerCase.length); + assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase); } @Test @@ -77,10 +79,9 @@ class SentenceExtractorTest { } @Test void testApostrophe() { - var dld = 
sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun"); - assertEquals(7, dld.words.length); + var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class)); + assertEquals(7, dld.wordsLowerCase.length); - assertArrayEquals(new String[] { "duke", "nuke", "em's", "big", "ol", "big", "gun"}, dld.words); assertArrayEquals(new String[] { "duke", "nuke", "em", "big", "ol", "big", "gun"}, dld.wordsLowerCase); } } \ No newline at end of file diff --git a/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java b/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java new file mode 100644 index 00000000..d550ee1e --- /dev/null +++ b/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java @@ -0,0 +1,29 @@ +package nu.marginalia.language.sentence.tag; + +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Test; + +class HtmlStringTaggerTest { + @Test + public void test() { + String html = """ + + + + T Example + + +

H1 Example

+

This is an example.

+

Here is more text.

+

And more text with a link and more text.

+ #include <stdlib.h> +

Good bye

+ + """; + var visitor = new HtmlStringTagger(); + Jsoup.parse(html).traverse(visitor); + + visitor.getOutput().forEach(ts -> System.out.println(ts.string() + " " + ts.tags())); + } +} \ No newline at end of file diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java index 920da41c..b5570b86 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java @@ -39,10 +39,6 @@ public class TitleExtractor { title = getFirstTagText(doc, "h5"); if (title != null) return title; - if (dld.sentences.length > 0) { - return dld.sentences[0].originalSentence; - } - return url; }