(sentence-extractor) Add tag information to document language data

Decorates DocumentSentences with information about which HTML tags they are nested in, and removes some redundant data from this rather memory-hungry object. Separator information is encoded as a bit set instead of an array of integers.
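
As a rough sketch of the new encoding (accessor semantics taken from the DocumentSentence diff below; the input words are illustrative): a set bit marks a space separator and a clear bit a comma, so a single BitSet replaces the old int[] of WordSeparator constants.

import java.util.BitSet;

// Minimal sketch, not the actual DocumentSentence class: the BitSet
// separator encoding for the words "hello world, foo".
class SeparatorBitSetSketch {
    public static void main(String[] args) {
        BitSet separators = new BitSet(3);
        separators.set(0);  // space after word 0 ("hello")
        // bit 1 stays clear: comma after word 1 ("world")

        // Equivalent of DocumentSentence.isSeparatorComma(i):
        System.out.println(!separators.get(0)); // false
        System.out.println(!separators.get(1)); // true
    }
}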

The change also cleans up the SentenceExtractor class a fair bit. It no longer extracts ngrams, and a significant number of redundant operations were removed as well. This is still a pretty unpleasant class to work in, but this is a first step toward making it a little bit better.
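
For callers, the visible API change is that sentence extraction now takes the set of enclosing HTML tags. A minimal sketch of the updated call sites, assuming a SentenceExtractor constructed with the usual LanguageModels (the signatures come from the diffs below; the input strings are illustrative):

import java.util.EnumSet;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;

class ExtractSentenceSketch {
    static void demo(SentenceExtractor se) {
        // No HTML context: pass an empty tag set
        var sentence = se.extractSentence("AC/DC is a rock band.",
                EnumSet.noneOf(HtmlTag.class));
        System.out.println(sentence.wordsLowerCase[0]); // "ac/dc"

        // Title text: tag it so downstream consumers can find it via
        // DocumentLanguageData.findSentencesForTag(HtmlTag.TITLE)
        var titleSentences = se.extractSentencesFromString(
                "an example title", EnumSet.of(HtmlTag.TITLE));
    }
}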
Viktor Lofgren 2024-07-18 15:57:48 +02:00
parent d36055a2d0
commit 22b35d5d91
25 changed files with 551 additions and 500 deletions

View File

@ -4,6 +4,7 @@ import com.google.inject.Inject;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import java.io.BufferedReader;
@ -55,7 +56,7 @@ public class AnchorTextKeywords {
if (stopList.contains(keyword.text().toLowerCase()))
continue;
var sentence = sentenceExtractor.extractSentence(keyword.text());
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.noneOf(HtmlTag.class));
for (var wordSpan : keywordExtractor.getKeywordsFromSentence(sentence)) {
wordsWithCount.merge(sentence.constructWordFromSpan(wordSpan), 1, Integer::sum);
}

View File

@ -27,7 +27,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@ -124,10 +123,6 @@ public class TermFrequencyExporter implements ExporterIf {
for (var word : sent) {
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
}
for (var ngram : sent.ngramStemmed) {
words.add(longHash(ngram.getBytes()));
}
}
synchronized (counts) {

View File

@ -134,15 +134,6 @@ public class DocumentKeywordExtractor {
wordsBuilder.addMeta(rep.word, meta);
}
for (int i = 0; i < sent.ngrams.length; i++) {
var ngram = sent.ngrams[i];
var ngramStemmed = sent.ngramStemmed[i];
long meta = metadata.getMetadataForWord(ngramStemmed);
wordsBuilder.addMeta(ngram, meta);
}
}
}

View File

@ -3,7 +3,6 @@ package nu.marginalia.keyword;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.model.WordSeparator;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
@ -20,15 +19,15 @@ public class KeywordExtractor {
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { continue; }
if (isProperNoun(i, sentence) && isProperNoun(i-1, sentence))
spans.add(new WordSpan(i-1, i+1));
}
for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.isSeparatorComma(i-2)) { continue; }
if (sentence.isSeparatorComma(i-1)) { i++; continue; }
if (isProperNoun(i, sentence)
&& (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
@ -37,9 +36,9 @@ public class KeywordExtractor {
}
for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (sentence.isSeparatorComma(i-3)) { continue; }
if (sentence.isSeparatorComma(i-2)) { i++; continue; }
if (sentence.isSeparatorComma(i-1)) { i+=2; continue; }
if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence))
@ -66,7 +65,7 @@ public class KeywordExtractor {
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { continue; }
if (isNoun(i, sentence)
&& (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) {
@ -75,8 +74,8 @@ public class KeywordExtractor {
}
for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.isSeparatorComma(i-2)) { continue; }
if (sentence.isSeparatorComma(i-1)) { i++; continue; }
if ((isNoun(i, sentence))
&& (isJoiner(sentence, i-1) || isNoun(i-1, sentence))
@ -85,9 +84,9 @@ public class KeywordExtractor {
}
for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (sentence.isSeparatorComma(i-3)) { continue; }
if (sentence.isSeparatorComma(i-2)) { i++; continue; }
if (sentence.isSeparatorComma(i-1)) { i+=2; continue; }
if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) {
if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence))
@ -119,7 +118,7 @@ public class KeywordExtractor {
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { continue; }
if (isName(i, sentence)) {
if (isName(i - 1, sentence) || isTopAdj(i-1, sentence))
@ -131,8 +130,8 @@ public class KeywordExtractor {
}
for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { i++; continue; }
if (sentence.isSeparatorComma(i-2)) { continue; }
if (isName(i, sentence)) {
if ((isName(i-1, sentence) || isTopAdj(i-1, sentence))
@ -149,9 +148,9 @@ public class KeywordExtractor {
}
for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { i+=2; continue; }
if (sentence.isSeparatorComma(i-2)) { i++; continue; }
if (sentence.isSeparatorComma(i-3)) { continue; }
if (isName(i, sentence) &&
(isName(i-1, sentence) || isTopAdj(i-1, sentence)) &&
@ -217,7 +216,7 @@ public class KeywordExtractor {
private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) {
for (int i = w.start; i < w.end-1; i++) {
if (sentence.separators[i] == WordSeparator.COMMA) {
if (sentence.isSeparatorComma(i)) {
return false;
}
}

View File

@ -1,13 +1,12 @@
package nu.marginalia.keyword.extractors;
import com.google.common.base.CharMatcher;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword.KeywordExtractor;
import java.util.*;
import java.util.stream.Collectors;
@ -21,13 +20,11 @@ public class NameLikeKeywords implements WordReps {
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
final var isUpperCase = CharMatcher.forPredicate(Character::isUpperCase);
for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getProperNames(sent);
for (var span : keywords) {
if (span.size() <= 1 && isUpperCase.matchesAllOf(sent.words[span.start]))
if (span.size() <= 1 && sent.isAllCaps(span.start))
continue;
var stemmed = sent.constructStemmedWordFromSpan(span);

View File

@ -6,7 +6,6 @@ import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.model.WordSeparator;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
@ -36,8 +35,7 @@ public class SubjectLikeKeywords implements WordReps {
if (kw.end + 2 >= sentence.length()) {
continue;
}
if (sentence.separators[kw.end] == WordSeparator.COMMA
|| sentence.separators[kw.end + 1] == WordSeparator.COMMA)
if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1))
continue;
String nextTag = sentence.posTags[kw.end];

View File

@ -1,11 +1,11 @@
package nu.marginalia.keyword.extractors;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.sentence.tag.HtmlTag;
import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
import java.util.stream.Collectors;
@ -16,7 +16,8 @@ public class TitleKeywords implements WordReps {
private final Set<String> stemmed;
public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) {
titleKeywords = Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream()
.flatMap(sent ->
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
.limit(100)
.collect(Collectors.toSet());

View File

@ -2,11 +2,11 @@ package nu.marginalia.keyword;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Tag;
@ -59,8 +59,8 @@ class SentenceExtractorTest {
@Test
public void testACDC() {
var ret = se.extractSentence("AC/DC is a rock band.");
assertEquals("AC/DC", ret.words[0]);
var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
assertEquals("ac/dc", ret.wordsLowerCase[0]);
}
final Pattern p = Pattern.compile("([, ]+)");

View File

@ -190,7 +190,9 @@ class TitleKeywordsTest {
public void extractTitleWords() {
var se = new SentenceExtractor(TestLanguageModels.getLanguageModels());
var reps = new TitleKeywords(new KeywordExtractor(), se.extractSentences(Jsoup.parse(document))).getReps();
var dld = se.extractSentences(Jsoup.parse(document));
var reps = new TitleKeywords(new KeywordExtractor(), dld).getReps();
var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet());
Set<String> expected = Set.of(

View File

@ -2,10 +2,10 @@ package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.sentence.SentenceExtractorStringUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;
public class QueryTokenizer {
@ -55,7 +55,7 @@ public class QueryTokenizer {
}
String displayStr = query.substring(i, end);
String str = SentenceExtractorStringUtils.toLowerCaseStripPossessive(displayStr);
String str = toLowerCaseStripPossessive(displayStr);
tokens.add(new QueryToken.LiteralTerm(str, displayStr));
@ -65,5 +65,27 @@ public class QueryTokenizer {
return tokens;
}
public static String toLowerCaseStripPossessive(String word) {
String val = stripPossessive(word).toLowerCase();
if (Objects.equals(val, word)) {
return word;
}
return val;
}
public static String stripPossessive(String s) {
int end = s.length();
if (s.endsWith("'")) {
return s.substring(0, end-1);
}
if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}
return s;
}
}

View File

@ -1,49 +1,41 @@
package nu.marginalia.language.model;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.lsh.EasyLSH;
import java.util.Arrays;
import java.util.stream.Stream;
import java.util.List;
/**
/** Holds the sentences and text of a document, decorated with
* HTML tags, POS tags, and other information.
*
* @see SentenceExtractor
*/
@AllArgsConstructor
public class DocumentLanguageData {
public final DocumentSentence[] sentences;
public final DocumentSentence[] titleSentences;
public final TObjectIntHashMap<String> wordCount;
public final String text;
/** for test convenience */
public static DocumentLanguageData empty() {
return new DocumentLanguageData(
new DocumentSentence[0],
new DocumentSentence[0],
new TObjectIntHashMap<>(),
""
);
public DocumentLanguageData(List<DocumentSentence> sentences,
String text) {
this.sentences = sentences.toArray(DocumentSentence[]::new);
this.text = text;
}
public List<DocumentSentence> findSentencesForTag(HtmlTag tag) {
return Arrays.stream(sentences).filter(s -> s.htmlTags.contains(tag)).toList();
}
public int totalNumWords() {
int ret = 0;
for (int i = 0; i < sentences.length; i++) {
ret += sentences[i].length();
}
return ret;
}
public Stream<String> streamLowerCase() {
return Arrays.stream(sentences).map(sent -> sent.wordsLowerCase).flatMap(Arrays::stream);
}
public Stream<String> stream() {
return Arrays.stream(sentences).map(sent -> sent.words).flatMap(Arrays::stream);
}
public long localitySensitiveHashCode() {
var hash = new EasyLSH();

View File

@ -2,52 +2,55 @@ package nu.marginalia.language.model;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.jetbrains.annotations.NotNull;
import java.lang.ref.SoftReference;
import java.util.BitSet;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.StringJoiner;
public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
public final String originalSentence;
public final String[] words;
public final int[] separators;
/** A span of words in a sentence */
public final String[] wordsLowerCase;
public final String[] posTags;
public final String[] stemmedWords;
public final String[] ngrams;
public final String[] ngramStemmed;
public final EnumSet<HtmlTag> htmlTags;
private final BitSet isStopWord;
private final BitSet separators;
private final BitSet isCapitalized;
private final BitSet isAllCaps;
public SoftReference<WordSpan[]> keywords;
public DocumentSentence(String originalSentence,
String[] words,
int[] separators,
public DocumentSentence(BitSet separators,
String[] wordsLowerCase,
String[] posTags,
String[] stemmedWords,
String[] ngrams,
String[] ngramsStemmed
EnumSet<HtmlTag> htmlTags,
BitSet isCapitalized,
BitSet isAllCaps
)
{
this.originalSentence = originalSentence;
this.words = words;
this.separators = separators;
this.wordsLowerCase = wordsLowerCase;
this.posTags = posTags;
this.stemmedWords = stemmedWords;
this.htmlTags = htmlTags;
this.isCapitalized = isCapitalized;
this.isAllCaps = isAllCaps;
isStopWord = new BitSet(words.length);
isStopWord = new BitSet(wordsLowerCase.length);
this.ngrams = ngrams;
this.ngramStemmed = ngramsStemmed;
for (int i = 0; i < words.length; i++) {
if (WordPatterns.isStopWord(words[i]))
for (int i = 0; i < wordsLowerCase.length; i++) {
if (WordPatterns.isStopWord(wordsLowerCase[i]))
isStopWord.set(i);
}
}
@ -55,14 +58,22 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
public boolean isStopWord(int idx) {
return isStopWord.get(idx);
}
public void setIsStopWord(int idx, boolean val) {
if (val)
isStopWord.set(idx);
else
isStopWord.clear();
}
public int length() {
return words.length;
return wordsLowerCase.length;
}
public boolean isCapitalized(int i) {
return isCapitalized.get(i);
}
public boolean isAllCaps(int i) {
return isAllCaps.get(i);
}
public boolean isSeparatorSpace(int i) {
return separators.get(i);
}
public boolean isSeparatorComma(int i) {
return !separators.get(i);
}
public String constructWordFromSpan(WordSpan span) {
@ -140,9 +151,9 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < words.length; i++) {
sb.append(words[i]).append('[').append(posTags[i]).append(']');
if (separators[i] == WordSeparator.COMMA) {
for (int i = 0; i < wordsLowerCase.length; i++) {
sb.append(wordsLowerCase[i]).append('[').append(posTags[i]).append(']');
if (isSeparatorComma(i)) {
sb.append(',');
}
else {
@ -176,11 +187,10 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
this.pos = pos;
}
public String word() { return words[pos]; }
public String word() { return wordsLowerCase[pos]; }
public String wordLowerCase() { return wordsLowerCase[pos]; }
public String posTag() { return posTags[pos]; }
public String stemmed() { return stemmedWords[pos]; }
public int separator() { return separators[pos]; }
public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }
public WordRep rep() {

View File

@ -1,6 +0,0 @@
package nu.marginalia.language.model;
public final class WordSeparator {
public static final int COMMA = 0;
public static final int SPACE = 1;
}

View File

@ -1,23 +1,23 @@
package nu.marginalia.language.sentence;
import com.github.datquocnguyen.RDRPOSTagger;
import gnu.trove.map.hash.TObjectIntHashMap;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.sentence.tag.HtmlStringTagger;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.language.sentence.tag.HtmlTaggedString;
import nu.marginalia.segmentation.NgramLexicon;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.inject.Inject;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
@ -38,14 +38,13 @@ public class SentenceExtractor {
private final PorterStemmer porterStemmer = new PorterStemmer();
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
private static final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner();
private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner();
/* Truncate sentences longer than this. This is mostly a defense measure against malformed data
* that might otherwise use an undue amount of processing power. 250 words is about 10X longer than
* this comment. */
private static final int MAX_SENTENCE_LENGTH = 250;
private static final int MAX_TEXT_LENGTH = 65536;
static final int MAX_SENTENCE_LENGTH = 250;
static final int MAX_SENTENCE_COUNT = 1000;
@SneakyThrows @Inject
public SentenceExtractor(LanguageModels models)
@ -75,219 +74,224 @@ public class SentenceExtractor {
}
public DocumentLanguageData extractSentences(Document doc) {
var clone = doc.clone();
tagCleaner.clean(clone);
final String text = asText(clone);
final DocumentSentence[] textSentences = extractSentencesFromString(text);
final List<HtmlTaggedString> taggedStrings = HtmlStringTagger.tagDocumentStrings(doc);
final List<DocumentSentence> textSentences = new ArrayList<>();
String title = getTitle(clone, textSentences);
final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum();
final StringBuilder documentText = new StringBuilder(totalTextLength + taggedStrings.size());
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts, text);
for (var taggedString : taggedStrings) {
String text = taggedString.string();
textSentences.addAll(
extractSentencesFromString(text, taggedString.tags())
);
if (documentText.isEmpty()) {
documentText.append(text);
}
else {
documentText.append(' ').append(text);
}
}
return new DocumentLanguageData(textSentences, documentText.toString());
}
public DocumentLanguageData extractSentences(String text, String title) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);
var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class));
var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts, text);
List<DocumentSentence> combined = new ArrayList<>(textSentences.size() + titleSentences.size());
combined.addAll(titleSentences);
combined.addAll(textSentences);
return new DocumentLanguageData(
combined,
text);
}
private String getTitle(Document doc, DocumentSentence[] textSentences) {
String title = doc.getElementsByTag("title").text() + " . " +
Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse("");
public DocumentSentence extractSentence(String text, EnumSet<HtmlTag> htmlTags) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);
if (title.trim().length() < 3) {
title = doc.getElementsByTag("h2").text();
}
String[] words = wordsAndSeps.words();
BitSet seps = wordsAndSeps.separators();
String[] lc = new String[words.length];
String[] stemmed = new String[words.length];
if (title.trim().length() < 3) {
for (DocumentSentence textSentence : textSentences) {
if (textSentence.length() > 0) {
title = textSentence.originalSentence.toLowerCase();
break;
}
BitSet isCapitalized = new BitSet(words.length);
BitSet isAllCaps = new BitSet(words.length);
for (int i = 0; i < words.length; i++) {
lc[i] = stripPossessive(words[i].toLowerCase());
if (words[i].length() > 0 && Character.isUpperCase(words[i].charAt(0))) {
isCapitalized.set(i);
}
}
return title;
}
@NotNull
private TObjectIntHashMap<String> calculateWordCounts(DocumentSentence[] textSentences) {
TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(textSentences.length*10, 0.5f, 0);
for (var sent : textSentences) {
for (var word : sent.stemmedWords) {
counts.adjustOrPutValue(word, 1, 1);
}
}
return counts;
}
public DocumentSentence extractSentence(String text) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text);
var words = wordsAndSeps.words;
var seps = wordsAndSeps.separators;
var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, words);
String[] ngramsWords = new String[ngrams.size()];
String[] ngramsStemmedWords = new String[ngrams.size()];
for (int i = 0; i < ngrams.size(); i++) {
String[] ngram = ngrams.get(i);
StringJoiner ngramJoiner = new StringJoiner("_");
StringJoiner stemmedJoiner = new StringJoiner("_");
for (String s : ngram) {
ngramJoiner.add(s);
stemmedJoiner.add(porterStemmer.stem(s));
if (StringUtils.isAllUpperCase(words[i])) {
isAllCaps.set(i);
}
ngramsWords[i] = ngramJoiner.toString();
ngramsStemmedWords[i] = stemmedJoiner.toString();
}
return new DocumentSentence(
SentenceExtractorStringUtils.sanitizeString(text),
words,
seps,
lc,
rdrposTagger.tagsForEnSentence(words),
stemSentence(lc),
ngramsWords,
ngramsStemmedWords
);
}
public DocumentSentence[] extractSentencesFromString(String text) {
String[] sentences;
String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
try {
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
}
catch (Exception ex) {
// shitty fallback logic
sentences = StringUtils.split(textNormalizedSpaces, '.');
}
sentences = sentencePrecleaner.clean(sentences);
final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
final String[][] posTags = new String[sentences.length][];
final String[][] tokensLc = new String[sentences.length][];
final String[][] stemmedWords = new String[sentences.length][];
for (int i = 0; i < tokens.length; i++) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]);
tokens[i] = wordsAndSeps.words;
separators[i] = wordsAndSeps.separators;
if (tokens[i].length > MAX_SENTENCE_LENGTH) {
tokens[i] = Arrays.copyOf(tokens[i], MAX_SENTENCE_LENGTH);
separators[i] = Arrays.copyOf(separators[i], MAX_SENTENCE_LENGTH);
}
for (int j = 0; j < tokens[i].length; j++) {
while (tokens[i][j].endsWith(".")) {
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
}
}
}
for (int i = 0; i < tokens.length; i++) {
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
}
for (int i = 0; i < tokens.length; i++) {
tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]);
}
for (int i = 0; i < tokens.length; i++) {
stemmedWords[i] = stemSentence(tokensLc[i]);
}
DocumentSentence[] ret = new DocumentSentence[sentences.length];
for (int i = 0; i < ret.length; i++) {
String fullString;
if (i == 0) {
fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]);
}
else {
fullString = "";
}
List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]);
String[] ngramsWords = new String[ngrams.size()];
String[] ngramsStemmedWords = new String[ngrams.size()];
for (int j = 0; j < ngrams.size(); j++) {
String[] ngram = ngrams.get(j);
StringJoiner ngramJoiner = new StringJoiner("_");
StringJoiner stemmedJoiner = new StringJoiner("_");
for (String s : ngram) {
ngramJoiner.add(s);
stemmedJoiner.add(porterStemmer.stem(s));
}
ngramsWords[j] = ngramJoiner.toString();
ngramsStemmedWords[j] = stemmedJoiner.toString();
}
ret[i] = new DocumentSentence(fullString,
tokens[i],
separators[i],
tokensLc[i],
posTags[i],
stemmedWords[i],
ngramsWords,
ngramsStemmedWords
);
}
return ret;
}
private String[] stemSentence(String[] strings) {
String[] stemmed = new String[strings.length];
for (int i = 0; i < stemmed.length; i++) {
var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]);
try {
stemmed[i] = porterStemmer.stem(sent);
stemmed[i] = porterStemmer.stem(lc[i]);
}
catch (Exception ex) {
stemmed[i] = "NN"; // ???
}
}
return stemmed;
return new DocumentSentence(
seps,
lc,
rdrposTagger.tagsForEnSentence(words),
stemmed,
htmlTags,
isCapitalized,
isAllCaps
);
}
public String asText(Document dc) {
String text = dc.getElementsByTag("body").text();
public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
String[] sentences;
if (text.length() > MAX_TEXT_LENGTH) {
return text.substring(0, MAX_TEXT_LENGTH);
// Normalize spaces
text = normalizeSpaces(text);
// Split into sentences
try {
sentences = sentenceDetector.sentDetect(text);
}
catch (Exception ex) {
// shitty fallback logic
sentences = StringUtils.split(text, '.');
}
sentences = sentencePrecleaner.clean(sentences);
// Truncate the number of sentences if it exceeds the maximum, to avoid
// excessive processing time on malformed data
if (sentences.length > MAX_SENTENCE_COUNT) {
sentences = Arrays.copyOf(sentences, MAX_SENTENCE_COUNT);
}
final boolean isNaturalLanguage = htmlTags.stream().noneMatch(tag -> tag.nonLanguage);
List<DocumentSentence> ret = new ArrayList<>(sentences.length);
if (isNaturalLanguage) {
// Natural language text; do POS tagging and stemming
for (String sent : sentences) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = rdrposTagger.tagsForEnSentence(tokens);
var tokensLc = new String[tokens.length];
var stemmed = new String[tokens.length];
BitSet isCapitalized = new BitSet(tokens.length);
BitSet isAllCaps = new BitSet(tokens.length);
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
isCapitalized.set(i);
}
if (StringUtils.isAllUpperCase(tokens[i])) {
isAllCaps.set(i);
}
var originalVal = tokens[i];
var newVal = stripPossessive(originalVal.toLowerCase());
if (Objects.equals(originalVal, newVal)) {
tokensLc[i] = originalVal;
} else {
tokensLc[i] = newVal;
}
try {
stemmed[i] = porterStemmer.stem(tokens[i]);
}
catch (Exception ex) {
stemmed[i] = "NN"; // ???
}
}
ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps));
}
}
else {
return text.substring(0, (int) (text.length() * 0.95));
// non-language text, e.g. program code; don't bother with POS tagging or stemming
// as this is not likely to be useful
for (String sent : sentences) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = new String[tokens.length];
Arrays.fill(posTags, "X"); // Placeholder POS tag
var tokensLc = new String[tokens.length];
var stemmed = new String[tokens.length];
BitSet isCapitalized = new BitSet(tokens.length);
BitSet isAllCaps = new BitSet(tokens.length);
for (int i = 0; i < tokensLc.length; i++) {
var originalVal = tokens[i];
if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
isCapitalized.set(i);
}
if (StringUtils.isAllUpperCase(tokens[i])) {
isAllCaps.set(i);
}
if (StringUtils.isAllLowerCase(originalVal)) {
tokensLc[i] = originalVal;
} else {
tokensLc[i] = originalVal.toLowerCase();
}
stemmed[i] = tokensLc[i]; // we don't stem non-language words
}
ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isAllCaps, isCapitalized));
}
}
return ret;
}
public static String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}
public static String stripPossessive(String s) {
int end = s.length();
if (s.endsWith("'")) {
return s.substring(0, end-1);
}
if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}
return s;
}
}

View File

@ -1,40 +0,0 @@
package nu.marginalia.language.sentence;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.TextNode;
import java.util.regex.Pattern;
public class SentenceExtractorHtmlTagCleaner {
public final int MAX_CODE_TAG_LENGTH = 32;
public final Pattern codeTagJunkPattern = Pattern.compile("(\\.|&lt;|&gt;|<|>|\\([^)]*\\)[;]?$)");
public void clean(Document doc) {
cleanCodeTags(doc);
doc.select("nav,form,input,code,body>title").remove();
// Create "sentences" out of elements that sometimes lack a period at the end to help
// NLP work better
doc.select("li,h1,h2,h3,h4,h5,h6,td,th,p,div,title").forEach(e -> e.appendText(". "));
doc.select("br,hr").forEach(e -> e.prependText(". "));
}
private void cleanCodeTags(Document doc) {
for (var codeTag : doc.getElementsByTag("code")) {
var text = codeTag.text();
if (text.length() <= MAX_CODE_TAG_LENGTH) {
codeTag.replaceWith(new TextNode(trimCodeTagContents(text)));
}
else {
codeTag.remove();
}
}
}
private String trimCodeTagContents(String text) {
return codeTagJunkPattern.matcher(text).replaceAll(" ");
}
}

View File

@ -1,93 +0,0 @@
package nu.marginalia.language.sentence;
import java.util.Arrays;
import java.util.Objects;
public class SentenceExtractorStringUtils {
public static String sanitizeString(String s) {
char[] newChars = new char[s.length()];
int pi = 0;
boolean changed = false;
for (int i = 0; i < newChars.length; i++) {
char c = s.charAt(i);
if (!isBadChar(c)) {
newChars[pi++] = c;
}
else {
changed = true;
newChars[pi++] = ' ';
}
}
if (changed) {
s = new String(newChars, 0, pi);
}
if (s.startsWith(".")) {
s = s.substring(1);
}
if (s.isBlank()) {
return "";
}
return s;
}
private static boolean isBadChar(char c) {
if (c >= 'a' && c <= 'z') return false;
if (c >= 'A' && c <= 'Z') return false;
if (c >= '0' && c <= '9') return false;
if ("_#@.".indexOf(c) >= 0) return false;
if (c >= '\u00C0' && c <= '\u00D6') return false;
if (c >= '\u00D8' && c <= '\u00F6') return false;
if (c >= '\u00F8' && c <= '\u00FF') return false;
return true;
}
public static String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}
public static String toLowerCaseStripPossessive(String word) {
String val = stripPossessive(word).toLowerCase();
if (Objects.equals(val, word)) {
return word;
}
return val;
}
public static String[] toLowerCaseStripPossessive(String[] words) {
String[] lc = new String[words.length];
Arrays.setAll(lc, i ->SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i]));
return lc;
}
public static String stripPossessive(String s) {
int end = s.length();
if (s.endsWith("'")) {
return s.substring(0, end-1);
}
if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}
return s;
}
}

View File

@ -7,12 +7,9 @@ import java.util.regex.Pattern;
public class SentencePreCleaner {
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
private final int maxSentenceCount = 250;
private final int maxTotalLength = 20 * maxSentenceCount;
public String[] clean(String[] sentences) {
int totalLength = 0;
int sentenceCount = 0;
List<String> sentenceList = new ArrayList<>();
@ -20,10 +17,9 @@ public class SentencePreCleaner {
if (s.isBlank()) continue;
totalLength+=s.length();
sentenceCount++;
if (totalLength > maxTotalLength && sentenceCount++ > maxSentenceCount) {
if (sentenceCount++ > SentenceExtractor.MAX_SENTENCE_COUNT) {
break;
}

View File

@ -2,25 +2,18 @@ package nu.marginalia.language.sentence;
import com.google.common.base.CharMatcher;
import gnu.trove.list.array.TIntArrayList;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.model.WordSeparator;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import java.util.regex.Pattern;
import static nu.marginalia.language.WordPatterns.*;
import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;
public class SentenceSegmentSplitter {
@AllArgsConstructor
@Getter
public static class SeparatedSentence {
String[] words;
int[] separators;
}
public record SeparatedSentence(String[] words, BitSet separators) { }
private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");
@ -43,7 +36,7 @@ public class SentenceSegmentSplitter {
* @param segment The sentence to split
* @return A list of words and separators
*/
public static SeparatedSentence splitSegment(String segment) {
public static SeparatedSentence splitSegment(String segment, int maxLength) {
String flatSegment = AsciiFlattener.flattenUnicode(segment);
var matcher = wordBreakPattern.matcher(flatSegment);
@ -77,7 +70,7 @@ public class SentenceSegmentSplitter {
}
List<String> ret = new ArrayList<>(words.size());
TIntArrayList seps = new TIntArrayList(words.size());
BitSet seps = new BitSet(separators.size());
String[] parts = words.toArray(String[]::new);
for (int i = 0; i < parts.length; i++) {
@ -89,7 +82,9 @@ public class SentenceSegmentSplitter {
continue;
ret.add(parts[i]);
seps.add(separators.getQuick(i));
if (separators.getQuick(i) > 0) {
seps.set(i);
}
}
for (int i = 0; i < ret.size(); i++) {
@ -101,13 +96,26 @@ public class SentenceSegmentSplitter {
if (part.endsWith("'") && part.length() > 1) {
ret.set(i, part.substring(0, part.length()-1));
}
while (part.endsWith(".")) {
part = part.substring(0, part.length()-1);
ret.set(i, part);
}
}
if (ret.size() > maxLength) {
ret.subList(maxLength, ret.size()).clear();
seps = seps.get(0, maxLength);
}
return new SeparatedSentence(
ret.toArray(String[]::new),
seps.toArray()
seps
);
}
public static final class WordSeparator {
public static final int COMMA = 0;
public static final int SPACE = 1;
}
}

View File

@ -0,0 +1,122 @@
package nu.marginalia.language.sentence.tag;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;
import java.util.*;
/** A class that tags strings in an HTML document with the HTML tags that are active at that point in the document. */
public class HtmlStringTagger implements NodeVisitor {
private List<HtmlTag> tagStack = new ArrayList<>(8);
private Set<Element> stackTags = new HashSet<>(8);
private StringBuilder currentString = new StringBuilder(256);
HtmlStringTagger() {}
public static List<HtmlTaggedString> tagDocumentStrings(Document document) {
var tagger = new HtmlStringTagger();
document.traverse(tagger);
return tagger.getOutput();
}
private List<HtmlTaggedString> output = new ArrayList<>();
public List<HtmlTaggedString> getOutput() {
List<HtmlTaggedString> compactedOutput = new ArrayList<>(output.size());
for (var ts : output) {
if (compactedOutput.isEmpty()) {
compactedOutput.add(ts);
}
else {
var last = compactedOutput.getLast();
if (last.tags().equals(ts.tags())) {
last.append(ts.string());
}
else {
compactedOutput.add(ts);
}
}
}
return output;
}
@Override
public void head(Node node, int i) {
if (node instanceof Element el) {
String tagName = el.tagName();
switch (tagName) {
case "script" -> pushTag(HtmlTag.SCRIPT, el);
case "style" -> pushTag(HtmlTag.STYLE, el);
case "code" -> pushTag(HtmlTag.CODE, el);
case "title" -> pushTag(HtmlTag.TITLE, el);
case "nav" -> pushTag(HtmlTag.NAV, el);
case "header" -> pushTag(HtmlTag.HEADER, el);
case "footer" -> pushTag(HtmlTag.FOOTER, el);
case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el);
}
}
else if (node instanceof TextNode tn) {
if (shouldProcess()) {
String tnText = tn.text();
if (!tnText.isBlank()) {
currentString = currentString.append(' ').append(tnText.trim());
}
}
}
}
@Override
public void tail(Node node, int i) {
if (!(node instanceof Element el))
return;
if (stackTags.remove(el)) {
output.add(new HtmlTaggedString(currentString, EnumSet.copyOf(tagStack)));
tagStack.removeLast();
currentString = new StringBuilder();
}
else if ("#root".equals(el.tagName())) {
closeOngoingTag();
}
}
private void pushTag(HtmlTag tag, Element el) {
closeOngoingTag();
tagStack.add(tag);
stackTags.add(el);
}
private void closeOngoingTag() {
if (currentString.isEmpty()) {
return;
}
EnumSet<HtmlTag> tags;
if (tagStack.isEmpty()) {
tags = EnumSet.noneOf(HtmlTag.class);
}
else {
tags = EnumSet.copyOf(tagStack);
}
output.add(new HtmlTaggedString(currentString, tags));
currentString = new StringBuilder();
}
public boolean shouldProcess() {
for (var tag : tagStack) {
if (tag.exclude) {
return false;
}
}
return true;
}
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.language.sentence.tag;
public enum HtmlTag {
SCRIPT(true, false),
STYLE(true, false),
CODE(false, true),
PRE(false, true),
TITLE(false, false),
HEADING(false, false),
NAV(false, false),
HEADER(false, false),
FOOTER(false, false);
public boolean exclude;
public boolean nonLanguage;
HtmlTag(boolean exclude, boolean nonLanguage) {
this.exclude = exclude;
this.nonLanguage = nonLanguage;
}
}

View File

@ -0,0 +1,33 @@
package nu.marginalia.language.sentence.tag;
import java.util.EnumSet;
public class HtmlTaggedString {
private StringBuilder string;
private final EnumSet<HtmlTag> tags;
public HtmlTaggedString(StringBuilder string, EnumSet<HtmlTag> tags) {
this.tags = tags;
this.string = string;
}
public String string() {
return string.toString();
}
public EnumSet<HtmlTag> tags() {
return tags;
}
public void append(String s) {
string.append(' ').append(s);
}
public String toString() {
return "[" + tags.toString() + ":" + string.toString() + "]";
}
public int length() {
return string.length();
}
}

View File

@ -1,28 +0,0 @@
package nu.marginalia.language.encoding;
import nu.marginalia.language.sentence.SentenceExtractorHtmlTagCleaner;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class SentenceExtractorHtmlTagCleanerTest {
final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner();
public String cleanTag(String text) {
var doc = Jsoup.parse(text);
tagCleaner.clean(doc);
return doc.text();
}
@Test
public void testBriefCodeTag() {
assertEquals("hello", cleanTag("<code>hello</code>"));
assertEquals("System out println", cleanTag("<code>System.out.println</code>"));
assertEquals("hello", cleanTag("<code>hello()</code>"));
assertEquals("hello", cleanTag("<code>&lt;hello&gt;</code>"));
assertEquals("hello", cleanTag("<code>hello(p,q)</code>"));
assertEquals("hello", cleanTag("<code>hello(p,q);</code>"));
}
}

View File

@ -1,14 +1,17 @@
package nu.marginalia.language.sentence;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.EnumSet;
import java.util.Objects;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
class SentenceExtractorTest {
private static SentenceExtractor sentenceExtractor;
@ -20,26 +23,25 @@ class SentenceExtractorTest {
@Test
void testParen() {
var dld = sentenceExtractor.extractSentence("I am (very) tall");
var dld = sentenceExtractor.extractSentence("I am (very) tall", EnumSet.noneOf(HtmlTag.class));
System.out.println(dld);
}
@Test
void testPolishArtist() {
var dld = sentenceExtractor.extractSentence("Uklański");
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));
assertEquals(1, dld.words.length);
assertEquals("Uklanski", dld.words[0]);
assertEquals(1, dld.wordsLowerCase.length);
assertEquals("uklanski", dld.wordsLowerCase[0]);
}
@Test
void testJava() {
var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API");
var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));
assertEquals(4, dld.words.length);
assertArrayEquals(new String[] {"Foreign", "Function", "Memory", "API"}, dld.words);
assertEquals(4, dld.wordsLowerCase.length);
assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase);
}
@Test
@ -77,10 +79,9 @@ class SentenceExtractorTest {
}
@Test
void testApostrophe() {
var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun");
assertEquals(7, dld.words.length);
var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class));
assertEquals(7, dld.wordsLowerCase.length);
assertArrayEquals(new String[] { "duke", "nuke", "em's", "big", "ol", "big", "gun"}, dld.words);
assertArrayEquals(new String[] { "duke", "nuke", "em", "big", "ol", "big", "gun"}, dld.wordsLowerCase);
}
}

View File

@ -0,0 +1,29 @@
package nu.marginalia.language.sentence.tag;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
class HtmlStringTaggerTest {
@Test
public void test() {
String html = """
<!DOCTYPE html>
<html>
<head>
<title>T Example</title>
</head>
<body>
<h1>H1 Example</h1>
<p>This is an example.</p>
<p>Here is more text.</p>
<p>And more text <a href="#">with a link</a> and more text.</p>
<code>#include &lt;stdlib.h&gt;</code>
<h3>Good bye</h3>
</body>
""";
var visitor = new HtmlStringTagger();
Jsoup.parse(html).traverse(visitor);
visitor.getOutput().forEach(ts -> System.out.println(ts.string() + " " + ts.tags()));
}
}

View File

@ -39,10 +39,6 @@ public class TitleExtractor {
title = getFirstTagText(doc, "h5");
if (title != null) return title;
if (dld.sentences.length > 0) {
return dld.sentences[0].originalSentence;
}
return url;
}