(sentence-extractor) Add tag information to document language data
Decorates DocumentSentences with information about which HTML tags they are nested in, and removes some redundant data from this rather memory-hungry object. Separator information is encoded as a bit set instead of an array of integers. The change also cleans up the SentenceExtractor class a fair bit: it no longer extracts ngrams, and a significant number of redundant operations have been removed as well. This is still a pretty unpleasant class to work in, but this is a first step toward making it a little bit better.
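As a concrete illustration of the bit set change: the old DocumentSentence carried an int[] of WordSeparator.COMMA/SPACE values, one 32-bit int per word, while the new version packs the same information into a java.util.BitSet, one bit per word. A minimal sketch of the new encoding, with accessor names taken from the DocumentSentence diff below (the wrapper class itself is hypothetical):

    import java.util.BitSet;

    // Minimal sketch of the new separator encoding: one bit per word position.
    // A set bit means the word is followed by a space; a clear bit means a comma.
    class SeparatorBitsSketch {
        private final BitSet separators;

        SeparatorBitsSketch(BitSet separators) {
            this.separators = separators;
        }

        boolean isSeparatorSpace(int i) { return separators.get(i); }

        boolean isSeparatorComma(int i) { return !separators.get(i); }
    }

Call sites change accordingly, from sentence.separators[i-1] == WordSeparator.COMMA to sentence.isSeparatorComma(i-1), as seen throughout the KeywordExtractor hunks below.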
parent d36055a2d0
commit 22b35d5d91
@@ -4,6 +4,7 @@ import com.google.inject.Inject;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;

import java.io.BufferedReader;
@@ -55,7 +56,7 @@ public class AnchorTextKeywords {
if (stopList.contains(keyword.text().toLowerCase()))
continue;

var sentence = sentenceExtractor.extractSentence(keyword.text());
var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.noneOf(HtmlTag.class));
for (var wordSpan : keywordExtractor.getKeywordsFromSentence(sentence)) {
wordsWithCount.merge(sentence.constructWordFromSpan(wordSpan), 1, Integer::sum);
}
@@ -27,7 +27,6 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

@@ -124,10 +123,6 @@ public class TermFrequencyExporter implements ExporterIf {
for (var word : sent) {
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
}

for (var ngram : sent.ngramStemmed) {
words.add(longHash(ngram.getBytes()));
}
}

synchronized (counts) {
@@ -134,15 +134,6 @@ public class DocumentKeywordExtractor {
wordsBuilder.addMeta(rep.word, meta);
}

for (int i = 0; i < sent.ngrams.length; i++) {
var ngram = sent.ngrams[i];
var ngramStemmed = sent.ngramStemmed[i];

long meta = metadata.getMetadataForWord(ngramStemmed);

wordsBuilder.addMeta(ngram, meta);
}

}
}
@@ -3,7 +3,6 @@ package nu.marginalia.keyword;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.model.WordSeparator;

import java.lang.ref.SoftReference;
import java.util.ArrayList;
@@ -20,15 +19,15 @@ public class KeywordExtractor {
}

for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { continue; }

if (isProperNoun(i, sentence) && isProperNoun(i-1, sentence))
spans.add(new WordSpan(i-1, i+1));
}

for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.isSeparatorComma(i-2)) { continue; }
if (sentence.isSeparatorComma(i-1)) { i++; continue; }

if (isProperNoun(i, sentence)
&& (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
@@ -37,9 +36,9 @@ public class KeywordExtractor {
}

for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (sentence.isSeparatorComma(i-3)) { continue; }
if (sentence.isSeparatorComma(i-2)) { i++; continue; }
if (sentence.isSeparatorComma(i-1)) { i+=2; continue; }

if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence))
@@ -66,7 +65,7 @@ public class KeywordExtractor {
}

for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { continue; }

if (isNoun(i, sentence)
&& (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) {
@@ -75,8 +74,8 @@ public class KeywordExtractor {
}

for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.isSeparatorComma(i-2)) { continue; }
if (sentence.isSeparatorComma(i-1)) { i++; continue; }

if ((isNoun(i, sentence))
&& (isJoiner(sentence, i-1) || isNoun(i-1, sentence))
@@ -85,9 +84,9 @@ public class KeywordExtractor {
}

for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (sentence.isSeparatorComma(i-3)) { continue; }
if (sentence.isSeparatorComma(i-2)) { i++; continue; }
if (sentence.isSeparatorComma(i-1)) { i+=2; continue; }

if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) {
if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence))
@@ -119,7 +118,7 @@ public class KeywordExtractor {
}

for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { continue; }

if (isName(i, sentence)) {
if (isName(i - 1, sentence) || isTopAdj(i-1, sentence))
@@ -131,8 +130,8 @@ public class KeywordExtractor {
}

for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { i++; continue; }
if (sentence.isSeparatorComma(i-2)) { continue; }

if (isName(i, sentence)) {
if ((isName(i-1, sentence) || isTopAdj(i-1, sentence))
@@ -149,9 +148,9 @@ public class KeywordExtractor {
}

for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.isSeparatorComma(i-1)) { i+=2; continue; }
if (sentence.isSeparatorComma(i-2)) { i++; continue; }
if (sentence.isSeparatorComma(i-3)) { continue; }

if (isName(i, sentence) &&
(isName(i-1, sentence) || isTopAdj(i-1, sentence)) &&
@@ -217,7 +216,7 @@ public class KeywordExtractor {
private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) {

for (int i = w.start; i < w.end-1; i++) {
if (sentence.separators[i] == WordSeparator.COMMA) {
if (sentence.isSeparatorComma(i)) {
return false;
}
}
@@ -1,13 +1,12 @@
package nu.marginalia.keyword.extractors;

import com.google.common.base.CharMatcher;
import it.unimi.dsi.fastutil.objects.Object2IntMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.keyword.KeywordExtractor;

import java.util.*;
import java.util.stream.Collectors;
@@ -21,13 +20,11 @@ public class NameLikeKeywords implements WordReps {
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);

final var isUpperCase = CharMatcher.forPredicate(Character::isUpperCase);

for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getProperNames(sent);
for (var span : keywords) {
if (span.size() <= 1 && isUpperCase.matchesAllOf(sent.words[span.start]))
if (span.size() <= 1 && sent.isAllCaps(span.start))
continue;

var stemmed = sent.constructStemmedWordFromSpan(span);
@@ -6,7 +6,6 @@ import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.model.WordSpan;
import nu.marginalia.language.model.WordSeparator;
import org.apache.commons.lang3.StringUtils;

import java.util.*;
@@ -36,8 +35,7 @@ public class SubjectLikeKeywords implements WordReps {
if (kw.end + 2 >= sentence.length()) {
continue;
}
if (sentence.separators[kw.end] == WordSeparator.COMMA
|| sentence.separators[kw.end + 1] == WordSeparator.COMMA)
if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1))
continue;

String nextTag = sentence.posTags[kw.end];
@@ -1,11 +1,11 @@
package nu.marginalia.keyword.extractors;

import nu.marginalia.keyword.WordReps;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.keyword.WordReps;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.sentence.tag.HtmlTag;

import java.util.Arrays;
import java.util.Collection;
import java.util.Set;
import java.util.stream.Collectors;
@@ -16,7 +16,8 @@ public class TitleKeywords implements WordReps {
private final Set<String> stemmed;

public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) {
titleKeywords = Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream()
.flatMap(sent ->
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
.limit(100)
.collect(Collectors.toSet());
@@ -2,11 +2,11 @@ package nu.marginalia.keyword;

import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Tag;
@@ -59,8 +59,8 @@ class SentenceExtractorTest {

@Test
public void testACDC() {
var ret = se.extractSentence("AC/DC is a rock band.");
assertEquals("AC/DC", ret.words[0]);
var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
assertEquals("ac/dc", ret.wordsLowerCase[0]);
}

final Pattern p = Pattern.compile("([, ]+)");
@@ -190,7 +190,9 @@ class TitleKeywordsTest {
public void extractTitleWords() {
var se = new SentenceExtractor(TestLanguageModels.getLanguageModels());

var reps = new TitleKeywords(new KeywordExtractor(), se.extractSentences(Jsoup.parse(document))).getReps();
var dld = se.extractSentences(Jsoup.parse(document));

var reps = new TitleKeywords(new KeywordExtractor(), dld).getReps();
var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet());

Set<String> expected = Set.of(
@@ -2,10 +2,10 @@ package nu.marginalia.functions.searchquery.query_parser;

import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.sentence.SentenceExtractorStringUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

public class QueryTokenizer {
@@ -55,7 +55,7 @@ public class QueryTokenizer {
}

String displayStr = query.substring(i, end);
String str = SentenceExtractorStringUtils.toLowerCaseStripPossessive(displayStr);
String str = toLowerCaseStripPossessive(displayStr);

tokens.add(new QueryToken.LiteralTerm(str, displayStr));

@@ -65,5 +65,27 @@ public class QueryTokenizer {
return tokens;
}

public static String toLowerCaseStripPossessive(String word) {
String val = stripPossessive(word).toLowerCase();

if (Objects.equals(val, word)) {
return word;
}

return val;
}

public static String stripPossessive(String s) {
int end = s.length();

if (s.endsWith("'")) {
return s.substring(0, end-1);
}

if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}

return s;
}
}
@@ -1,49 +1,41 @@
package nu.marginalia.language.model;

import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.lsh.EasyLSH;

import java.util.Arrays;
import java.util.stream.Stream;
import java.util.List;

/**
/** Holds the sentences and text of a document, decorated with
* HTML tags, POS tags, and other information.
*
* @see SentenceExtractor
*/
@AllArgsConstructor
public class DocumentLanguageData {
public final DocumentSentence[] sentences;
public final DocumentSentence[] titleSentences;
public final TObjectIntHashMap<String> wordCount;
public final String text;

/** for test convenience */
public static DocumentLanguageData empty() {
return new DocumentLanguageData(
new DocumentSentence[0],
new DocumentSentence[0],
new TObjectIntHashMap<>(),
""
);
public DocumentLanguageData(List<DocumentSentence> sentences,
String text) {
this.sentences = sentences.toArray(DocumentSentence[]::new);
this.text = text;
}

public List<DocumentSentence> findSentencesForTag(HtmlTag tag) {
return Arrays.stream(sentences).filter(s -> s.htmlTags.contains(tag)).toList();
}

public int totalNumWords() {
int ret = 0;

for (int i = 0; i < sentences.length; i++) {
ret += sentences[i].length();
}

return ret;
}

public Stream<String> streamLowerCase() {
return Arrays.stream(sentences).map(sent -> sent.wordsLowerCase).flatMap(Arrays::stream);
}

public Stream<String> stream() {
return Arrays.stream(sentences).map(sent -> sent.words).flatMap(Arrays::stream);
}

public long localitySensitiveHashCode() {
var hash = new EasyLSH();

@@ -2,52 +2,55 @@ package nu.marginalia.language.model;


import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.jetbrains.annotations.NotNull;

import java.lang.ref.SoftReference;
import java.util.BitSet;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.StringJoiner;

public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
public final String originalSentence;
public final String[] words;
public final int[] separators;

/** A span of words in a sentence */

public final String[] wordsLowerCase;
public final String[] posTags;
public final String[] stemmedWords;
public final String[] ngrams;
public final String[] ngramStemmed;

public final EnumSet<HtmlTag> htmlTags;

private final BitSet isStopWord;
private final BitSet separators;
private final BitSet isCapitalized;
private final BitSet isAllCaps;



public SoftReference<WordSpan[]> keywords;

public DocumentSentence(String originalSentence,
String[] words,
int[] separators,
public DocumentSentence(BitSet separators,
String[] wordsLowerCase,
String[] posTags,
String[] stemmedWords,
String[] ngrams,
String[] ngramsStemmed
EnumSet<HtmlTag> htmlTags,
BitSet isCapitalized,
BitSet isAllCaps
)
{
this.originalSentence = originalSentence;
this.words = words;
this.separators = separators;
this.wordsLowerCase = wordsLowerCase;
this.posTags = posTags;
this.stemmedWords = stemmedWords;
this.htmlTags = htmlTags;
this.isCapitalized = isCapitalized;
this.isAllCaps = isAllCaps;

isStopWord = new BitSet(words.length);
isStopWord = new BitSet(wordsLowerCase.length);

this.ngrams = ngrams;
this.ngramStemmed = ngramsStemmed;

for (int i = 0; i < words.length; i++) {
if (WordPatterns.isStopWord(words[i]))
for (int i = 0; i < wordsLowerCase.length; i++) {
if (WordPatterns.isStopWord(wordsLowerCase[i]))
isStopWord.set(i);
}
}
@@ -55,14 +58,22 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
public boolean isStopWord(int idx) {
return isStopWord.get(idx);
}
public void setIsStopWord(int idx, boolean val) {
if (val)
isStopWord.set(idx);
else
isStopWord.clear();
}

public int length() {
return words.length;
return wordsLowerCase.length;
}

public boolean isCapitalized(int i) {
return isCapitalized.get(i);
}
public boolean isAllCaps(int i) {
return isAllCaps.get(i);
}
public boolean isSeparatorSpace(int i) {
return separators.get(i);
}
public boolean isSeparatorComma(int i) {
return !separators.get(i);
}

public String constructWordFromSpan(WordSpan span) {
@@ -140,9 +151,9 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < words.length; i++) {
sb.append(words[i]).append('[').append(posTags[i]).append(']');
if (separators[i] == WordSeparator.COMMA) {
for (int i = 0; i < wordsLowerCase.length; i++) {
sb.append(wordsLowerCase[i]).append('[').append(posTags[i]).append(']');
if (isSeparatorComma(i)) {
sb.append(',');
}
else {
@@ -176,11 +187,10 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
this.pos = pos;
}

public String word() { return words[pos]; }
public String word() { return wordsLowerCase[pos]; }
public String wordLowerCase() { return wordsLowerCase[pos]; }
public String posTag() { return posTags[pos]; }
public String stemmed() { return stemmedWords[pos]; }
public int separator() { return separators[pos]; }
public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); }

public WordRep rep() {
@@ -1,6 +0,0 @@
package nu.marginalia.language.model;

public final class WordSeparator {
public static final int COMMA = 0;
public static final int SPACE = 1;
}
@@ -1,23 +1,23 @@
package nu.marginalia.language.sentence;

import com.github.datquocnguyen.RDRPOSTagger;
import gnu.trove.map.hash.TObjectIntHashMap;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import nu.marginalia.language.sentence.tag.HtmlStringTagger;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.language.sentence.tag.HtmlTaggedString;
import nu.marginalia.segmentation.NgramLexicon;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.NotNull;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.inject.Inject;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -38,14 +38,13 @@ public class SentenceExtractor {
private final PorterStemmer porterStemmer = new PorterStemmer();
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);

private static final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner();
private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner();

/* Truncate sentences longer than this. This is mostly a defense measure against malformed data
* that might otherwise use an undue amount of processing power. 250 words is about 10X longer than
* this comment. */
private static final int MAX_SENTENCE_LENGTH = 250;
private static final int MAX_TEXT_LENGTH = 65536;
static final int MAX_SENTENCE_LENGTH = 250;
static final int MAX_SENTENCE_COUNT = 1000;

@SneakyThrows @Inject
public SentenceExtractor(LanguageModels models)
@@ -75,219 +74,224 @@ public class SentenceExtractor {

}



public DocumentLanguageData extractSentences(Document doc) {
var clone = doc.clone();
tagCleaner.clean(clone);

final String text = asText(clone);
final DocumentSentence[] textSentences = extractSentencesFromString(text);
final List<HtmlTaggedString> taggedStrings = HtmlStringTagger.tagDocumentStrings(doc);
final List<DocumentSentence> textSentences = new ArrayList<>();

String title = getTitle(clone, textSentences);
final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum();
final StringBuilder documentText = new StringBuilder(totalTextLength + taggedStrings.size());

TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts, text);
for (var taggedString : taggedStrings) {
String text = taggedString.string();

textSentences.addAll(
extractSentencesFromString(text, taggedString.tags())
);

if (documentText.isEmpty()) {
documentText.append(text);
}
else {
documentText.append(' ').append(text);
}
}

return new DocumentLanguageData(textSentences, documentText.toString());
}

public DocumentLanguageData extractSentences(String text, String title) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);
var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class));
var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE));

TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts, text);
List<DocumentSentence> combined = new ArrayList<>(textSentences.size() + titleSentences.size());
combined.addAll(titleSentences);
combined.addAll(textSentences);

return new DocumentLanguageData(
combined,
text);
}

private String getTitle(Document doc, DocumentSentence[] textSentences) {
String title = doc.getElementsByTag("title").text() + " . " +
Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse("");
public DocumentSentence extractSentence(String text, EnumSet<HtmlTag> htmlTags) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH);

if (title.trim().length() < 3) {
title = doc.getElementsByTag("h2").text();
}
String[] words = wordsAndSeps.words();
BitSet seps = wordsAndSeps.separators();
String[] lc = new String[words.length];
String[] stemmed = new String[words.length];

if (title.trim().length() < 3) {
for (DocumentSentence textSentence : textSentences) {
if (textSentence.length() > 0) {
title = textSentence.originalSentence.toLowerCase();
break;
}
BitSet isCapitalized = new BitSet(words.length);
BitSet isAllCaps = new BitSet(words.length);

for (int i = 0; i < words.length; i++) {
lc[i] = stripPossessive(words[i].toLowerCase());

if (words[i].length() > 0 && Character.isUpperCase(words[i].charAt(0))) {
isCapitalized.set(i);
}
}

return title;
}


@NotNull
private TObjectIntHashMap<String> calculateWordCounts(DocumentSentence[] textSentences) {
TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(textSentences.length*10, 0.5f, 0);

for (var sent : textSentences) {
for (var word : sent.stemmedWords) {
counts.adjustOrPutValue(word, 1, 1);
}
}
return counts;
}

public DocumentSentence extractSentence(String text) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text);

var words = wordsAndSeps.words;
var seps = wordsAndSeps.separators;
var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);

List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, words);

String[] ngramsWords = new String[ngrams.size()];
String[] ngramsStemmedWords = new String[ngrams.size()];
for (int i = 0; i < ngrams.size(); i++) {
String[] ngram = ngrams.get(i);

StringJoiner ngramJoiner = new StringJoiner("_");
StringJoiner stemmedJoiner = new StringJoiner("_");
for (String s : ngram) {
ngramJoiner.add(s);
stemmedJoiner.add(porterStemmer.stem(s));
if (StringUtils.isAllUpperCase(words[i])) {
isAllCaps.set(i);
}

ngramsWords[i] = ngramJoiner.toString();
ngramsStemmedWords[i] = stemmedJoiner.toString();
}


return new DocumentSentence(
SentenceExtractorStringUtils.sanitizeString(text),
words,
seps,
lc,
rdrposTagger.tagsForEnSentence(words),
stemSentence(lc),
ngramsWords,
ngramsStemmedWords
);
}

public DocumentSentence[] extractSentencesFromString(String text) {
String[] sentences;

String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);

try {
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
}
catch (Exception ex) {
// shitty fallback logic
sentences = StringUtils.split(textNormalizedSpaces, '.');
}

sentences = sentencePrecleaner.clean(sentences);

final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
final String[][] posTags = new String[sentences.length][];
final String[][] tokensLc = new String[sentences.length][];
final String[][] stemmedWords = new String[sentences.length][];

for (int i = 0; i < tokens.length; i++) {

var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]);
tokens[i] = wordsAndSeps.words;
separators[i] = wordsAndSeps.separators;

if (tokens[i].length > MAX_SENTENCE_LENGTH) {
tokens[i] = Arrays.copyOf(tokens[i], MAX_SENTENCE_LENGTH);
separators[i] = Arrays.copyOf(separators[i], MAX_SENTENCE_LENGTH);
}

for (int j = 0; j < tokens[i].length; j++) {
while (tokens[i][j].endsWith(".")) {
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
}
}
}

for (int i = 0; i < tokens.length; i++) {
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
}

for (int i = 0; i < tokens.length; i++) {
tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]);
}

for (int i = 0; i < tokens.length; i++) {
stemmedWords[i] = stemSentence(tokensLc[i]);
}

DocumentSentence[] ret = new DocumentSentence[sentences.length];
for (int i = 0; i < ret.length; i++) {
String fullString;

if (i == 0) {
fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]);
}
else {
fullString = "";
}

List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]);

String[] ngramsWords = new String[ngrams.size()];
String[] ngramsStemmedWords = new String[ngrams.size()];

for (int j = 0; j < ngrams.size(); j++) {
String[] ngram = ngrams.get(j);

StringJoiner ngramJoiner = new StringJoiner("_");
StringJoiner stemmedJoiner = new StringJoiner("_");
for (String s : ngram) {
ngramJoiner.add(s);
stemmedJoiner.add(porterStemmer.stem(s));
}

ngramsWords[j] = ngramJoiner.toString();
ngramsStemmedWords[j] = stemmedJoiner.toString();
}


ret[i] = new DocumentSentence(fullString,
tokens[i],
separators[i],
tokensLc[i],
posTags[i],
stemmedWords[i],
ngramsWords,
ngramsStemmedWords
);
}
return ret;
}

private String[] stemSentence(String[] strings) {
String[] stemmed = new String[strings.length];
for (int i = 0; i < stemmed.length; i++) {
var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]);
try {
stemmed[i] = porterStemmer.stem(sent);
stemmed[i] = porterStemmer.stem(lc[i]);
}
catch (Exception ex) {
stemmed[i] = "NN"; // ???
}
}
return stemmed;

return new DocumentSentence(
seps,
lc,
rdrposTagger.tagsForEnSentence(words),
stemmed,
htmlTags,
isCapitalized,
isAllCaps
);
}

public String asText(Document dc) {
String text = dc.getElementsByTag("body").text();
public List<DocumentSentence> extractSentencesFromString(String text, EnumSet<HtmlTag> htmlTags) {
String[] sentences;

if (text.length() > MAX_TEXT_LENGTH) {
return text.substring(0, MAX_TEXT_LENGTH);
// Normalize spaces

text = normalizeSpaces(text);

// Split into sentences

try {
sentences = sentenceDetector.sentDetect(text);
}
catch (Exception ex) {
// shitty fallback logic
sentences = StringUtils.split(text, '.');
}

sentences = sentencePrecleaner.clean(sentences);

// Truncate the number of sentences if it exceeds the maximum, to avoid
// excessive processing time on malformed data

if (sentences.length > MAX_SENTENCE_COUNT) {
sentences = Arrays.copyOf(sentences, MAX_SENTENCE_COUNT);
}

final boolean isNaturalLanguage = htmlTags.stream().noneMatch(tag -> tag.nonLanguage);

List<DocumentSentence> ret = new ArrayList<>(sentences.length);

if (isNaturalLanguage) {
// Natural language text; do POS tagging and stemming

for (String sent : sentences) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = rdrposTagger.tagsForEnSentence(tokens);
var tokensLc = new String[tokens.length];
var stemmed = new String[tokens.length];

BitSet isCapitalized = new BitSet(tokens.length);
BitSet isAllCaps = new BitSet(tokens.length);

for (int i = 0; i < tokens.length; i++) {
if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
isCapitalized.set(i);
}
if (StringUtils.isAllUpperCase(tokens[i])) {
isAllCaps.set(i);
}

var originalVal = tokens[i];
var newVal = stripPossessive(originalVal.toLowerCase());

if (Objects.equals(originalVal, newVal)) {
tokensLc[i] = originalVal;
} else {
tokensLc[i] = newVal;
}

try {
stemmed[i] = porterStemmer.stem(tokens[i]);
}
catch (Exception ex) {
stemmed[i] = "NN"; // ???
}
}
ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps));
}
}
else {
return text.substring(0, (int) (text.length() * 0.95));
// non-language text, e.g. program code; don't bother with POS tagging or stemming
// as this is not likely to be useful

for (String sent : sentences) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH);
var tokens = wordsAndSeps.words();
var separators = wordsAndSeps.separators();
var posTags = new String[tokens.length];
Arrays.fill(posTags, "X"); // Placeholder POS tag
var tokensLc = new String[tokens.length];
var stemmed = new String[tokens.length];

BitSet isCapitalized = new BitSet(tokens.length);
BitSet isAllCaps = new BitSet(tokens.length);

for (int i = 0; i < tokensLc.length; i++) {
var originalVal = tokens[i];

if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) {
isCapitalized.set(i);
}
if (StringUtils.isAllUpperCase(tokens[i])) {
isAllCaps.set(i);
}

if (StringUtils.isAllLowerCase(originalVal)) {
tokensLc[i] = originalVal;
} else {
tokensLc[i] = originalVal.toLowerCase();
}
stemmed[i] = tokensLc[i]; // we don't stem non-language words
}

ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isAllCaps, isCapitalized));
}

}

return ret;
}

public static String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}

public static String stripPossessive(String s) {
int end = s.length();

if (s.endsWith("'")) {
return s.substring(0, end-1);
}

if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}

return s;
}

}
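Taken together, the reworked SentenceExtractor API is used roughly as follows. This is a sketch assembled from the call sites elsewhere in this diff (TitleKeywords, AnchorTextKeywords, and the tests); the wrapper class and method names are hypothetical:

    import java.util.EnumSet;
    import java.util.List;

    import nu.marginalia.language.model.DocumentLanguageData;
    import nu.marginalia.language.model.DocumentSentence;
    import nu.marginalia.language.sentence.SentenceExtractor;
    import nu.marginalia.language.sentence.tag.HtmlTag;
    import org.jsoup.Jsoup;

    class UsageSketch {
        DocumentSentence demo(SentenceExtractor se, String html) {
            // Whole-document extraction; each sentence carries the HTML tags it was nested in
            DocumentLanguageData dld = se.extractSentences(Jsoup.parse(html));

            // Consumers filter by tag instead of reading the removed titleSentences array
            List<DocumentSentence> titleSentences = dld.findSentencesForTag(HtmlTag.TITLE);

            // Single-string extraction now requires the caller to state the tag context
            return se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class));
        }
    }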
@@ -1,40 +0,0 @@
package nu.marginalia.language.sentence;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.TextNode;

import java.util.regex.Pattern;

public class SentenceExtractorHtmlTagCleaner {
public final int MAX_CODE_TAG_LENGTH = 32;
public final Pattern codeTagJunkPattern = Pattern.compile("(\\.|&lt;|&gt;|<|>|\\([^)]*\\)[;]?$)");

public void clean(Document doc) {
cleanCodeTags(doc);

doc.select("nav,form,input,code,body>title").remove();

// Create "sentences" out of elements that sometimes lack a period at the end to help
// NLP work better
doc.select("li,h1,h2,h3,h4,h5,h6,td,th,p,div,title").forEach(e -> e.appendText(". "));
doc.select("br,hr").forEach(e -> e.prependText(". "));
}

private void cleanCodeTags(Document doc) {
for (var codeTag : doc.getElementsByTag("code")) {
var text = codeTag.text();

if (text.length() <= MAX_CODE_TAG_LENGTH) {
codeTag.replaceWith(new TextNode(trimCodeTagContents(text)));
}
else {
codeTag.remove();
}

}
}

private String trimCodeTagContents(String text) {
return codeTagJunkPattern.matcher(text).replaceAll(" ");
}
}
@@ -1,93 +0,0 @@
package nu.marginalia.language.sentence;

import java.util.Arrays;
import java.util.Objects;

public class SentenceExtractorStringUtils {

public static String sanitizeString(String s) {
char[] newChars = new char[s.length()];
int pi = 0;
boolean changed = false;
for (int i = 0; i < newChars.length; i++) {
char c = s.charAt(i);
if (!isBadChar(c)) {
newChars[pi++] = c;
}
else {
changed = true;
newChars[pi++] = ' ';
}
}

if (changed) {
s = new String(newChars, 0, pi);
}

if (s.startsWith(".")) {
s = s.substring(1);
}

if (s.isBlank()) {
return "";
}

return s;

}

private static boolean isBadChar(char c) {
if (c >= 'a' && c <= 'z') return false;
if (c >= 'A' && c <= 'Z') return false;
if (c >= '0' && c <= '9') return false;
if ("_#@.".indexOf(c) >= 0) return false;
if (c >= '\u00C0' && c <= '\u00D6') return false;
if (c >= '\u00D8' && c <= '\u00F6') return false;
if (c >= '\u00F8' && c <= '\u00FF') return false;

return true;
}

public static String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}


public static String toLowerCaseStripPossessive(String word) {
String val = stripPossessive(word).toLowerCase();

if (Objects.equals(val, word)) {
return word;
}

return val;
}

public static String[] toLowerCaseStripPossessive(String[] words) {
String[] lc = new String[words.length];
Arrays.setAll(lc, i ->SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i]));
return lc;
}

public static String stripPossessive(String s) {
int end = s.length();

if (s.endsWith("'")) {
return s.substring(0, end-1);
}

if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}

return s;
}

}
@@ -7,12 +7,9 @@ import java.util.regex.Pattern;

public class SentencePreCleaner {
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
private final int maxSentenceCount = 250;
private final int maxTotalLength = 20 * maxSentenceCount;

public String[] clean(String[] sentences) {

int totalLength = 0;
int sentenceCount = 0;

List<String> sentenceList = new ArrayList<>();
@@ -20,10 +17,9 @@ public class SentencePreCleaner {

if (s.isBlank()) continue;

totalLength+=s.length();
sentenceCount++;

if (totalLength > maxTotalLength && sentenceCount++ > maxSentenceCount) {
if (sentenceCount++ > SentenceExtractor.MAX_SENTENCE_COUNT) {
break;
}
@@ -2,25 +2,18 @@ package nu.marginalia.language.sentence;

import com.google.common.base.CharMatcher;
import gnu.trove.list.array.TIntArrayList;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.model.WordSeparator;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import java.util.regex.Pattern;

import static nu.marginalia.language.WordPatterns.*;
import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH;

public class SentenceSegmentSplitter {

@AllArgsConstructor
@Getter
public static class SeparatedSentence {
String[] words;
int[] separators;
}
public record SeparatedSentence(String[] words, BitSet separators) { }

private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-");

@@ -43,7 +36,7 @@ public class SentenceSegmentSplitter {
* @param segment The sentence to split
* @return A list of words and separators
*/
public static SeparatedSentence splitSegment(String segment) {
public static SeparatedSentence splitSegment(String segment, int maxLength) {
String flatSegment = AsciiFlattener.flattenUnicode(segment);

var matcher = wordBreakPattern.matcher(flatSegment);
@@ -77,7 +70,7 @@ public class SentenceSegmentSplitter {
}

List<String> ret = new ArrayList<>(words.size());
TIntArrayList seps = new TIntArrayList(words.size());
BitSet seps = new BitSet(separators.size());

String[] parts = words.toArray(String[]::new);
for (int i = 0; i < parts.length; i++) {
@@ -89,7 +82,9 @@ public class SentenceSegmentSplitter {
continue;

ret.add(parts[i]);
seps.add(separators.getQuick(i));
if (separators.getQuick(i) > 0) {
seps.set(i);
}
}

for (int i = 0; i < ret.size(); i++) {
@@ -101,13 +96,26 @@ public class SentenceSegmentSplitter {
if (part.endsWith("'") && part.length() > 1) {
ret.set(i, part.substring(0, part.length()-1));
}
while (part.endsWith(".")) {
part = part.substring(0, part.length()-1);
ret.set(i, part);
}
}

if (ret.size() > maxLength) {
ret.subList(maxLength, ret.size()).clear();
seps = seps.get(0, maxLength);
}

return new SeparatedSentence(
ret.toArray(String[]::new),
seps.toArray()
seps
);
}

public static final class WordSeparator {
public static final int COMMA = 0;
public static final int SPACE = 1;
}
}
@@ -0,0 +1,122 @@
package nu.marginalia.language.sentence.tag;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;

import java.util.*;

/** A class that tags strings in an HTML document with the HTML tags that are active at that point in the document. */
public class HtmlStringTagger implements NodeVisitor {
private List<HtmlTag> tagStack = new ArrayList<>(8);
private Set<Element> stackTags = new HashSet<>(8);
private StringBuilder currentString = new StringBuilder(256);

HtmlStringTagger() {}

public static List<HtmlTaggedString> tagDocumentStrings(Document document) {
var tagger = new HtmlStringTagger();
document.traverse(tagger);
return tagger.getOutput();
}

private List<HtmlTaggedString> output = new ArrayList<>();

public List<HtmlTaggedString> getOutput() {
List<HtmlTaggedString> compactedOutput = new ArrayList<>(output.size());

for (var ts : output) {
if (compactedOutput.isEmpty()) {
compactedOutput.add(ts);
}
else {
var last = compactedOutput.getLast();
if (last.tags().equals(ts.tags())) {
last.append(ts.string());
}
else {
compactedOutput.add(ts);
}
}
}

return output;
}


@Override
public void head(Node node, int i) {
if (node instanceof Element el) {
String tagName = el.tagName();
switch (tagName) {
case "script" -> pushTag(HtmlTag.SCRIPT, el);
case "style" -> pushTag(HtmlTag.STYLE, el);
case "code" -> pushTag(HtmlTag.CODE, el);
case "title" -> pushTag(HtmlTag.TITLE, el);
case "nav" -> pushTag(HtmlTag.NAV, el);
case "header" -> pushTag(HtmlTag.HEADER, el);
case "footer" -> pushTag(HtmlTag.FOOTER, el);
case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el);
}
}
else if (node instanceof TextNode tn) {
if (shouldProcess()) {
String tnText = tn.text();
if (!tnText.isBlank()) {
currentString = currentString.append(' ').append(tnText.trim());
}
}
}
}

@Override
public void tail(Node node, int i) {
if (!(node instanceof Element el))
return;

if (stackTags.remove(el)) {
output.add(new HtmlTaggedString(currentString, EnumSet.copyOf(tagStack)));
tagStack.removeLast();
currentString = new StringBuilder();
}
else if ("#root".equals(el.tagName())) {
closeOngoingTag();
}
}

private void pushTag(HtmlTag tag, Element el) {
closeOngoingTag();

tagStack.add(tag);
stackTags.add(el);
}

private void closeOngoingTag() {
if (currentString.isEmpty()) {
return;
}

EnumSet<HtmlTag> tags;
if (tagStack.isEmpty()) {
tags = EnumSet.noneOf(HtmlTag.class);
}
else {
tags = EnumSet.copyOf(tagStack);
}

output.add(new HtmlTaggedString(currentString, tags));
currentString = new StringBuilder();
}

public boolean shouldProcess() {
for (var tag : tagStack) {
if (tag.exclude) {
return false;
}
}
return true;
}

}
@@ -0,0 +1,21 @@
package nu.marginalia.language.sentence.tag;

public enum HtmlTag {
SCRIPT(true, false),
STYLE(true, false),
CODE(false, true),
PRE(false, true),
TITLE(false, false),
HEADING(false, false),
NAV(false, false),
HEADER(false, false),
FOOTER(false, false);

public boolean exclude;
public boolean nonLanguage;

HtmlTag(boolean exclude, boolean nonLanguage) {
this.exclude = exclude;
this.nonLanguage = nonLanguage;
}
}
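The two flags on HtmlTag drive two separate decisions in this change: exclude drops the text entirely (script and style contents never reach the extractor), while nonLanguage keeps the text but routes it around POS tagging and stemming (code and pre). A small sketch of how the flags are consumed, mirroring HtmlStringTagger.shouldProcess() and the isNaturalLanguage check in SentenceExtractor (the helper class itself is hypothetical):

    import java.util.EnumSet;

    class HtmlTagPolicySketch {
        // Mirrors HtmlStringTagger.shouldProcess(): text under an 'exclude' tag is dropped
        static boolean shouldProcess(EnumSet<HtmlTag> tags) {
            return tags.stream().noneMatch(tag -> tag.exclude);
        }

        // Mirrors the SentenceExtractor branch: 'nonLanguage' text is kept,
        // but skips POS tagging and stemming
        static boolean isNaturalLanguage(EnumSet<HtmlTag> tags) {
            return tags.stream().noneMatch(tag -> tag.nonLanguage);
        }
    }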
@@ -0,0 +1,33 @@
package nu.marginalia.language.sentence.tag;

import java.util.EnumSet;

public class HtmlTaggedString {
private StringBuilder string;
private final EnumSet<HtmlTag> tags;

public HtmlTaggedString(StringBuilder string, EnumSet<HtmlTag> tags) {
this.tags = tags;
this.string = string;
}

public String string() {
return string.toString();
}

public EnumSet<HtmlTag> tags() {
return tags;
}

public void append(String s) {
string.append(' ').append(s);
}

public String toString() {
return "[" + tags.toString() + ":" + string.toString() + "]";
}

public int length() {
return string.length();
}
}
@@ -1,28 +0,0 @@
package nu.marginalia.language.encoding;

import nu.marginalia.language.sentence.SentenceExtractorHtmlTagCleaner;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

class SentenceExtractorHtmlTagCleanerTest {

final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner();

public String cleanTag(String text) {
var doc = Jsoup.parse(text);
tagCleaner.clean(doc);
return doc.text();
}

@Test
public void testBriefCodeTag() {
assertEquals("hello", cleanTag("<code>hello</code>"));
assertEquals("System out println", cleanTag("<code>System.out.println</code>"));
assertEquals("hello", cleanTag("<code>hello()</code>"));
assertEquals("hello", cleanTag("<code>&lt;hello&gt;</code>"));
assertEquals("hello", cleanTag("<code>hello(p,q)</code>"));
assertEquals("hello", cleanTag("<code>hello(p,q);</code>"));
}
}
@@ -1,14 +1,17 @@
package nu.marginalia.language.sentence;

import nu.marginalia.WmsaHome;
import nu.marginalia.language.sentence.tag.HtmlTag;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.EnumSet;
import java.util.Objects;

import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;

class SentenceExtractorTest {
private static SentenceExtractor sentenceExtractor;
@@ -20,26 +23,25 @@ class SentenceExtractorTest {

@Test
void testParen() {
var dld = sentenceExtractor.extractSentence("I am (very) tall");
var dld = sentenceExtractor.extractSentence("I am (very) tall", EnumSet.noneOf(HtmlTag.class));

System.out.println(dld);
}

@Test
void testPolishArtist() {
var dld = sentenceExtractor.extractSentence("Uklański");
var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class));

assertEquals(1, dld.words.length);
assertEquals("Uklanski", dld.words[0]);
assertEquals(1, dld.wordsLowerCase.length);
assertEquals("uklanski", dld.wordsLowerCase[0]);
}

@Test
void testJava() {
var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API");
var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class));

assertEquals(4, dld.words.length);
assertArrayEquals(new String[] {"Foreign", "Function", "Memory", "API"}, dld.words);
assertEquals(4, dld.wordsLowerCase.length);
assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase);
}

@Test
@@ -77,10 +79,9 @@ class SentenceExtractorTest {
}
@Test
void testApostrophe() {
var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun");
assertEquals(7, dld.words.length);
var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class));
assertEquals(7, dld.wordsLowerCase.length);

assertArrayEquals(new String[] { "duke", "nuke", "em's", "big", "ol", "big", "gun"}, dld.words);
assertArrayEquals(new String[] { "duke", "nuke", "em", "big", "ol", "big", "gun"}, dld.wordsLowerCase);
}
}
@@ -0,0 +1,29 @@
package nu.marginalia.language.sentence.tag;

import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;

class HtmlStringTaggerTest {
@Test
public void test() {
String html = """
<!DOCTYPE html>
<html>
<head>
<title>T Example</title>
</head>
<body>
<h1>H1 Example</h1>
<p>This is an example.</p>
<p>Here is more text.</p>
<p>And more text <a href="#">with a link</a> and more text.</p>
<code>#include &lt;stdlib.h&gt;</code>
<h3>Good bye</h3>
</body>
""";
var visitor = new HtmlStringTagger();
Jsoup.parse(html).traverse(visitor);

visitor.getOutput().forEach(ts -> System.out.println(ts.string() + " " + ts.tags()));
}
}
@@ -39,10 +39,6 @@ public class TitleExtractor {
title = getFirstTagText(doc, "h5");
if (title != null) return title;

if (dld.sentences.length > 0) {
return dld.sentences[0].originalSentence;
}

return url;
}