Refactor sentence extractor to break it apart into more readable chunks

Viktor Lofgren 2023-01-30 09:36:11 +01:00
parent ed728b2680
commit 50862a2081
15 changed files with 323 additions and 267 deletions
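In rough terms, the refactor splits the old monolithic SentenceExtractor into a dedicated sentence package: SentenceExtractor keeps the high-level extraction API, tokenization moves to SentenceSegmentSplitter, and string clean-up moves to SentenceExtractorStringUtils. A minimal sketch of how the pieces are meant to fit together after this change (class and method names are taken from the diffs below; the wiring and variable names are illustrative, not an exact call graph):

// Illustrative usage of the refactored classes; assumes a LanguageModels instance is available.
SentenceExtractor se = new SentenceExtractor(languageModels);

// High-level API, unchanged for callers:
DocumentLanguageData dld = se.extractSentences(jsoupDocument);
DocumentSentence sentence = se.extractSentence("A single segment of text");

// Tokenization now lives in SentenceSegmentSplitter...
var split = SentenceSegmentSplitter.splitSegment("Hello, world");

// ...and string normalization in SentenceExtractorStringUtils.
String clean = SentenceExtractorStringUtils.sanitizeString("Hello, world!");
String[] lower = SentenceExtractorStringUtils.toLowerCaseStripPossessive(split.getWords());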

View File

@ -4,7 +4,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordCounter;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.NameCounter;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
@ -68,9 +68,6 @@ public class DocumentDebugger {
Set<String> reps = new HashSet<>();
// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
for (var sent : languageData.titleSentences) {

View File

@ -2,12 +2,13 @@ package nu.marginalia.util.language.processing.model;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import java.util.Arrays;
import java.util.stream.Stream;
/**
* @see nu.marginalia.util.language.processing.SentenceExtractor
* @see SentenceExtractor
*/
@AllArgsConstructor
public class DocumentLanguageData {

View File

@ -1,16 +1,14 @@
package nu.marginalia.util.language.processing;
package nu.marginalia.util.language.processing.sentence;
import com.github.datquocnguyen.RDRPOSTagger;
import com.github.jknack.handlebars.internal.lang3.StringUtils;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.SneakyThrows;
import nu.marginalia.util.StringPool;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.HtmlTagCleaner;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
@ -24,25 +22,22 @@ import javax.inject.Inject;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.*;
import java.util.regex.Pattern;
import static nu.marginalia.util.language.WordPatterns.*;
public class SentenceExtractor {
private SentenceDetectorME sentenceDetector;
private final RDRPOSTagger rdrposTagger;
private final PorterStemmer porterStemmer = new PorterStemmer();
private boolean legacyMode = false;
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
private static final HtmlTagCleaner tagCleaner = new HtmlTagCleaner();
private final ThreadLocal<StringPool> stringPool = ThreadLocal.withInitial(() -> StringPool.create(10_000));
@SneakyThrows @Inject
public SentenceExtractor(LanguageModels models) {
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
@ -66,6 +61,22 @@ public class SentenceExtractor {
final String text = asText(doc);
final DocumentSentence[] textSentences = extractSentencesFromString(text);
String title = getTitle(doc, textSentences);
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts);
}
public DocumentLanguageData extractSentences(String text, String title) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
}
private String getTitle(Document doc, DocumentSentence[] textSentences) {
String title = doc.getElementsByTag("title").text() + " . " +
Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse("");
@ -82,34 +93,7 @@ public class SentenceExtractor {
}
}
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts);
}
public DocumentLanguageData extractSentences(String text) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);
String title = "";
for (DocumentSentence textSentence : textSentences) {
if (textSentence.length() > 0) {
title = textSentence.originalSentence.toLowerCase();
break;
}
}
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
}
public DocumentLanguageData extractSentences(String text, String title) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
return title;
}
@ -125,79 +109,95 @@ public class SentenceExtractor {
return counts;
}
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
private boolean isBadChar(char c) {
if (c >= 'a' && c <= 'z') return false;
if (c >= 'A' && c <= 'Z') return false;
if (c >= '0' && c <= '9') return false;
if ("_#@.".indexOf(c) >= 0) return false;
if (c >= '\u00C0' && c <= '\u00D6') return false;
if (c >= '\u00D8' && c <= '\u00F6') return false;
if (c >= '\u00F8' && c <= '\u00FF') return false;
return true;
}
private String sanitizeString(String s) {
char[] newChars = new char[s.length()];
int pi = 0;
for (int i = 0; i < newChars.length; i++) {
char c = s.charAt(i);
if (!isBadChar(c)) {
newChars[pi++] = c;
}
else {
newChars[pi++] = ' ';
}
}
s = new String(newChars, 0, pi);
if (s.startsWith(".")) {
s = s.substring(1);
if (s.isBlank())
return "";
}
return s;
}
public DocumentSentence extractSentence(String text) {
var wordsAndSeps = splitSegment(text);
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text);
var words = wordsAndSeps.words;
var seps = wordsAndSeps.separators;
var lc = toLc(wordsAndSeps.words);
var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
return new DocumentSentence(
sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
);
}
public String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}
public DocumentSentence[] extractSentencesFromString(String text) {
String[] sentences;
String textNormalizedSpaces = normalizeSpaces(text);
String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
try {
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
}
catch (Exception ex) {
// shitty fallback logic
sentences = StringUtils.split(textNormalizedSpaces, '.');
}
sentences = preCleanSentences(sentences);
final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
final String[][] posTags = new String[sentences.length][];
final String[][] tokensLc = new String[sentences.length][];
final String[][] stemmedWords = new String[sentences.length][];
for (int i = 0; i < tokens.length; i++) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]);
tokens[i] = wordsAndSeps.words;
separators[i] = wordsAndSeps.separators;
if (tokens[i].length > 250) {
tokens[i] = Arrays.copyOf(tokens[i], 250);
separators[i] = Arrays.copyOf(separators[i], 250);
}
for (int j = 0; j < tokens[i].length; j++) {
while (tokens[i][j].endsWith(".")) {
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
}
}
}
var sPool = stringPool.get();
for (int i = 0; i < tokens.length; i++) {
tokens[i] = sPool.internalize(tokens[i]);
}
for (int i = 0; i < tokens.length; i++) {
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
// don't need to internalize this
}
for (int i = 0; i < tokens.length; i++) {
tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]);
tokensLc[i] = sPool.internalize(tokensLc[i]);
}
for (int i = 0; i < tokens.length; i++) {
stemmedWords[i] = stemSentence(tokensLc[i]);
stemmedWords[i] = sPool.internalize(stemmedWords[i]);
}
DocumentSentence[] ret = new DocumentSentence[sentences.length];
for (int i = 0; i < ret.length; i++) {
String fullString;
if (i == 0) {
fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]);
}
else {
fullString = "";
}
ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
}
return ret;
}
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
private String[] preCleanSentences(String[] sentences) {
if (sentences.length > 250) {
sentences = Arrays.copyOf(sentences, 250);
}
@ -212,53 +212,13 @@ public class SentenceExtractor {
sentenceList.add(s);
}
}
sentences = sentenceList.toArray(String[]::new);
final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
final String[][] posTags = new String[sentences.length][];
final String[][] tokensLc = new String[sentences.length][];
final String[][] stemmedWords = new String[sentences.length][];
for (int i = 0; i < tokens.length; i++) {
var wordsAndSeps = splitSegment(sentences[i]); //tokenizer.tokenize(sentences[i]);
tokens[i] = wordsAndSeps.words;
separators[i] = wordsAndSeps.separators;
if (tokens[i].length > 250) {
tokens[i] = Arrays.copyOf(tokens[i], 250);
separators[i] = Arrays.copyOf(separators[i], 250);
}
for (int j = 0; j < tokens[i].length; j++) {
while (tokens[i][j].endsWith(".")) {
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
}
}
}
for (int i = 0; i < tokens.length; i++) {
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
}
for (int i = 0; i < tokens.length; i++) {
tokensLc[i] = toLc(tokens[i]);
}
for (int i = 0; i < tokens.length; i++) {
stemmedWords[i] = stemSentence(tokensLc[i]);
}
DocumentSentence[] ret = new DocumentSentence[sentences.length];
for (int i = 0; i < ret.length; i++) {
ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
}
return ret;
return sentenceList.toArray(String[]::new);
}
private String[] stemSentence(String[] strings) {
String[] stemmed = new String[strings.length];
for (int i = 0; i < stemmed.length; i++) {
var sent = cleanPossessive(strings[i]);
var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]);
try {
stemmed[i] = porterStemmer.stem(sent);
}
@ -269,27 +229,6 @@ public class SentenceExtractor {
return stemmed;
}
private String cleanPossessive(String s) {
int end = s.length();
if (s.endsWith("\'")) {
return s.substring(0, end-1);
} else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
return s.substring(0, end-2).toLowerCase();
}
else {
return s;
}
}
private String[] toLc(String[] words) {
String[] lower = new String[words.length];
for (int i = 0; i < lower.length; i++) {
lower[i] = cleanPossessive(words[i]).toLowerCase();
}
return lower;
}
public String asText(Document dc) {
tagCleaner.clean(dc);
@ -299,67 +238,6 @@ public class SentenceExtractor {
return text.substring(0, (int) (text.length()*0.95));
}
@AllArgsConstructor @Getter
private static class WordsAndSeparators {
String[] words;
int[] separators;
}
private WordsAndSeparators splitSegment(String segment) {
var matcher = wordBreakPattern.matcher(segment);
List<String> words = new ArrayList<>(segment.length()/6);
TIntArrayList separators = new TIntArrayList(segment.length()/6);
int start = 0;
int wordStart = 0;
while (wordStart <= segment.length()) {
if (!matcher.find(wordStart)) {
words.add(segment.substring(wordStart));
separators.add(WordSeparator.SPACE);
break;
}
if (wordStart != matcher.start()) {
words.add(segment.substring(wordStart, matcher.start()));
separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
}
wordStart = matcher.end();
}
String[] parts = words.toArray(String[]::new);
int length = 0;
for (int i = 0; i < parts.length; i++) {
if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
parts[i] = null;
}
else {
length++;
}
}
String[] ret = new String[length];
int[] seps = new int[length];
for (int i = 0, j=0; i < parts.length; i++) {
if (parts[i] != null) {
seps[j] = separators.getQuick(i);
ret[j++] = parts[i];
}
}
for (int i = 0; i < ret.length; i++) {
if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
}
return new WordsAndSeparators(ret, seps);
}
public boolean isLegacyMode() {
return legacyMode;
}
public void setLegacyMode(boolean legacyMode) {
this.legacyMode = legacyMode;
}
}
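One behavioral detail of the rewritten extractSentencesFromString above worth calling out: only the first sentence in the returned array keeps its sanitized source text, later sentences get an empty string. An illustrative example, assuming se is a SentenceExtractor as in the sketch near the top, that the OpenNLP detector splits this input into two sentences, and that the constructor's first argument is what DocumentSentence exposes as originalSentence:

var sentences = se.extractSentencesFromString("First sentence. Second sentence.");
// sentences[0].originalSentence -> "First sentence."  (sanitized full text, kept only for index 0)
// sentences[1].originalSentence -> ""                 (empty for all subsequent sentences)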

View File

@ -0,0 +1,93 @@
package nu.marginalia.util.language.processing.sentence;
import java.util.Arrays;
import java.util.Objects;
public class SentenceExtractorStringUtils {
public static String sanitizeString(String s) {
char[] newChars = new char[s.length()];
int pi = 0;
boolean changed = false;
for (int i = 0; i < newChars.length; i++) {
char c = s.charAt(i);
if (!isBadChar(c)) {
newChars[pi++] = c;
}
else {
changed = true;
newChars[pi++] = ' ';
}
}
if (changed) {
s = new String(newChars, 0, pi);
}
if (s.startsWith(".")) {
s = s.substring(1);
}
if (s.isBlank()) {
return "";
}
return s;
}
private static boolean isBadChar(char c) {
if (c >= 'a' && c <= 'z') return false;
if (c >= 'A' && c <= 'Z') return false;
if (c >= '0' && c <= '9') return false;
if ("_#@.".indexOf(c) >= 0) return false;
if (c >= '\u00C0' && c <= '\u00D6') return false;
if (c >= '\u00D8' && c <= '\u00F6') return false;
if (c >= '\u00F8' && c <= '\u00FF') return false;
return true;
}
public static String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}
public static String toLowerCaseStripPossessive(String word) {
String val = stripPossessive(word).toLowerCase();
if (Objects.equals(val, word)) {
return word;
}
return val;
}
public static String[] toLowerCaseStripPossessive(String[] words) {
String[] lc = new String[words.length];
Arrays.setAll(lc, i -> SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i]));
return lc;
}
public static String stripPossessive(String s) {
int end = s.length();
if (s.endsWith("'")) {
return s.substring(0, end-1);
}
if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}
return s;
}
}
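For reference, the intended behavior of the new helpers, shown as hypothetical inputs and outputs (derived by reading the code above, not from tests in this commit):

// Expected results, following the logic in SentenceExtractorStringUtils above:
SentenceExtractorStringUtils.stripPossessive("dog's");             // -> "dog"
SentenceExtractorStringUtils.stripPossessive("dogs'");             // -> "dogs"
SentenceExtractorStringUtils.toLowerCaseStripPossessive("Dog's");  // -> "dog"
SentenceExtractorStringUtils.normalizeSpaces("a\tb\nc");           // -> "a b c"
SentenceExtractorStringUtils.sanitizeString(".leading dot");       // -> "leading dot"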

View File

@ -0,0 +1,72 @@
package nu.marginalia.util.language.processing.sentence;
import gnu.trove.list.array.TIntArrayList;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.util.language.WordPatterns.*;
public class SentenceSegmentSplitter {
@AllArgsConstructor
@Getter
public static class SeparatedSentence {
String[] words;
int[] separators;
}
public static SeparatedSentence splitSegment(String segment) {
var matcher = wordBreakPattern.matcher(segment);
List<String> words = new ArrayList<>(segment.length()/6);
TIntArrayList separators = new TIntArrayList(segment.length()/6);
int wordStart = 0;
while (wordStart <= segment.length()) {
if (!matcher.find(wordStart)) {
words.add(segment.substring(wordStart));
separators.add(WordSeparator.SPACE);
break;
}
if (wordStart != matcher.start()) {
words.add(segment.substring(wordStart, matcher.start()));
separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
}
wordStart = matcher.end();
}
String[] parts = words.toArray(String[]::new);
int length = 0;
for (int i = 0; i < parts.length; i++) {
if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
parts[i] = null;
}
else {
length++;
}
}
String[] ret = new String[length];
int[] seps = new int[length];
for (int i = 0, j=0; i < parts.length; i++) {
if (parts[i] != null) {
seps[j] = separators.getQuick(i);
ret[j++] = parts[i];
}
}
for (int i = 0; i < ret.length; i++) {
if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
}
return new SeparatedSentence(ret, seps);
}
}
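Roughly, splitSegment tokenizes a segment on wordBreakPattern (defined in WordPatterns, not shown in this diff), records whether each break was blank (SPACE) or punctuation (COMMA), drops blank, over-long and noisy tokens, and trims surrounding apostrophes. A hypothetical example, assuming wordBreakPattern matches the comma-plus-space and the single spaces here but not the apostrophes:

var sep = SentenceSegmentSplitter.splitSegment("Hello, brave 'world'");
// sep.getWords()      -> ["Hello", "brave", "world"]   (quotes around 'world' trimmed)
// sep.getSeparators() -> [WordSeparator.COMMA, WordSeparator.SPACE, WordSeparator.SPACE]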

View File

@ -2,9 +2,10 @@ package nu.marginalia.wmsa.edge.assistant.dict;
import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
@ -18,11 +19,10 @@ import javax.annotation.Nullable;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@ -101,12 +101,15 @@ public class TermFrequencyDict {
fjp.execute(() -> {
TLongHashSet words = new TLongHashSet(10_000);
for (var doc : domain.doc) {
if (doc.documentBody == null)
continue;
docCount.incrementAndGet();
Document parsed = Jsoup.parse(doc.documentBody);
Document parsed = Jsoup.parse(doc.documentBody.decode());
parsed.body().filter(new DomPruningFilter(0.5));
DocumentLanguageData dld = se.get().extractSentences(parsed);
@ -115,28 +118,30 @@ public class TermFrequencyDict {
return;
}
Set<String> words = new HashSet<>(10_000);
for (var sent : dld.sentences) {
for (var word : sent) {
words.add(word.stemmed());
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
}
}
fjp.execute(() -> {
synchronized (counts) {
for (var word : words) {
counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
}
}
});
synchronized (counts) {
words.forEach(w -> {
counts.adjustOrPutValue(w, 1, 1);
return true;
});
}
words.clear();
}
System.out.println(domain.domain + "\t" + counts.size());
});
}
fjp.shutdown();
fjp.awaitTermination(10, TimeUnit.SECONDS);
fjp.awaitTermination(10, TimeUnit.DAYS);
try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
synchronized (counts) {
@ -155,14 +160,6 @@ public class TermFrequencyDict {
}
System.out.println(docCount.get());
//
// counts.forEachEntry((w,c) -> {
// if (c > 3L) {
// System.out.println(w + ":" + c);
// }
// return true;
// });
}
public static long getStringHash(String s) {

View File

@ -7,7 +7,7 @@ import nu.marginalia.util.gregex.GuardedRegex;
import nu.marginalia.util.gregex.GuardedRegexFactory;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
@ -178,11 +178,13 @@ public class DocumentProcessor {
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
throws DisqualifiedException, URISyntaxException {
if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) {
String documentBody = crawledDocument.documentBody.decode();
if (languageFilter.isBlockedUnicodeRange(documentBody)) {
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
}
Document doc = Jsoup.parse(crawledDocument.documentBody);
Document doc = Jsoup.parse(documentBody);
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
// I've never encountered a website where this hasn't been a severe indicator

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow;
import com.google.inject.Inject;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;

View File

@ -1,7 +1,7 @@
package nu.marginalia.wmsa.edge.integration.wikipedia;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;

View File

@ -8,7 +8,7 @@ import lombok.Getter;
import lombok.ToString;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
@ -25,12 +25,12 @@ public class QueryVariants {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final KeywordExtractor keywordExtractor;
private final SentenceExtractor sentenceExtractor;
private final TermFrequencyDict dict;
private final PorterStemmer ps = new PorterStemmer();
private final NGramBloomFilter nGramBloomFilter;
private final EnglishDictionary englishDictionary;
private final ThreadLocal<SentenceExtractor> sentenceExtractor;
@Inject
public QueryVariants(LanguageModels lm,
@ -40,7 +40,7 @@ public class QueryVariants {
this.nGramBloomFilter = nGramBloomFilter;
this.englishDictionary = englishDictionary;
this.keywordExtractor = new KeywordExtractor();
this.sentenceExtractor = new SentenceExtractor(lm);
this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm));
this.dict = dict;
}
@ -78,10 +78,8 @@ public class QueryVariants {
final TreeMap<Integer, List<WordSpan>> byStart = new TreeMap<>();
logger.debug("Q: {}", query);
logger.debug("QAS: {}", joinedQuery);
var sentence = sentenceExtractor.extractSentence(joinedQuery.joinedQuery);
var se = sentenceExtractor.get();
var sentence = se.extractSentence(joinedQuery.joinedQuery);
for (int i = 0; i < sentence.posTags.length; i++) {
if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) {

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.ConverterModule;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
@ -63,7 +63,7 @@ public class ConverterLogicTestTool {
if (doc.documentBody == null) continue;
Runnable task = () -> {
var parsed = Jsoup.parse(doc.documentBody);
var parsed = Jsoup.parse(doc.documentBody.decode());
parsed.body().filter(new DomPruningFilter(0.5));
var dld = se.extractSentences(parsed);

View File

@ -6,15 +6,18 @@ import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.apache.commons.lang3.tuple.Pair;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
@ -26,6 +29,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.IntStream;
@Tag("slow")
class SentenceExtractorTest {
@ -38,7 +42,6 @@ class SentenceExtractorTest {
newSe = new SentenceExtractor(lm);
legacySe = new SentenceExtractor(lm);
legacySe.setLegacyMode(true);
}
@ -83,7 +86,7 @@ class SentenceExtractorTest {
var dld = se.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
Map<String, Integer> counts = new HashMap<>();
for (var sentence : dld.sentences) {
for (WordSpan kw : keywordExtractor.getNames(sentence)) {
for (WordSpan kw : keywordExtractor.getProperNames(sentence)) {
if (kw.end + 2 >= sentence.length()) {
continue;
}
@ -145,7 +148,22 @@ class SentenceExtractorTest {
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata());
System.out.println(newRes);
var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
.sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
.limit(100)
.map(Pair::getKey)
.toArray(String[]::new);
System.out.println(Arrays.toString(terms));
var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
.sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
.filter(e -> e.getValue().hasFlag(EdgePageWordFlags.Subjects))
.limit(100)
.map(Pair::getKey)
.toArray(String[]::new);
System.out.println(Arrays.toString(terms2));
System.out.println("--");
}
System.out.println(System.currentTimeMillis() - st);

View File

@ -4,7 +4,7 @@ import nu.marginalia.util.ParallelPipe;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;

View File

@ -6,7 +6,7 @@ import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.DocumentDebugger;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll;