Refactor sentence extractor to break it apart into more readable chunks
parent ed728b2680
commit 50862a2081
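
Below is the full diff. As a quick orientation, here is a minimal, hypothetical usage sketch (not part of the commit) of how the pieces split out of SentenceExtractor in this refactor, SentenceSegmentSplitter and SentenceExtractorStringUtils, fit together. The class name SentenceHelpersDemo and the sample input are invented for illustration; only the public static methods visible in the diff are used.

package nu.marginalia.util.language.processing.sentence;

// Hypothetical demo class, not part of the commit: composes the helpers that this
// refactor breaks out of SentenceExtractor.
public class SentenceHelpersDemo {
    public static void main(String[] args) {
        String raw = "The quick\tbrown fox's den.\nIt sat still.";

        // normalizeSpaces() and sanitizeString() now live in SentenceExtractorStringUtils
        String normalized = SentenceExtractorStringUtils.normalizeSpaces(raw);

        // splitSegment() is now a public static method on SentenceSegmentSplitter
        SentenceSegmentSplitter.SeparatedSentence wordsAndSeps =
                SentenceSegmentSplitter.splitSegment(normalized);

        // toLowerCaseStripPossessive() replaces the old private toLc()/cleanPossessive() pair
        String[] lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.getWords());

        System.out.println(String.join(" ", lc));
    }
}
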
@@ -4,7 +4,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordCounter;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.NameCounter;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;

@@ -68,9 +68,6 @@ public class DocumentDebugger {

Set<String> reps = new HashSet<>();

// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));

try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {

for (var sent : languageData.titleSentences) {

@@ -2,12 +2,13 @@ package nu.marginalia.util.language.processing.model;

import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;

import java.util.Arrays;
import java.util.stream.Stream;

/**
* @see nu.marginalia.util.language.processing.SentenceExtractor
* @see SentenceExtractor
*/
@AllArgsConstructor
public class DocumentLanguageData {

@@ -1,16 +1,14 @@
package nu.marginalia.util.language.processing;
package nu.marginalia.util.language.processing.sentence;

import com.github.datquocnguyen.RDRPOSTagger;
import com.github.jknack.handlebars.internal.lang3.StringUtils;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.SneakyThrows;
import nu.marginalia.util.StringPool;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.HtmlTagCleaner;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;

@@ -24,25 +22,22 @@ import javax.inject.Inject;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.*;
import java.util.regex.Pattern;

import static nu.marginalia.util.language.WordPatterns.*;

public class SentenceExtractor {

private SentenceDetectorME sentenceDetector;
private final RDRPOSTagger rdrposTagger;

private final PorterStemmer porterStemmer = new PorterStemmer();
private boolean legacyMode = false;
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);

private static final HtmlTagCleaner tagCleaner = new HtmlTagCleaner();

private final ThreadLocal<StringPool> stringPool = ThreadLocal.withInitial(() -> StringPool.create(10_000));

@SneakyThrows @Inject
public SentenceExtractor(LanguageModels models) {
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {

@@ -66,6 +61,22 @@ public class SentenceExtractor {
final String text = asText(doc);
final DocumentSentence[] textSentences = extractSentencesFromString(text);

String title = getTitle(doc, textSentences);

TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts);
}

public DocumentLanguageData extractSentences(String text, String title) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);

TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);

return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
}

private String getTitle(Document doc, DocumentSentence[] textSentences) {
String title = doc.getElementsByTag("title").text() + " . " +
Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse("");

@@ -82,34 +93,7 @@ public class SentenceExtractor {
}
}

TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts);
}

public DocumentLanguageData extractSentences(String text) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);

String title = "";
for (DocumentSentence textSentence : textSentences) {
if (textSentence.length() > 0) {
title = textSentence.originalSentence.toLowerCase();
break;
}
}

TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);

return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
}

public DocumentLanguageData extractSentences(String text, String title) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);

TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);

return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
return title;
}

@@ -125,79 +109,95 @@ public class SentenceExtractor {
return counts;
}

private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");

// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");

private boolean isBadChar(char c) {
if (c >= 'a' && c <= 'z') return false;
if (c >= 'A' && c <= 'Z') return false;
if (c >= '0' && c <= '9') return false;
if ("_#@.".indexOf(c) >= 0) return false;
if (c >= '\u00C0' && c <= '\u00D6') return false;
if (c >= '\u00D8' && c <= '\u00F6') return false;
if (c >= '\u00F8' && c <= '\u00FF') return false;

return true;
}
private String sanitizeString(String s) {
char[] newChars = new char[s.length()];
int pi = 0;

for (int i = 0; i < newChars.length; i++) {
char c = s.charAt(i);
if (!isBadChar(c)) {
newChars[pi++] = c;
}
else {
newChars[pi++] = ' ';
}
}

s = new String(newChars, 0, pi);

if (s.startsWith(".")) {
s = s.substring(1);
if (s.isBlank())
return "";
}
return s;

}

public DocumentSentence extractSentence(String text) {
var wordsAndSeps = splitSegment(text);
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text);

var words = wordsAndSeps.words;
var seps = wordsAndSeps.separators;
var lc = toLc(wordsAndSeps.words);
var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);

return new DocumentSentence(
sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
);
}

public String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}

public DocumentSentence[] extractSentencesFromString(String text) {
String[] sentences;

String textNormalizedSpaces = normalizeSpaces(text);
String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
try {
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
}
catch (Exception ex) {
// shitty fallback logic
sentences = StringUtils.split(textNormalizedSpaces, '.');
}

sentences = preCleanSentences(sentences);

final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
final String[][] posTags = new String[sentences.length][];
final String[][] tokensLc = new String[sentences.length][];
final String[][] stemmedWords = new String[sentences.length][];

for (int i = 0; i < tokens.length; i++) {

var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]);
tokens[i] = wordsAndSeps.words;
separators[i] = wordsAndSeps.separators;
if (tokens[i].length > 250) {
tokens[i] = Arrays.copyOf(tokens[i], 250);
separators[i] = Arrays.copyOf(separators[i], 250);
}
for (int j = 0; j < tokens[i].length; j++) {
while (tokens[i][j].endsWith(".")) {
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
}
}
}

var sPool = stringPool.get();

for (int i = 0; i < tokens.length; i++) {
tokens[i] = sPool.internalize(tokens[i]);
}

for (int i = 0; i < tokens.length; i++) {
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
// don't need to internalize this
}

for (int i = 0; i < tokens.length; i++) {
tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]);
tokensLc[i] = sPool.internalize(tokensLc[i]);
}

for (int i = 0; i < tokens.length; i++) {
stemmedWords[i] = stemSentence(tokensLc[i]);
stemmedWords[i] = sPool.internalize(stemmedWords[i]);
}

DocumentSentence[] ret = new DocumentSentence[sentences.length];
for (int i = 0; i < ret.length; i++) {
String fullString;

if (i == 0) {
fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]);
}
else {
fullString = "";
}

ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
}
return ret;
}

private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");

private String[] preCleanSentences(String[] sentences) {

if (sentences.length > 250) {
sentences = Arrays.copyOf(sentences, 250);
}

@@ -212,53 +212,13 @@ public class SentenceExtractor {
sentenceList.add(s);
}
}
sentences = sentenceList.toArray(String[]::new);

final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
final String[][] posTags = new String[sentences.length][];
final String[][] tokensLc = new String[sentences.length][];
final String[][] stemmedWords = new String[sentences.length][];

for (int i = 0; i < tokens.length; i++) {

var wordsAndSeps = splitSegment(sentences[i]); //tokenizer.tokenize(sentences[i]);
tokens[i] = wordsAndSeps.words;
separators[i] = wordsAndSeps.separators;
if (tokens[i].length > 250) {
tokens[i] = Arrays.copyOf(tokens[i], 250);
separators[i] = Arrays.copyOf(separators[i], 250);
}
for (int j = 0; j < tokens[i].length; j++) {
while (tokens[i][j].endsWith(".")) {
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
}
}
}

for (int i = 0; i < tokens.length; i++) {
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
}

for (int i = 0; i < tokens.length; i++) {
tokensLc[i] = toLc(tokens[i]);
}

for (int i = 0; i < tokens.length; i++) {
stemmedWords[i] = stemSentence(tokensLc[i]);
}

DocumentSentence[] ret = new DocumentSentence[sentences.length];
for (int i = 0; i < ret.length; i++) {
ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
}
return ret;
return sentenceList.toArray(String[]::new);
}

private String[] stemSentence(String[] strings) {
String[] stemmed = new String[strings.length];
for (int i = 0; i < stemmed.length; i++) {
var sent = cleanPossessive(strings[i]);
var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]);
try {
stemmed[i] = porterStemmer.stem(sent);
}

@@ -269,27 +229,6 @@ public class SentenceExtractor {
return stemmed;
}

private String cleanPossessive(String s) {
int end = s.length();

if (s.endsWith("\'")) {
return s.substring(0, end-1);
} else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
return s.substring(0, end-2).toLowerCase();
}
else {
return s;
}
}

private String[] toLc(String[] words) {
String[] lower = new String[words.length];
for (int i = 0; i < lower.length; i++) {
lower[i] = cleanPossessive(words[i]).toLowerCase();
}
return lower;
}

public String asText(Document dc) {

tagCleaner.clean(dc);

@@ -299,67 +238,6 @@ public class SentenceExtractor {
return text.substring(0, (int) (text.length()*0.95));
}

@AllArgsConstructor @Getter
private static class WordsAndSeparators {
String[] words;
int[] separators;
}

private WordsAndSeparators splitSegment(String segment) {
var matcher = wordBreakPattern.matcher(segment);

List<String> words = new ArrayList<>(segment.length()/6);
TIntArrayList separators = new TIntArrayList(segment.length()/6);

int start = 0;
int wordStart = 0;
while (wordStart <= segment.length()) {
if (!matcher.find(wordStart)) {
words.add(segment.substring(wordStart));
separators.add(WordSeparator.SPACE);
break;
}

if (wordStart != matcher.start()) {
words.add(segment.substring(wordStart, matcher.start()));
separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
}
wordStart = matcher.end();
}

String[] parts = words.toArray(String[]::new);
int length = 0;
for (int i = 0; i < parts.length; i++) {
if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
parts[i] = null;
}
else {
length++;
}
}

String[] ret = new String[length];
int[] seps = new int[length];
for (int i = 0, j=0; i < parts.length; i++) {
if (parts[i] != null) {
seps[j] = separators.getQuick(i);
ret[j++] = parts[i];
}
}

for (int i = 0; i < ret.length; i++) {
if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
}
return new WordsAndSeparators(ret, seps);
}

public boolean isLegacyMode() {
return legacyMode;
}
public void setLegacyMode(boolean legacyMode) {
this.legacyMode = legacyMode;
}

}

@@ -0,0 +1,93 @@
package nu.marginalia.util.language.processing.sentence;

import java.util.Arrays;
import java.util.Objects;

public class SentenceExtractorStringUtils {

public static String sanitizeString(String s) {
char[] newChars = new char[s.length()];
int pi = 0;
boolean changed = false;
for (int i = 0; i < newChars.length; i++) {
char c = s.charAt(i);
if (!isBadChar(c)) {
newChars[pi++] = c;
}
else {
changed = true;
newChars[pi++] = ' ';
}
}

if (changed) {
s = new String(newChars, 0, pi);
}

if (s.startsWith(".")) {
s = s.substring(1);
}

if (s.isBlank()) {
return "";
}

return s;

}

private static boolean isBadChar(char c) {
if (c >= 'a' && c <= 'z') return false;
if (c >= 'A' && c <= 'Z') return false;
if (c >= '0' && c <= '9') return false;
if ("_#@.".indexOf(c) >= 0) return false;
if (c >= '\u00C0' && c <= '\u00D6') return false;
if (c >= '\u00D8' && c <= '\u00F6') return false;
if (c >= '\u00F8' && c <= '\u00FF') return false;

return true;
}

public static String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}

public static String toLowerCaseStripPossessive(String word) {
String val = stripPossessive(word).toLowerCase();

if (Objects.equals(val, word)) {
return word;
}

return val;
}

public static String[] toLowerCaseStripPossessive(String[] words) {
String[] lc = new String[words.length];
Arrays.setAll(lc, i ->SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i]));
return lc;
}

public static String stripPossessive(String s) {
int end = s.length();

if (s.endsWith("'")) {
return s.substring(0, end-1);
}

if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}

return s;
}

}

@@ -0,0 +1,72 @@
package nu.marginalia.util.language.processing.sentence;

import gnu.trove.list.array.TIntArrayList;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;

import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.util.language.WordPatterns.*;

public class SentenceSegmentSplitter {

@AllArgsConstructor
@Getter
public static class SeparatedSentence {
String[] words;
int[] separators;
}

public static SeparatedSentence splitSegment(String segment) {
var matcher = wordBreakPattern.matcher(segment);

List<String> words = new ArrayList<>(segment.length()/6);
TIntArrayList separators = new TIntArrayList(segment.length()/6);

int wordStart = 0;
while (wordStart <= segment.length()) {
if (!matcher.find(wordStart)) {
words.add(segment.substring(wordStart));
separators.add(WordSeparator.SPACE);
break;
}

if (wordStart != matcher.start()) {
words.add(segment.substring(wordStart, matcher.start()));
separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
}
wordStart = matcher.end();
}

String[] parts = words.toArray(String[]::new);
int length = 0;
for (int i = 0; i < parts.length; i++) {
if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
parts[i] = null;
}
else {
length++;
}
}

String[] ret = new String[length];
int[] seps = new int[length];
for (int i = 0, j=0; i < parts.length; i++) {
if (parts[i] != null) {
seps[j] = separators.getQuick(i);
ret[j++] = parts[i];
}
}

for (int i = 0; i < ret.length; i++) {
if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
}
return new SeparatedSentence(ret, seps);
}

}

@@ -2,9 +2,10 @@ package nu.marginalia.wmsa.edge.assistant.dict;

import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;

@@ -18,11 +19,10 @@ import javax.annotation.Nullable;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

@@ -101,12 +101,15 @@ public class TermFrequencyDict {

fjp.execute(() -> {

TLongHashSet words = new TLongHashSet(10_000);

for (var doc : domain.doc) {

if (doc.documentBody == null)
continue;
docCount.incrementAndGet();

Document parsed = Jsoup.parse(doc.documentBody);
Document parsed = Jsoup.parse(doc.documentBody.decode());
parsed.body().filter(new DomPruningFilter(0.5));

DocumentLanguageData dld = se.get().extractSentences(parsed);

@@ -115,28 +118,30 @@ public class TermFrequencyDict {
return;
}

Set<String> words = new HashSet<>(10_000);

for (var sent : dld.sentences) {
for (var word : sent) {
words.add(word.stemmed());
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
}
}

fjp.execute(() -> {
synchronized (counts) {
for (var word : words) {
counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
}
}
});
synchronized (counts) {
words.forEach(w -> {
counts.adjustOrPutValue(w, 1, 1);
return true;
});
}

words.clear();
}

System.out.println(domain.domain + "\t" + counts.size());
});

}

fjp.shutdown();
fjp.awaitTermination(10, TimeUnit.SECONDS);
fjp.awaitTermination(10, TimeUnit.DAYS);

try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
synchronized (counts) {

@@ -155,14 +160,6 @@ public class TermFrequencyDict {
}

System.out.println(docCount.get());
//
// counts.forEachEntry((w,c) -> {
// if (c > 3L) {
// System.out.println(w + ":" + c);
// }
// return true;
// });

}

public static long getStringHash(String s) {

@@ -7,7 +7,7 @@ import nu.marginalia.util.gregex.GuardedRegex;
import nu.marginalia.util.gregex.GuardedRegexFactory;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;

@@ -178,11 +178,13 @@ public class DocumentProcessor {
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
throws DisqualifiedException, URISyntaxException {

if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) {
String documentBody = crawledDocument.documentBody.decode();

if (languageFilter.isBlockedUnicodeRange(documentBody)) {
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
}

Document doc = Jsoup.parse(crawledDocument.documentBody);
Document doc = Jsoup.parse(documentBody);

if (AcceptableAds.hasAcceptableAdsTag(doc)) {
// I've never encountered a website where this hasn't been a severe indicator

@@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow;

import com.google.inject.Inject;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;

@@ -1,7 +1,7 @@
package nu.marginalia.wmsa.edge.integration.wikipedia;

import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;

@@ -8,7 +8,7 @@ import lombok.Getter;
import lombok.ToString;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;

@@ -25,12 +25,12 @@ public class QueryVariants {

private final Logger logger = LoggerFactory.getLogger(getClass());
private final KeywordExtractor keywordExtractor;
private final SentenceExtractor sentenceExtractor;
private final TermFrequencyDict dict;
private final PorterStemmer ps = new PorterStemmer();

private final NGramBloomFilter nGramBloomFilter;
private final EnglishDictionary englishDictionary;
private final ThreadLocal<SentenceExtractor> sentenceExtractor;

@Inject
public QueryVariants(LanguageModels lm,

@@ -40,7 +40,7 @@ public class QueryVariants {
this.nGramBloomFilter = nGramBloomFilter;
this.englishDictionary = englishDictionary;
this.keywordExtractor = new KeywordExtractor();
this.sentenceExtractor = new SentenceExtractor(lm);
this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm));
this.dict = dict;
}

@@ -78,10 +78,8 @@ public class QueryVariants {

final TreeMap<Integer, List<WordSpan>> byStart = new TreeMap<>();

logger.debug("Q: {}", query);
logger.debug("QAS: {}", joinedQuery);

var sentence = sentenceExtractor.extractSentence(joinedQuery.joinedQuery);
var se = sentenceExtractor.get();
var sentence = se.extractSentence(joinedQuery.joinedQuery);

for (int i = 0; i < sentence.posTags.length; i++) {
if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) {

@@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.ConverterModule;
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;

@@ -63,7 +63,7 @@ public class ConverterLogicTestTool {
if (doc.documentBody == null) continue;

Runnable task = () -> {
var parsed = Jsoup.parse(doc.documentBody);
var parsed = Jsoup.parse(doc.documentBody.decode());

parsed.body().filter(new DomPruningFilter(0.5));
var dld = se.extractSentences(parsed);

@@ -6,15 +6,18 @@ import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.apache.commons.lang3.tuple.Pair;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;

@@ -26,6 +29,7 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.IntStream;

@Tag("slow")
class SentenceExtractorTest {

@@ -38,7 +42,6 @@ class SentenceExtractorTest {

newSe = new SentenceExtractor(lm);
legacySe = new SentenceExtractor(lm);
legacySe.setLegacyMode(true);
}

@@ -83,7 +86,7 @@ class SentenceExtractorTest {
var dld = se.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
Map<String, Integer> counts = new HashMap<>();
for (var sentence : dld.sentences) {
for (WordSpan kw : keywordExtractor.getNames(sentence)) {
for (WordSpan kw : keywordExtractor.getProperNames(sentence)) {
if (kw.end + 2 >= sentence.length()) {
continue;
}

@@ -145,7 +148,22 @@ class SentenceExtractorTest {
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata());
System.out.println(newRes);

var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
.sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
.limit(100)
.map(Pair::getKey)
.toArray(String[]::new);
System.out.println(Arrays.toString(terms));

var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
.sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
.filter(e -> e.getValue().hasFlag(EdgePageWordFlags.Subjects))
.limit(100)
.map(Pair::getKey)
.toArray(String[]::new);
System.out.println(Arrays.toString(terms2));
System.out.println("--");
}
System.out.println(System.currentTimeMillis() - st);

@@ -4,7 +4,7 @@ import nu.marginalia.util.ParallelPipe;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;

@@ -6,7 +6,7 @@ import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.DocumentDebugger;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;

@@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.search.query;

import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll;