diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java
index fb081c95..a693dcdc 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java
@@ -4,7 +4,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.KeywordCounter;
 import nu.marginalia.util.language.processing.KeywordExtractor;
 import nu.marginalia.util.language.processing.NameCounter;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
@@ -68,9 +68,6 @@ public class DocumentDebugger {
         Set<String> reps = new HashSet<>();
 
-//        kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
-//        kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
-
         try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
 
             for (var sent : languageData.titleSentences) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java
index 7d829fd6..89b95fd0 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/model/DocumentLanguageData.java
@@ -2,12 +2,13 @@ package nu.marginalia.util.language.processing.model;
 
 import gnu.trove.map.hash.TObjectIntHashMap;
 import lombok.AllArgsConstructor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 
 import java.util.Arrays;
 import java.util.stream.Stream;
 
 /**
- * @see nu.marginalia.util.language.processing.SentenceExtractor
+ * @see SentenceExtractor
  */
 @AllArgsConstructor
 public class DocumentLanguageData {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java
similarity index 54%
rename from marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java
rename to marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java
index 08886928..2957eaa9 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SentenceExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractor.java
@@ -1,16 +1,14 @@
-package nu.marginalia.util.language.processing;
+package nu.marginalia.util.language.processing.sentence;
 
 import com.github.datquocnguyen.RDRPOSTagger;
 import com.github.jknack.handlebars.internal.lang3.StringUtils;
-import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.map.hash.TObjectIntHashMap;
-import lombok.AllArgsConstructor;
-import lombok.Getter;
 import lombok.SneakyThrows;
+import nu.marginalia.util.StringPool;
 import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.util.language.processing.HtmlTagCleaner;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
-import nu.marginalia.util.language.processing.model.tag.WordSeparator;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.stemmer.PorterStemmer;
@@ -24,25 +22,22 @@ import javax.inject.Inject;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Optional;
+import java.util.*;
 import java.util.regex.Pattern;
 
-import static nu.marginalia.util.language.WordPatterns.*;
-
 public class SentenceExtractor {
 
     private SentenceDetectorME sentenceDetector;
 
     private final RDRPOSTagger rdrposTagger;
     private final PorterStemmer porterStemmer = new PorterStemmer();
-    private boolean legacyMode = false;
     private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
 
     private static final HtmlTagCleaner tagCleaner = new HtmlTagCleaner();
 
+    private final ThreadLocal<StringPool> stringPool = ThreadLocal.withInitial(() -> StringPool.create(10_000));
+
+    @SneakyThrows
     @Inject
     public SentenceExtractor(LanguageModels models) {
         try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
@@ -66,6 +61,22 @@ public class SentenceExtractor {
         final String text = asText(doc);
         final DocumentSentence[] textSentences = extractSentencesFromString(text);
 
+        String title = getTitle(doc, textSentences);
+
+        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
+        var titleSentences = extractSentencesFromString(title.toLowerCase());
+        return new DocumentLanguageData(textSentences, titleSentences, counts);
+    }
+
+    public DocumentLanguageData extractSentences(String text, String title) {
+        final DocumentSentence[] textSentences = extractSentencesFromString(text);
+
+        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
+
+        return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
+    }
+
+    private String getTitle(Document doc, DocumentSentence[] textSentences) {
         String title = doc.getElementsByTag("title").text() + " . " +
                 Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse("");
 
@@ -82,34 +93,7 @@ public class SentenceExtractor {
             }
         }
 
-        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
-        var titleSentences = extractSentencesFromString(title.toLowerCase());
-        return new DocumentLanguageData(textSentences, titleSentences, counts);
-    }
-
-    public DocumentLanguageData extractSentences(String text) {
-        final DocumentSentence[] textSentences = extractSentencesFromString(text);
-
-        String title = "";
-        for (DocumentSentence textSentence : textSentences) {
-            if (textSentence.length() > 0) {
-                title = textSentence.originalSentence.toLowerCase();
-                break;
-            }
-        }
-
-        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
-
-        return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
-    }
-
-
-    public DocumentLanguageData extractSentences(String text, String title) {
-        final DocumentSentence[] textSentences = extractSentencesFromString(text);
-
-        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
-
-        return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
+        return title;
     }
 
@@ -125,79 +109,95 @@ public class SentenceExtractor {
         return counts;
     }
 
-    private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
-
-//    private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
-
-    private boolean isBadChar(char c) {
-        if (c >= 'a' && c <= 'z') return false;
-        if (c >= 'A' && c <= 'Z') return false;
-        if (c >= '0' && c <= '9') return false;
-        if ("_#@.".indexOf(c) >= 0) return false;
-        if (c >= '\u00C0' && c <= '\u00D6') return false;
-        if (c >= '\u00D8' && c <= '\u00F6') return false;
-        if (c >= '\u00F8' && c <= '\u00FF') return false;
-
-        return true;
-    }
-    private String sanitizeString(String s) {
-        char[] newChars = new char[s.length()];
-        int pi = 0;
-
-        for (int i = 0; i < newChars.length; i++) {
-            char c = s.charAt(i);
-            if (!isBadChar(c)) {
-                newChars[pi++] = c;
-            }
-            else {
-                newChars[pi++] = ' ';
-            }
-        }
-
-        s = new String(newChars, 0, pi);
-
-        if (s.startsWith(".")) {
-            s = s.substring(1);
-            if (s.isBlank())
-                return "";
-        }
-
-        return s;
-
-    }
 
     public DocumentSentence extractSentence(String text) {
-        var wordsAndSeps = splitSegment(text);
+        var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text);
 
         var words = wordsAndSeps.words;
         var seps = wordsAndSeps.separators;
-        var lc = toLc(wordsAndSeps.words);
+        var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
 
         return new DocumentSentence(
-                sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
+                SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
         );
     }
 
-    public String normalizeSpaces(String s) {
-        if (s.indexOf('\t') >= 0) {
-            s = s.replace('\t', ' ');
-        }
-        if (s.indexOf('\n') >= 0) {
-            s = s.replace('\n', ' ');
-        }
-        return s;
-    }
-
     public DocumentSentence[] extractSentencesFromString(String text) {
         String[] sentences;
 
-        String textNormalizedSpaces = normalizeSpaces(text);
+        String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
+
         try {
             sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
         }
         catch (Exception ex) {
+            // Fall back to naively splitting on periods if the sentence detector fails
            sentences = StringUtils.split(textNormalizedSpaces, '.');
         }
 
+        sentences = preCleanSentences(sentences);
+
+        final String[][] tokens = new String[sentences.length][];
+        final int[][] separators = new int[sentences.length][];
+        final String[][] posTags = new String[sentences.length][];
+        final String[][] tokensLc = new String[sentences.length][];
+        final String[][] stemmedWords = new String[sentences.length][];
+
+        for (int i = 0; i < tokens.length; i++) {
+
+            var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]);
+            tokens[i] = wordsAndSeps.words;
+            separators[i] = wordsAndSeps.separators;
+            if (tokens[i].length > 250) {
+                tokens[i] = Arrays.copyOf(tokens[i], 250);
+                separators[i] = Arrays.copyOf(separators[i], 250);
+            }
+            for (int j = 0; j < tokens[i].length; j++) {
+                while (tokens[i][j].endsWith(".")) {
+                    tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
+                }
+            }
+        }
+
+        var sPool = stringPool.get();
+
+        for (int i = 0; i < tokens.length; i++) {
+            tokens[i] = sPool.internalize(tokens[i]);
+        }
+
+        for (int i = 0; i < tokens.length; i++) {
+            posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
+            // don't need to internalize this
+        }
+
+        for (int i = 0; i < tokens.length; i++) {
+            tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]);
+            tokensLc[i] = sPool.internalize(tokensLc[i]);
+        }
+
+        for (int i = 0; i < tokens.length; i++) {
+            stemmedWords[i] = stemSentence(tokensLc[i]);
+            stemmedWords[i] = sPool.internalize(stemmedWords[i]);
+        }
+
+        DocumentSentence[] ret = new DocumentSentence[sentences.length];
+        for (int i = 0; i < ret.length; i++) {
+            String fullString;
+
+            if (i == 0) {
+                fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]);
+            }
+            else {
+                fullString = "";
+            }
+
+            ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
+        }
+        return ret;
+    }
+
+    private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
+
+    private String[] preCleanSentences(String[] sentences) {
+
         if (sentences.length > 250) {
             sentences = Arrays.copyOf(sentences, 250);
         }
@@ -212,53 +212,13 @@ public class SentenceExtractor {
                 sentenceList.add(s);
             }
         }
-        sentences = sentenceList.toArray(String[]::new);
-
-        final String[][] tokens = new String[sentences.length][];
-        final int[][] separators = new int[sentences.length][];
-        final String[][] posTags = new String[sentences.length][];
-        final String[][] tokensLc = new String[sentences.length][];
-        final String[][] stemmedWords = new String[sentences.length][];
-
-        for (int i = 0; i < tokens.length; i++) {
-
-            var wordsAndSeps = splitSegment(sentences[i]); //tokenizer.tokenize(sentences[i]);
-            tokens[i] = wordsAndSeps.words;
-            separators[i] = wordsAndSeps.separators;
-            if (tokens[i].length > 250) {
-                tokens[i] = Arrays.copyOf(tokens[i], 250);
-                separators[i] = Arrays.copyOf(separators[i], 250);
-            }
-            for (int j = 0; j < tokens[i].length; j++) {
-                while (tokens[i][j].endsWith(".")) {
-                    tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
-                }
-            }
-        }
-
-        for (int i = 0; i < tokens.length; i++) {
-            posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
-        }
-
-        for (int i = 0; i < tokens.length; i++) {
-            tokensLc[i] = toLc(tokens[i]);
-        }
-
-        for (int i = 0; i < tokens.length; i++) {
-            stemmedWords[i] = stemSentence(tokensLc[i]);
-        }
-
-        DocumentSentence[] ret = new DocumentSentence[sentences.length];
-        for (int i = 0; i < ret.length; i++) {
-            ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
-        }
-        return ret;
+        return sentenceList.toArray(String[]::new);
     }
 
     private String[] stemSentence(String[] strings) {
         String[] stemmed = new String[strings.length];
         for (int i = 0; i < stemmed.length; i++) {
-            var sent = cleanPossessive(strings[i]);
+            var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]);
             try {
                 stemmed[i] = porterStemmer.stem(sent);
             }
@@ -269,27 +229,6 @@ public class SentenceExtractor {
         return stemmed;
     }
 
-    private String cleanPossessive(String s) {
-        int end = s.length();
-
-        if (s.endsWith("\'")) {
-            return s.substring(0, end-1);
-        } else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
-            return s.substring(0, end-2).toLowerCase();
-        }
-        else {
-            return s;
-        }
-    }
-
-    private String[] toLc(String[] words) {
-        String[] lower = new String[words.length];
-        for (int i = 0; i < lower.length; i++) {
-            lower[i] = cleanPossessive(words[i]).toLowerCase();
-        }
-        return lower;
-    }
-
     public String asText(Document dc) {
         tagCleaner.clean(dc);
 
@@ -299,67 +238,6 @@ public class SentenceExtractor {
         return text.substring(0, (int) (text.length()*0.95));
     }
 
-    @AllArgsConstructor @Getter
-    private static class WordsAndSeparators {
-        String[] words;
-        int[] separators;
-    }
-
-    private WordsAndSeparators splitSegment(String segment) {
-        var matcher = wordBreakPattern.matcher(segment);
-
-        List<String> words = new ArrayList<>(segment.length()/6);
-        TIntArrayList separators = new TIntArrayList(segment.length()/6);
-
-        int start = 0;
-        int wordStart = 0;
-        while (wordStart <= segment.length()) {
-            if (!matcher.find(wordStart)) {
-                words.add(segment.substring(wordStart));
-                separators.add(WordSeparator.SPACE);
-                break;
-            }
-
-            if (wordStart != matcher.start()) {
-                words.add(segment.substring(wordStart, matcher.start()));
-                separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
-            }
-            wordStart = matcher.end();
-        }
-
-        String[] parts = words.toArray(String[]::new);
-        int length = 0;
-        for (int i = 0; i < parts.length; i++) {
-            if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
-                parts[i] = null;
-            }
-            else {
-                length++;
-            }
-        }
-
-        String[] ret = new String[length];
-        int[] seps = new int[length];
-        for (int i = 0, j=0; i < parts.length; i++) {
-            if (parts[i] != null) {
-                seps[j] = separators.getQuick(i);
-                ret[j++] = parts[i];
-            }
-        }
-
-        for (int i = 0; i < ret.length; i++) {
-            if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
-            if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
-        }
-        return new WordsAndSeparators(ret, seps);
-    }
-
-
-    public boolean isLegacyMode() {
-        return legacyMode;
-    }
-    public void setLegacyMode(boolean legacyMode) {
-        this.legacyMode = legacyMode;
-    }
 }
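A note on the extractSentencesFromString() rewrite above: the method now tokenizes all sentences first, then interns the token, lower-cased and stemmed arrays through a per-thread StringPool, so repeated words in a document collapse to a single String instance before POS tagging and downstream processing. StringPool itself is not shown in this diff, so the sketch below uses a hypothetical MiniStringPool to illustrate the interning pattern under that assumption:

```java
import java.util.HashMap;

// Hypothetical stand-in for nu.marginalia.util.StringPool, which this
// diff uses but does not show: a bounded intern table, one per thread.
class MiniStringPool {
    private final HashMap<String, String> pool;
    private final int maxSize;

    MiniStringPool(int maxSize) {
        this.pool = new HashMap<>(2 * maxSize);
        this.maxSize = maxSize;
    }

    // Returns a canonical instance, so equal tokens share one String object.
    String internalize(String s) {
        if (pool.size() >= maxSize) pool.clear(); // crude eviction; the real pool may differ
        String prev = pool.putIfAbsent(s, s);
        return prev != null ? prev : s;
    }

    String[] internalize(String[] ss) {
        for (int i = 0; i < ss.length; i++) ss[i] = internalize(ss[i]);
        return ss;
    }
}

public class MiniStringPoolDemo {
    // One pool per thread, mirroring SentenceExtractor's stringPool field.
    private static final ThreadLocal<MiniStringPool> POOL =
            ThreadLocal.withInitial(() -> new MiniStringPool(10_000));

    public static void main(String[] args) {
        var pool = POOL.get();
        String a = pool.internalize(new String("token"));
        String b = pool.internalize(new String("token"));
        System.out.println(a == b); // true: second call returns the pooled instance
    }
}
```

Keeping the pool in a ThreadLocal avoids synchronizing the intern table while still letting a worker thread reuse it across documents.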
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java
new file mode 100644
index 00000000..08a1605c
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceExtractorStringUtils.java
@@ -0,0 +1,93 @@
+package nu.marginalia.util.language.processing.sentence;
+
+import java.util.Arrays;
+import java.util.Objects;
+
+public class SentenceExtractorStringUtils {
+
+    public static String sanitizeString(String s) {
+        char[] newChars = new char[s.length()];
+        int pi = 0;
+        boolean changed = false;
+        for (int i = 0; i < newChars.length; i++) {
+            char c = s.charAt(i);
+            if (!isBadChar(c)) {
+                newChars[pi++] = c;
+            }
+            else {
+                changed = true;
+                newChars[pi++] = ' ';
+            }
+        }
+
+        if (changed) {
+            s = new String(newChars, 0, pi);
+        }
+
+        if (s.startsWith(".")) {
+            s = s.substring(1);
+        }
+
+        if (s.isBlank()) {
+            return "";
+        }
+
+        return s;
+
+    }
+
+    private static boolean isBadChar(char c) {
+        if (c >= 'a' && c <= 'z') return false;
+        if (c >= 'A' && c <= 'Z') return false;
+        if (c >= '0' && c <= '9') return false;
+        if ("_#@.".indexOf(c) >= 0) return false;
+        if (c >= '\u00C0' && c <= '\u00D6') return false;
+        if (c >= '\u00D8' && c <= '\u00F6') return false;
+        if (c >= '\u00F8' && c <= '\u00FF') return false;
+
+        return true;
+    }
+
+    public static String normalizeSpaces(String s) {
+        if (s.indexOf('\t') >= 0) {
+            s = s.replace('\t', ' ');
+        }
+        if (s.indexOf('\n') >= 0) {
+            s = s.replace('\n', ' ');
+        }
+        return s;
+    }
+
+
+    public static String toLowerCaseStripPossessive(String word) {
+        String val = stripPossessive(word).toLowerCase();
+
+        if (Objects.equals(val, word)) {
+            return word;
+        }
+
+        return val;
+    }
+
+    public static String[] toLowerCaseStripPossessive(String[] words) {
+        String[] lc = new String[words.length];
+        Arrays.setAll(lc, i -> SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i]));
+        return lc;
+    }
+
+    public static String stripPossessive(String s) {
+        int end = s.length();
+
+        if (s.endsWith("'")) {
+            return s.substring(0, end-1);
+        }
+
+        if (s.endsWith("'s") || s.endsWith("'S")) {
+            return s.substring(0, end-2);
+        }
+
+        return s;
+    }
+
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java
new file mode 100644
index 00000000..6a4516cf
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/sentence/SentenceSegmentSplitter.java
@@ -0,0 +1,72 @@
+package nu.marginalia.util.language.processing.sentence;
+
+import gnu.trove.list.array.TIntArrayList;
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import nu.marginalia.util.language.processing.model.tag.WordSeparator;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static nu.marginalia.util.language.WordPatterns.*;
+
+public class SentenceSegmentSplitter {
+
+
+    @AllArgsConstructor
+    @Getter
+    public static class SeparatedSentence {
+        String[] words;
+        int[] separators;
+    }
+
+    public static SeparatedSentence splitSegment(String segment) {
+        var matcher = wordBreakPattern.matcher(segment);
+
+        List<String> words = new ArrayList<>(segment.length()/6);
+        TIntArrayList separators = new TIntArrayList(segment.length()/6);
+
+        int wordStart = 0;
+        while (wordStart <= segment.length()) {
+            if (!matcher.find(wordStart)) {
+                words.add(segment.substring(wordStart));
+                separators.add(WordSeparator.SPACE);
+                break;
+            }
+
+            if (wordStart != matcher.start()) {
+                words.add(segment.substring(wordStart, matcher.start()));
+                separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
+            }
+            wordStart = matcher.end();
+        }
+
+        String[] parts = words.toArray(String[]::new);
+        int length = 0;
+        for (int i = 0; i < parts.length; i++) {
+            if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
+                parts[i] = null;
+            }
+            else {
+                length++;
+            }
+        }
+
+        String[] ret = new String[length];
+        int[] seps = new int[length];
+        for (int i = 0, j=0; i < parts.length; i++) {
+            if (parts[i] != null) {
+                seps[j] = separators.getQuick(i);
+                ret[j++] = parts[i];
+            }
+        }
+
+        for (int i = 0; i < ret.length; i++) {
+            if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
+            if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
+        }
+        return new SeparatedSentence(ret, seps);
+    }
+
+
+}
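SentenceSegmentSplitter is the old private splitSegment() lifted out of SentenceExtractor essentially verbatim: it walks wordBreakPattern matches, records WordSeparator.SPACE or WordSeparator.COMMA for each gap, filters blank, overlong, and noisy tokens, and trims surrounding apostrophes. A usage sketch, assuming the classes from this diff are on the classpath (wordBreakPattern, MAX_WORD_LENGTH and characterNoisePredicate come from WordPatterns, and the getters are generated by Lombok's @Getter):

```java
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.util.language.processing.sentence.SentenceSegmentSplitter;

public class SplitSegmentDemo {
    public static void main(String[] args) {
        var sep = SentenceSegmentSplitter.splitSegment("red green, blue");

        String[] words = sep.getWords();
        int[] separators = sep.getSeparators();

        // Each word is paired with the separator class recorded for it.
        for (int i = 0; i < words.length; i++) {
            String kind = (separators[i] == WordSeparator.COMMA) ? "COMMA" : "SPACE";
            System.out.println(words[i] + " -> " + kind);
        }
    }
}
```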
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java
index 2a29e2a4..4d87ec96 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/assistant/dict/TermFrequencyDict.java
@@ -2,9 +2,10 @@ package nu.marginalia.wmsa.edge.assistant.dict;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import gnu.trove.map.hash.TLongIntHashMap;
+import gnu.trove.set.hash.TLongHashSet;
 import nu.marginalia.util.language.LanguageFilter;
 import nu.marginalia.util.language.conf.LanguageModels;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
@@ -18,11 +19,10 @@ import javax.annotation.Nullable;
 import javax.inject.Inject;
 import javax.inject.Singleton;
 import java.io.*;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
 import java.util.concurrent.ForkJoinPool;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -101,12 +101,15 @@ public class TermFrequencyDict {
 
             fjp.execute(() -> {
 
+                TLongHashSet words = new TLongHashSet(10_000);
+
                 for (var doc : domain.doc) {
+
                     if (doc.documentBody == null) continue;
 
                     docCount.incrementAndGet();
 
-                    Document parsed = Jsoup.parse(doc.documentBody);
+                    Document parsed = Jsoup.parse(doc.documentBody.decode());
                     parsed.body().filter(new DomPruningFilter(0.5));
 
                     DocumentLanguageData dld = se.get().extractSentences(parsed);
@@ -115,28 +118,30 @@ public class TermFrequencyDict {
                         return;
                     }
 
-                    Set<String> words = new HashSet<>(10_000);
-
                     for (var sent : dld.sentences) {
                         for (var word : sent) {
-                            words.add(word.stemmed());
+                            words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
                        }
                    }
 
-                    fjp.execute(() -> {
-                        synchronized (counts) {
-                            for (var word : words) {
-                                counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
-                            }
-                        }
-                    });
+                    synchronized (counts) {
+                        words.forEach(w -> {
+                            counts.adjustOrPutValue(w, 1, 1);
+                            return true;
+                        });
+                    }
+
+                    words.clear();
                 }
+
+                System.out.println(domain.domain + "\t" + counts.size());
             });
+
+
         }
 
         fjp.shutdown();
-        fjp.awaitTermination(10, TimeUnit.SECONDS);
+        fjp.awaitTermination(10, TimeUnit.DAYS);
 
         try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
             synchronized (counts) {
@@ -155,14 +160,6 @@ public class TermFrequencyDict {
         }
 
         System.out.println(docCount.get());
-//
-//        counts.forEachEntry((w,c) -> {
-//            if (c > 3L) {
-//                System.out.println(w + ":" + c);
-//            }
-//            return true;
-//        });
-
     }
 
     public static long getStringHash(String s) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
index 339389fe..71ac3945 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
@@ -7,7 +7,7 @@ import nu.marginalia.util.gregex.GuardedRegex;
 import nu.marginalia.util.gregex.GuardedRegexFactory;
 import nu.marginalia.util.language.LanguageFilter;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
@@ -178,11 +178,13 @@ public class DocumentProcessor {
 
     private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
             throws DisqualifiedException, URISyntaxException {
-        if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) {
+        String documentBody = crawledDocument.documentBody.decode();
+
+        if (languageFilter.isBlockedUnicodeRange(documentBody)) {
             throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
         }
 
-        Document doc = Jsoup.parse(crawledDocument.documentBody);
+        Document doc = Jsoup.parse(documentBody);
 
         if (AcceptableAds.hasAcceptableAdsTag(doc)) {
             // I've never encountered a website where this hasn't been a severe indicator
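The TermFrequencyDict change above replaces the per-document Set<String> (plus a nested fjp.execute() merge task) with a TLongHashSet of word hashes: each stemmed word is hashed to a long as soon as it is seen, so the synchronized merge into the shared TLongIntHashMap works purely on primitives, with no per-word String retention or autoboxing. A condensed, self-contained sketch of that counting scheme; the 64-bit FNV-1a hash below is a stand-in, since longHash()'s implementation is not shown in this diff:

```java
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;

import java.nio.charset.StandardCharsets;

public class LongHashCountsDemo {
    // Stand-in for longHash(): 64-bit FNV-1a over UTF-8 bytes.
    static long longHash(byte[] bytes) {
        long hash = 0xcbf29ce484222325L;
        for (byte b : bytes) {
            hash ^= (b & 0xff);
            hash *= 0x100000001b3L;
        }
        return hash;
    }

    public static void main(String[] args) {
        TLongIntHashMap counts = new TLongIntHashMap();

        // Per-document set: each distinct term counts once per document.
        TLongHashSet words = new TLongHashSet(10_000);
        for (String stemmed : new String[] {"search", "engin", "search"}) {
            words.add(longHash(stemmed.getBytes(StandardCharsets.UTF_8)));
        }

        synchronized (counts) {
            words.forEach(w -> {
                counts.adjustOrPutValue(w, 1, 1); // +1 if present, else insert 1
                return true;                      // keep iterating
            });
        }
        words.clear();

        System.out.println(counts.size()); // 2 distinct terms
    }
}
```

Reusing one TLongHashSet per worker, with clear() between documents as the diff does, also avoids reallocating a 10 000-entry set for every document.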
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java
index 141a3904..3c8bda78 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java
@@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow;
 
 import com.google.inject.Inject;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java
index 5f4d206c..67c51751 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java
@@ -1,7 +1,7 @@
 package nu.marginalia.wmsa.edge.integration.wikipedia;
 
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java
index f3e75f91..6d42b599 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryVariants.java
@@ -8,7 +8,7 @@ import lombok.Getter;
 import lombok.ToString;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.KeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
@@ -25,12 +25,12 @@ public class QueryVariants {
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private final KeywordExtractor keywordExtractor;
-    private final SentenceExtractor sentenceExtractor;
     private final TermFrequencyDict dict;
     private final PorterStemmer ps = new PorterStemmer();
 
     private final NGramBloomFilter nGramBloomFilter;
     private final EnglishDictionary englishDictionary;
+    private final ThreadLocal<SentenceExtractor> sentenceExtractor;
 
     @Inject
     public QueryVariants(LanguageModels lm,
@@ -40,7 +40,7 @@ public class QueryVariants {
         this.nGramBloomFilter = nGramBloomFilter;
         this.englishDictionary = englishDictionary;
         this.keywordExtractor = new KeywordExtractor();
-        this.sentenceExtractor = new SentenceExtractor(lm);
+        this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm));
         this.dict = dict;
     }
 
@@ -78,10 +78,8 @@ public class QueryVariants {
 
         final TreeMap<Integer, List<WordSpan>> byStart = new TreeMap<>();
 
-        logger.debug("Q: {}", query);
-        logger.debug("QAS: {}", joinedQuery);
-
-        var sentence = sentenceExtractor.extractSentence(joinedQuery.joinedQuery);
+        var se = sentenceExtractor.get();
+        var sentence = se.extractSentence(joinedQuery.joinedQuery);
 
         for (int i = 0; i < sentence.posTags.length; i++) {
             if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java
index 5288eac1..78d90ccb 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ConverterLogicTestTool.java
@@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.tools;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.converting.ConverterModule;
 import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
@@ -63,7 +63,7 @@ public class ConverterLogicTestTool {
                 if (doc.documentBody == null) continue;
 
                 Runnable task = () -> {
-                    var parsed = Jsoup.parse(doc.documentBody);
+                    var parsed = Jsoup.parse(doc.documentBody.decode());
 
                     parsed.body().filter(new DomPruningFilter(0.5));
                     var dld = se.extractSentences(parsed);
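The QueryVariants change above is the standard remedy for sharing a non-thread-safe component: instead of one SentenceExtractor used from every query thread, each thread lazily constructs and keeps its own instance via ThreadLocal.withInitial(). A self-contained sketch of the pattern, with a hypothetical Parser standing in for SentenceExtractor:

```java
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ThreadLocalParserDemo {
    // Hypothetical stand-in for SentenceExtractor: reusable, but not thread-safe.
    static class Parser {
        private final StringBuilder scratch = new StringBuilder(); // shared mutable state

        String parse(String s) {
            scratch.setLength(0);
            scratch.append(s.trim().toLowerCase());
            return scratch.toString();
        }
    }

    // One Parser per thread, as QueryVariants now does for SentenceExtractor.
    private static final ThreadLocal<Parser> PARSER = ThreadLocal.withInitial(Parser::new);

    public static void main(String[] args) throws InterruptedException {
        var pool = Executors.newFixedThreadPool(4);
        for (int i = 0; i < 8; i++) {
            final int n = i;
            pool.execute(() -> System.out.println(PARSER.get().parse("Query " + n)));
        }
        pool.shutdown();
        pool.awaitTermination(10, TimeUnit.SECONDS);
    }
}
```

The trade-off is one model-holding instance per thread rather than one overall, bounded by the number of query threads.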
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java
index 08dcef4c..344973e4 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java
@@ -6,15 +6,18 @@ import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
 import nu.marginalia.util.language.processing.KeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
 import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
 import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import org.apache.commons.lang3.tuple.Pair;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Disabled;
@@ -26,6 +29,7 @@
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.*;
 import java.util.regex.Pattern;
+import java.util.stream.IntStream;
 
 @Tag("slow")
 class SentenceExtractorTest {
@@ -38,7 +42,6 @@ class SentenceExtractorTest {
 
         newSe = new SentenceExtractor(lm);
         legacySe = new SentenceExtractor(lm);
-        legacySe.setLegacyMode(true);
     }
 
 
@@ -83,7 +86,7 @@ class SentenceExtractorTest {
             var dld = se.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
             Map<String, Integer> counts = new HashMap<>();
             for (var sentence : dld.sentences) {
-                for (WordSpan kw : keywordExtractor.getNames(sentence)) {
+                for (WordSpan kw : keywordExtractor.getProperNames(sentence)) {
                     if (kw.end + 2 >= sentence.length()) {
                         continue;
                     }
@@ -145,7 +148,22 @@ class SentenceExtractorTest {
         for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
             var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
             var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata());
-            System.out.println(newRes);
+
+            var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
+                    .sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
+                    .limit(100)
+                    .map(Pair::getKey)
+                    .toArray(String[]::new);
+            System.out.println(Arrays.toString(terms));
+
+            var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new EdgePageWordMetadata(newRes.metadata.get(i))))
+                    .sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
+                    .filter(e -> e.getValue().hasFlag(EdgePageWordFlags.Subjects))
+                    .limit(100)
+                    .map(Pair::getKey)
+                    .toArray(String[]::new);
+            System.out.println(Arrays.toString(terms2));
+            System.out.println("--");
         }
         System.out.println(System.currentTimeMillis() - st);
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java
index b0c98dc9..6971d9b7 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostsTest.java
@@ -4,7 +4,7 @@ import nu.marginalia.util.ParallelPipe;
 import nu.marginalia.util.TestLanguageModels;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
 import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java
index a2b60163..40a58e93 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaTest.java
@@ -6,7 +6,7 @@ import nu.marginalia.util.TestLanguageModels;
 import nu.marginalia.util.language.DocumentDebugger;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
 import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
index f6d74999..b2477ca8 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
@@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.search.query;
 
 import nu.marginalia.util.TestLanguageModels;
 import nu.marginalia.util.language.conf.LanguageModels;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.junit.jupiter.api.BeforeAll;
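The reworked SentenceExtractorTest no longer prints the raw keyword result but ranks it: pair each word with its decoded metadata by index, sort descending by tf-idf, optionally filter on a word flag, and keep the top 100. Stripped of the repo types (EdgePageWordMetadata, EdgePageWordFlags), the pattern reduces to the self-contained sketch below, with a hypothetical WordMeta record standing in for the metadata:

```java
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.IntStream;

public class TopTermsDemo {
    // Hypothetical stand-in for EdgePageWordMetadata.
    record WordMeta(int tfIdf, boolean subject) {}

    public static void main(String[] args) {
        List<String> words = List.of("engine", "search", "crawler", "index");
        List<WordMeta> meta = List.of(
                new WordMeta(7, false), new WordMeta(12, true),
                new WordMeta(9, true), new WordMeta(3, false));

        // Pair words and metadata by index, sort by descending tf-idf, keep top N.
        String[] top = IntStream.range(0, words.size())
                .boxed()
                .sorted(Comparator.comparingInt(i -> -meta.get(i).tfIdf()))
                .limit(3)
                .map(words::get)
                .toArray(String[]::new);
        System.out.println(Arrays.toString(top)); // [search, crawler, engine]

        // Same ranking, restricted to flagged words (cf. EdgePageWordFlags.Subjects).
        String[] subjects = IntStream.range(0, words.size())
                .boxed()
                .filter(i -> meta.get(i).subject())
                .sorted(Comparator.comparingInt(i -> -meta.get(i).tfIdf()))
                .limit(3)
                .map(words::get)
                .toArray(String[]::new);
        System.out.println(Arrays.toString(subjects)); // [search, crawler]
    }
}
```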