diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java index 6cee3058..d4c0232e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/DocumentDebugger.java @@ -69,7 +69,7 @@ public class DocumentDebugger { Set reps = new HashSet<>(); // kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed)); - kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed)); + kc.count(languageData).forEach(rep -> reps.add(rep.stemmed)); try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index fbe0b8de..570d2462 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -39,13 +39,12 @@ public class DocumentKeywordExtractor { public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) { - var titleWords = extractTitleWords(documentLanguageData); - - var wordsTfIdf = tfIdfCounter.count(documentLanguageData, 0.75); - var wordsNamesRepeated = nameCounter.count(documentLanguageData, 2); - var wordsNamesAll = nameCounter.count(documentLanguageData, 1); - var subjects = subjectCounter.count(documentLanguageData); + List titleWords = extractTitleWords(documentLanguageData); + List wordsTfIdf = tfIdfCounter.count(documentLanguageData); + List wordsNamesRepeated = nameCounter.count(documentLanguageData, 2); + List wordsNamesAll = nameCounter.count(documentLanguageData, 1); + List subjects = subjectCounter.count(documentLanguageData); List wordsLongName = 
longNameCounter.count(documentLanguageData); int totalSize = wordsTfIdf.size(); @@ -55,8 +54,8 @@ public class DocumentKeywordExtractor { List topKeywords = new ArrayList<>(totalSize / 2); for(var v : wordsTfIdf) { - if (topKeywords.size() < totalSize / 10) topKeywords.add(v); - else if (midKeywords.size() < totalSize / 5) midKeywords.add(v); + if (topKeywords.size() <= totalSize / 10) topKeywords.add(v); + else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v); else lowKeywords.add(v); } @@ -125,17 +124,18 @@ public class DocumentKeywordExtractor { } } - return counts.entrySet().stream().filter(c2 -> c2.getValue()>=1) - .sorted(Comparator.comparing(this::value)) + return counts.entrySet().stream() + .sorted(Comparator.comparing(e -> { + double N = 11820118.; // Number of documents in term freq dictionary + + // Caveat: This is actually the *negated* term score, because the second logarithm has + // its parameter inverted (log(a^b) = b log(a); here b = -1) + return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N); + })) .map(Map.Entry::getKey) - .limit(512).collect(Collectors.toSet()); + .limit(512).collect(Collectors.toCollection(LinkedHashSet::new)); } - private double value(Map.Entry e) { - double N = 11820118.; // Number of documents in term freq dictionary - - return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N); - } public EdgePageWords createWords(IndexBlock block, Collection words) { return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet())); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java index fbe0191c..49cee9bf 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/KeywordCounter.java @@ -1,15 +1,12 @@ package nu.marginalia.util.language.processing; import nu.marginalia.util.language.processing.model.DocumentLanguageData; -import nu.marginalia.util.language.processing.model.DocumentSentence; import nu.marginalia.util.language.processing.model.WordRep; -import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import java.util.*; import java.util.regex.Pattern; import java.util.stream.Collectors; -import java.util.stream.IntStream; public class KeywordCounter { private final KeywordExtractor keywordExtractor; @@ -20,58 +17,29 @@ public class KeywordCounter { this.keywordExtractor = keywordExtractor; } - public List count(DocumentLanguageData dld, double cutoff) { + public List count(DocumentLanguageData dld) { HashMap counts = new HashMap<>(1000); - HashMap> instances = new HashMap<>(1000); + HashMap> instances = new HashMap<>(1000); - for (int i = 0; i < dld.sentences.length; i++) { - DocumentSentence sent = dld.sentences[i]; - double value = 1.0 / Math.log(1+i); + for (var sent : dld.sentences) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { - var stemmed = sent.constructStemmedWordFromSpan(span); - if (stemmed.isBlank()) - continue; - counts.merge(stemmed, value, Double::sum); + String stemmed = sent.constructStemmedWordFromSpan(span); - instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(sent.constructWordFromSpan(span)); + counts.merge(stemmed, 1., Double::sum); + instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span)); } } - var topWords = counts.entrySet().stream() - .filter(w -> w.getValue() > cutoff) + return counts.entrySet().stream() + .filter(e -> e.getValue() > 1) .sorted(Comparator.comparing(this::getTermValue)) - .limit(Math.min(100, counts.size()/2)) .map(Map.Entry::getKey) + .flatMap(w -> 
instances.get(w).stream()) + .filter(w -> w.word.length() > 1) + .limit(150) .collect(Collectors.toList()); - - var topWordsSet = new HashSet<>(topWords); - - final Set keywords = new HashSet<>(); - - for (var sentence : dld.sentences) { - for (WordSpan kw : keywordExtractor.getKeywordsFromSentence(sentence)) { - String stemmedWord = sentence.constructStemmedWordFromSpan(kw); - if (topWords.contains(stemmedWord)) { - keywords.add(new WordRep(sentence, kw)); - } - } - } - - for (var sentence : dld.sentences) { - for (var kw : keywordExtractor.getKeywordsFromSentenceStrict(sentence, topWordsSet, true)) { - keywords.add(new WordRep(sentence, kw)); - } - } - - Map sortOrder = IntStream.range(0, topWords.size()).boxed().collect(Collectors.toMap(topWords::get, i->i)); - - Comparator comp = Comparator.comparing(wr -> sortOrder.getOrDefault(wr.stemmed, topWords.size())); - - var ret = new ArrayList<>(keywords); - ret.sort(comp); - return ret; } private static final Pattern separator = Pattern.compile("_"); @@ -86,7 +54,11 @@ public class KeywordCounter { } double value(String key, double value) { - return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.); + double freq = dict.getTermFreqStemmed(key); + if (freq < 1) { + freq = 10; + } + return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java index 7c976e24..3943e046 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/LongNameCounter.java @@ -56,7 +56,7 @@ public class LongNameCounter { } double value(String key, double value) { - return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.); + return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.); 
} diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java index d21b4904..80ff77f5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/SubjectCounter.java @@ -5,7 +5,9 @@ import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import java.util.*; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.stream.Collectors; public class SubjectCounter { @@ -15,6 +17,14 @@ public class SubjectCounter { this.keywordExtractor = keywordExtractor; } + // Seeks out subjects in a sentence by constructs like + // + // [Name] (Verbs) (the|a|Adverb|Verb|Adjective) ... + // e.g. + // + // Greeks bearing gifts -> Greeks + // Steve McQueen drove fast | cars -> Steve McQueen + public List count(DocumentLanguageData dld) { Map counts = new HashMap<>(); @@ -27,9 +37,10 @@ public class SubjectCounter { || sentence.separators[kw.end + 1] == WordSeparator.COMMA) break; - if (("VBZ".equals(sentence.posTags[kw.end]) || "VBP".equals(sentence.posTags[kw.end])) - && ("DT".equals(sentence.posTags[kw.end + 1]) || "RB".equals(sentence.posTags[kw.end]) || sentence.posTags[kw.end].startsWith("VB")) - ) { + String nextTag = sentence.posTags[kw.end]; + String nextNextTag = sentence.posTags[kw.end+1]; + + if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) { counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum); } } @@ -43,4 +54,16 @@ public class SubjectCounter { .collect(Collectors.toList()); } + private boolean isDetOrAdverbOrVerb(String posTag) { + return "DT".equals(posTag) // determiner + || "RB".equals(posTag) // adverb + || 
posTag.startsWith("VB") // verb + || posTag.startsWith("JJ"); // adjective + } + + boolean isVerb(String posTag) { + return posTag.startsWith("VB") + && !posTag.equals("VB"); // not interested in the infinitive + } + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index a347d2e4..14a654d9 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -11,7 +11,6 @@ public enum IndexBlock { Meta(7, 7), PositionWords(8, 4.5), NamesWords(9, 5), - TermFreq(10, 10), Topic(11, 0.5); public final int id; diff --git a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java index 7706e8d1..3e3bd58f 100644 --- a/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java +++ b/third_party/src/main/java/org/openzim/ZIMTypes/ZIMReader.java @@ -256,8 +256,8 @@ public class ZIMReader { try { getArticleData(consumer, pos, blobs); } - catch (IOException ex) { - + catch (Exception ex) { + ex.printStackTrace(); } }); @@ -384,7 +384,12 @@ public class ZIMReader { rb = is.read(data, trb, data.length - trb); trb += rb; } - consumer.accept(blobToUrl.get(blobNumber), new String(data)); + try { + consumer.accept(blobToUrl.get(blobNumber), new String(data)); + } + catch (Exception ex) { + ex.printStackTrace(); + } } } System.out.println(clusterNumber + " " + blobToUrl.size());