Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 05:18:58 +00:00
Tweaks to keyword extraction

commit 825dea839d
parent 64844e1db2
@@ -69,7 +69,7 @@ public class DocumentDebugger {
         Set<String> reps = new HashSet<>();

 //        kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
-        kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
+        kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));

         try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
@@ -39,13 +39,12 @@ public class DocumentKeywordExtractor {

     public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {

-        var titleWords = extractTitleWords(documentLanguageData);
-        var wordsTfIdf = tfIdfCounter.count(documentLanguageData, 0.75);
-        var wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
-        var wordsNamesAll = nameCounter.count(documentLanguageData, 1);
-        var subjects = subjectCounter.count(documentLanguageData);
+        List<WordRep> titleWords = extractTitleWords(documentLanguageData);
+        List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
+        List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
+        List<WordRep> subjects = subjectCounter.count(documentLanguageData);
         List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);

         int totalSize = wordsTfIdf.size();
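Note: dropping the 0.75 cutoff argument here is not a loss of filtering; as the KeywordCounter hunk further down shows, the cutoff is replaced by a fixed e.getValue() > 1 filter inside count(), so extractKeywords no longer has to pick a threshold itself.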
@@ -55,8 +54,8 @@ public class DocumentKeywordExtractor {
         List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);

         for(var v : wordsTfIdf) {
-            if (topKeywords.size() < totalSize / 10) topKeywords.add(v);
-            else if (midKeywords.size() < totalSize / 5) midKeywords.add(v);
+            if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
+            else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
             else lowKeywords.add(v);
         }

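Note: a quick worked example of the <= change (numbers are illustrative): with about 100 TF-IDF keywords, roughly the first 10 land in topKeywords, the next 20 in midKeywords, and the rest in lowKeywords, much as before. The difference shows up on short documents: when totalSize / 10 rounds down to 0, the old strict < never admitted anything to topKeywords, while <= still lets the first keyword in.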
@@ -125,17 +124,18 @@ public class DocumentKeywordExtractor {
             }
         }

-        return counts.entrySet().stream().filter(c2 -> c2.getValue()>=1)
-                .sorted(Comparator.comparing(this::value))
+        return counts.entrySet().stream()
+                .sorted(Comparator.comparing(e -> {
+                    double N = 11820118.; // Number of documents in term freq dictionary
+
+                    // Caveat: This is actually the *negated* term score, because the second logarithm has
+                    // its parameter inverted (log(a^b) = b log(a); here b = -1)
+                    return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
+                }))
                 .map(Map.Entry::getKey)
-                .limit(512).collect(Collectors.toSet());
+                .limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
     }

-    private double value(Map.Entry<String, Integer> e) {
-        double N = 11820118.; // Number of documents in term freq dictionary
-
-        return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
-    }
-
     public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
         return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
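Note: a self-contained sketch of the scoring the new inline comparator performs, with made-up class names and document frequencies standing in for dict.getTermFreq(); this illustrates the negated TF-IDF idea, it is not the project's code. Because df/N is below 1, the second logarithm is negative, so an ascending sort puts the rarest, most-repeated terms first, and collecting into a LinkedHashSet (instead of the old unordered toSet()) preserves that ranking:

import java.util.*;
import java.util.stream.Collectors;

public class NegatedTfIdfSketch {
    // Hypothetical document-frequency table standing in for dict.getTermFreq()
    static final Map<String, Long> DOC_FREQ = Map.of(
            "keyword", 50_000L,
            "extraction", 200_000L,
            "the", 9_000_000L);

    static final double N = 11_820_118.; // document count used in the commit

    // (1 + log(tf)) * log((1 + df) / N): df/N < 1, so the second factor is negative;
    // the more negative the product, the better the keyword.
    static double negatedScore(String term, long tf) {
        long df = DOC_FREQ.getOrDefault(term, 1L);
        return (1 + Math.log(tf)) * Math.log((1. + df) / N);
    }

    public static void main(String[] args) {
        Map<String, Long> termCounts = Map.of("keyword", 7L, "extraction", 4L, "the", 40L);

        // Ascending sort on the negated score == descending TF-IDF;
        // LinkedHashSet keeps the insertion (i.e. ranking) order.
        Set<String> ranked = termCounts.entrySet().stream()
                .sorted(Comparator.comparingDouble(e -> negatedScore(e.getKey(), e.getValue())))
                .map(Map.Entry::getKey)
                .collect(Collectors.toCollection(LinkedHashSet::new));

        System.out.println(ranked); // [keyword, extraction, the]
    }
}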
@@ -1,15 +1,12 @@
 package nu.marginalia.util.language.processing;

 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
-import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;

 import java.util.*;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
-import java.util.stream.IntStream;

 public class KeywordCounter {
     private final KeywordExtractor keywordExtractor;
@@ -20,58 +17,29 @@ public class KeywordCounter {
         this.keywordExtractor = keywordExtractor;
     }

-    public List<WordRep> count(DocumentLanguageData dld, double cutoff) {
+    public List<WordRep> count(DocumentLanguageData dld) {
         HashMap<String, Double> counts = new HashMap<>(1000);
-        HashMap<String, HashSet<String>> instances = new HashMap<>(1000);
+        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);

-        for (int i = 0; i < dld.sentences.length; i++) {
-            DocumentSentence sent = dld.sentences[i];
-            double value = 1.0 / Math.log(1+i);
+        for (var sent : dld.sentences) {
             var keywords = keywordExtractor.getKeywordsFromSentence(sent);
             for (var span : keywords) {
-                var stemmed = sent.constructStemmedWordFromSpan(span);
-                if (stemmed.isBlank())
-                    continue;
-
-                counts.merge(stemmed, value, Double::sum);
-                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(sent.constructWordFromSpan(span));
+                String stemmed = sent.constructStemmedWordFromSpan(span);
+
+                counts.merge(stemmed, 1., Double::sum);
+                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
             }
         }

-        var topWords = counts.entrySet().stream()
-                .filter(w -> w.getValue() > cutoff)
+        return counts.entrySet().stream()
+                .filter(e -> e.getValue() > 1)
                 .sorted(Comparator.comparing(this::getTermValue))
-                .limit(Math.min(100, counts.size()/2))
                 .map(Map.Entry::getKey)
+                .flatMap(w -> instances.get(w).stream())
+                .filter(w -> w.word.length() > 1)
+                .limit(150)
                 .collect(Collectors.toList());
-
-        var topWordsSet = new HashSet<>(topWords);
-
-        final Set<WordRep> keywords = new HashSet<>();
-
-        for (var sentence : dld.sentences) {
-            for (WordSpan kw : keywordExtractor.getKeywordsFromSentence(sentence)) {
-                String stemmedWord = sentence.constructStemmedWordFromSpan(kw);
-                if (topWords.contains(stemmedWord)) {
-                    keywords.add(new WordRep(sentence, kw));
-                }
-            }
-        }
-
-        for (var sentence : dld.sentences) {
-            for (var kw : keywordExtractor.getKeywordsFromSentenceStrict(sentence, topWordsSet, true)) {
-                keywords.add(new WordRep(sentence, kw));
-            }
-        }
-
-        Map<String, Integer> sortOrder = IntStream.range(0, topWords.size()).boxed().collect(Collectors.toMap(topWords::get, i->i));
-
-        Comparator<WordRep> comp = Comparator.comparing(wr -> sortOrder.getOrDefault(wr.stemmed, topWords.size()));
-
-        var ret = new ArrayList<>(keywords);
-        ret.sort(comp);
-        return ret;
     }

     private static final Pattern separator = Pattern.compile("_");
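Note: the rewritten count() is easier to see end to end as a small standalone approximation; the types and the toy scoring below are placeholders for DocumentLanguageData, WordRep and getTermValue(), not the real ones. The shape is the same: tally stemmed spans, keep only terms that repeat within the document, rank by term value, then expand back to the concrete word forms and cap the list at 150:

import java.util.*;
import java.util.stream.Collectors;

public class KeywordCountSketch {
    // Toy stand-in for getTermValue(): lower (more negative) = better keyword.
    static double termValue(Map.Entry<String, Double> e) {
        return -e.getValue();
    }

    // Roughly the new shape of KeywordCounter.count(): tally stemmed spans,
    // drop anything seen only once, rank by term value, expand back to surface forms.
    static List<String> count(List<List<String>> sentences) {
        Map<String, Double> counts = new HashMap<>();
        Map<String, Set<String>> instances = new HashMap<>();

        for (var sentence : sentences) {
            for (String word : sentence) {
                String stemmed = word.toLowerCase(Locale.ROOT); // stand-in for real stemming
                counts.merge(stemmed, 1., Double::sum);
                instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(word);
            }
        }

        return counts.entrySet().stream()
                .filter(e -> e.getValue() > 1)                      // must repeat within the document
                .sorted(Comparator.comparingDouble(KeywordCountSketch::termValue))
                .map(Map.Entry::getKey)
                .flatMap(k -> instances.get(k).stream())            // back to concrete word forms
                .filter(w -> w.length() > 1)
                .limit(150)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        var doc = List.of(
                List.of("Keyword", "extraction", "tweaks"),
                List.of("keyword", "counting", "and", "extraction"));
        System.out.println(count(doc)); // e.g. [Keyword, keyword, extraction] (set order may vary)
    }
}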
@@ -86,7 +54,11 @@ public class KeywordCounter {
     }

     double value(String key, double value) {
-        return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.);
+        double freq = dict.getTermFreqStemmed(key);
+        if (freq < 1) {
+            freq = 10;
+        }
+        return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
     }

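Note: rough numbers for the new freq floor (back-of-the-envelope, not from the commit): a key missing from the stemmed frequency dictionary is treated as if it occurred in 10 documents, giving a second factor of about ln(11.1 / 11820118) ≈ -13.9, rather than the ≈ -16.2 a zero frequency would produce; a term seen in around a million documents lands near -2.5. Terms are still sorted ascending on this negated score, so unseen and rare terms rank first, but the floor presumably keeps never-seen tokens from dominating quite as strongly.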
@@ -56,7 +56,7 @@ public class LongNameCounter {
     }

     double value(String key, double value) {
-        return (1+Math.log(value)) * Math.log((1.+dict.getTermFreq(key))/11820118.);
+        return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.);
     }

@@ -5,7 +5,9 @@ import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;

-import java.util.*;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 import java.util.stream.Collectors;

 public class SubjectCounter {
@@ -15,6 +17,14 @@ public class SubjectCounter {
         this.keywordExtractor = keywordExtractor;
     }

+    // Seeks out subjects in a sentence by constructs like
+    //
+    // [Name] (Verbs) (the|a|Adverb|Verb) ...
+    // e.g.
+    //
+    // Greeks bearing gifts -> Greeks
+    // Steve McQueen drove fast | cars -> Steve McQueen
+
     public List<WordRep> count(DocumentLanguageData dld) {

         Map<WordRep, Integer> counts = new HashMap<>();
@@ -27,9 +37,10 @@ public class SubjectCounter {
                         || sentence.separators[kw.end + 1] == WordSeparator.COMMA)
                     break;

-                if (("VBZ".equals(sentence.posTags[kw.end]) || "VBP".equals(sentence.posTags[kw.end]))
-                        && ("DT".equals(sentence.posTags[kw.end + 1]) || "RB".equals(sentence.posTags[kw.end]) || sentence.posTags[kw.end].startsWith("VB"))
-                ) {
+                String nextTag = sentence.posTags[kw.end];
+                String nextNextTag = sentence.posTags[kw.end+1];
+
+                if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
                     counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum);
                 }
             }
@@ -43,4 +54,16 @@ public class SubjectCounter {
                 .collect(Collectors.toList());
     }

+    private boolean isDetOrAdverbOrVerb(String posTag) {
+        return "DT".equals(posTag) // determinant
+                || "RB".equals(posTag) // adverb
+                || posTag.startsWith("VB") // verb
+                || posTag.startsWith("JJ"); // adjective
+    }
+
+    boolean isVerb(String posTag) {
+        return posTag.startsWith("VB")
+                && !posTag.equals("VB"); // not interested in the infinitive
+    }
+
 }
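Note: a standalone illustration of the refactored subject check, reusing the two predicates above on a hand-written Penn Treebank tag sequence; the tags for "Steve McQueen drove fast cars" are an assumption about what a typical tagger would emit, not output from the project's model:

public class SubjectCheckSketch {
    static boolean isDetOrAdverbOrVerb(String posTag) {
        return "DT".equals(posTag)          // determiner: "the", "a"
                || "RB".equals(posTag)      // adverb
                || posTag.startsWith("VB")  // any verb form
                || posTag.startsWith("JJ"); // adjective
    }

    static boolean isVerb(String posTag) {
        return posTag.startsWith("VB") && !posTag.equals("VB"); // skip the bare infinitive tag
    }

    public static void main(String[] args) {
        // "Steve McQueen drove fast cars": the name span ends just before "drove".
        String[] tags = {"NNP", "NNP", "VBD", "JJ", "NNS"};
        int spanEnd = 2; // index of the first tag after the name span

        boolean looksLikeSubject = isVerb(tags[spanEnd]) && isDetOrAdverbOrVerb(tags[spanEnd + 1]);
        System.out.println(looksLikeSubject); // true: VBD is a verb, JJ passes the second check
    }
}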
@@ -11,7 +11,6 @@ public enum IndexBlock {
     Meta(7, 7),
     PositionWords(8, 4.5),
     NamesWords(9, 5),
-    TermFreq(10, 10),
     Topic(11, 0.5);

     public final int id;
@@ -256,8 +256,8 @@ public class ZIMReader {
            try {
                getArticleData(consumer, pos, blobs);
            }
-           catch (IOException ex) {
+           catch (Exception ex) {
+               ex.printStackTrace();
            }
        });

@@ -384,7 +384,12 @@ public class ZIMReader {
                    rb = is.read(data, trb, data.length - trb);
                    trb += rb;
                }
-               consumer.accept(blobToUrl.get(blobNumber), new String(data));
+               try {
+                   consumer.accept(blobToUrl.get(blobNumber), new String(data));
+               }
+               catch (Exception ex) {
+                   ex.printStackTrace();
+               }
            }
        }
        System.out.println(clusterNumber + " " + blobToUrl.size());
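Note: both ZIMReader hunks apply the same pattern: an exception raised while handling one article is printed and swallowed so the surrounding loop keeps going instead of aborting the whole cluster scan. A generic sketch of that pattern, with placeholder item and handler types rather than the ZIMReader API:

import java.util.List;
import java.util.function.BiConsumer;

public class PerItemErrorIsolation {
    // Apply a handler to each (key, value) pair; one bad item must not stop the rest.
    static void forEachIsolated(List<String> keys, List<String> values, BiConsumer<String, String> handler) {
        for (int i = 0; i < keys.size(); i++) {
            try {
                handler.accept(keys.get(i), values.get(i));
            }
            catch (Exception ex) {
                ex.printStackTrace(); // log and continue, as the commit does
            }
        }
    }

    public static void main(String[] args) {
        forEachIsolated(
                List.of("A/ok", "A/broken", "A/also-ok"),
                List.of("payload 1", "payload 2", "payload 3"),
                (url, body) -> {
                    if (url.contains("broken")) throw new IllegalStateException("bad article");
                    System.out.println(url + " -> " + body);
                });
        // Prints the first and third article; the exception from the second is only logged.
    }
}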