Refactor DocumentKeyword-related classes
This commit is contained in:
parent efb46cc703
commit 2bc212d65c
@@ -24,6 +24,7 @@ dependencies {
    implementation libs.notnull
    implementation libs.trove
    implementation libs.fastutil

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
@@ -9,11 +9,6 @@ public record DocumentKeywords(
        String[] keywords,
        long[] metadata) {

    DocumentKeywords(DocumentKeywordsBuilder words) {
        this(words.words.toArray(String[]::new),
             words.metadata.toArray());
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
@@ -1,17 +1,15 @@
package nu.marginalia.converting.model;

import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.model.crawl.EdgePageWordFlags;

import java.util.*;
import java.util.function.UnaryOperator;

@ToString @Getter
public class DocumentKeywordsBuilder {
    public final ArrayList<String> words = new ArrayList<>();
    public final TLongArrayList metadata = new TLongArrayList();
    public final Object2LongLinkedOpenHashMap<String> words;

    // |------64 letters is this long-------------------------------|
    // granted, some of these words are word n-grams, but 64 ought to
@@ -23,59 +21,61 @@ public class DocumentKeywordsBuilder {
    }

    public DocumentKeywords build() {
        return new DocumentKeywords(this);
        final String[] wordArray = new String[words.size()];
        final long[] meta = new long[words.size()];

        var iter = words.object2LongEntrySet().fastIterator();

        for (int i = 0; iter.hasNext(); i++) {
            var entry = iter.next();

            meta[i] = entry.getLongValue();
            wordArray[i] = entry.getKey();
        }

        return new DocumentKeywords(wordArray, meta);
    }

    public DocumentKeywordsBuilder(int cacpacity) {
        words.ensureCapacity(cacpacity);
        metadata.ensureCapacity(cacpacity);
        words = new Object2LongLinkedOpenHashMap<>(cacpacity);
    }

    public void add(String word, long meta) {
        if (word.length() > MAX_WORD_LENGTH)
            return;

        words.add(word);
        metadata.add(meta);
        words.put(word, meta);
    }

    public void addJustNoMeta(String word) {
        if (word.length() > MAX_WORD_LENGTH)
            return;

        words.add(word);
        metadata.add(0);
        words.putIfAbsent(word, 0);
    }

    public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
        if (flagWords.isEmpty())
            return;

        for (int i = 0; i < words.size(); i++) {
            if (flagWords.contains(words.get(i))) {
                metadata.set(i, metadata.get(i) | flag.asBit());
            }
        }
        flagWords.forEach(word ->
                words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
        );
    }

    public void addAllSyntheticTerms(Collection<String> newWords) {
        words.ensureCapacity(words.size() + newWords.size());
        metadata.ensureCapacity(metadata.size() + newWords.size());

        long meta = EdgePageWordFlags.Synthetic.asBit();

        for (var entry : newWords) {
            words.add(entry);
            metadata.add(meta);
        }
        newWords.forEach(word -> {
            words.putIfAbsent(word, meta);
        });

    }

    public List<String> getWordsWithAnyFlag(long flags) {
        List<String> ret = new ArrayList<>();

        for (int i = 0; i < words.size(); i++) {
            if ((metadata.get(i) & flags) > 0) {
                ret.add(words.get(i));
        for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) {
            var entry = iter.next();
            if ((flags & entry.getLongValue()) != 0) {
                ret.add(entry.getKey());
            }
        }

@@ -86,8 +86,4 @@ public class DocumentKeywordsBuilder {
        return words.size();
    }

    public void internalize(UnaryOperator<String> internalizer) {
        words.replaceAll(internalizer);
    }

}
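Note: the refactor above replaces the parallel ArrayList/TLongArrayList pair with a single insertion-ordered fastutil map keyed by word, so duplicate words collapse into one entry and flag bits are OR-merged in place. Below is a minimal stand-alone sketch of that accumulation pattern; the flag bit values are illustrative stand-ins, but the map calls (put, putIfAbsent, mergeLong, object2LongEntrySet().fastIterator()) are exactly the ones used in the hunk above.

import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;

// Minimal sketch of the map-based accumulation pattern used by DocumentKeywordsBuilder.
class KeywordAccumulatorSketch {
    public static void main(String[] args) {
        var words = new Object2LongLinkedOpenHashMap<String>(100);

        words.put("marginalia", 0x01L);      // add(): word with explicit metadata
        words.putIfAbsent("search", 0L);     // addJustNoMeta(): blank metadata if absent
        words.putIfAbsent("marginalia", 0L); // duplicate word: no effect, first metadata wins

        // setFlagOnMetadataForWords(): OR a flag bit into an existing entry
        long subjectsBit = 0x04L;            // stand-in for EdgePageWordFlags.Subjects.asBit()
        words.mergeLong("marginalia", subjectsBit, (a, b) -> a | b);

        // build(): unpack the map into the parallel arrays DocumentKeywords stores
        String[] wordArray = new String[words.size()];
        long[] meta = new long[words.size()];
        var iter = words.object2LongEntrySet().fastIterator();
        for (int i = 0; iter.hasNext(); i++) {
            var entry = iter.next();
            wordArray[i] = entry.getKey();
            meta[i] = entry.getLongValue();
        }

        System.out.println(wordArray[0] + " -> " + Long.toBinaryString(meta[0])); // marginalia -> 101
    }
}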
@@ -46,26 +46,15 @@ public class DomainProcessor {

        fixBadCanonicalTags(crawledDomain.doc);

        StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size());

        for (var doc : crawledDomain.doc) {
            var processedDoc = documentProcessor.process(doc, crawledDomain);

            if (processedDoc.words != null) {
                // The word data is extremely redundant, and may encompass something like
                // 5,000,000 words per domain (and multiple domains are processed at the same time).

                processedDoc.words.internalize(stringPool::internalize);
            }

            if (processedDoc.url != null) {
                ret.documents.add(processedDoc);
            }

        }

        stringPool.flush();

        documentDeduplicator.deduplicate(ret.documents);

        InternalLinkGraph internalLinkGraph = new InternalLinkGraph();
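Note: the comment in this hunk explains why DocumentKeywordsBuilder gained internalize(): per-domain word data repeats the same strings millions of times, so DomainProcessor maps every keyword through a shared string pool. A minimal illustration of the idea follows, using a plain HashMap in place of Marginalia's StringPool (whose API beyond create/internalize/flush isn't shown in this diff), applied through a UnaryOperator like the one internalize() accepts.

import java.util.HashMap;
import java.util.Map;
import java.util.function.UnaryOperator;

// Illustration only: a HashMap-backed interner standing in for StringPool.
// Equal strings from different documents collapse to one shared instance,
// which is the effect of processedDoc.words.internalize(stringPool::internalize).
class InternalizerSketch {
    public static void main(String[] args) {
        Map<String, String> pool = new HashMap<>();
        UnaryOperator<String> internalizer = s -> pool.computeIfAbsent(s, k -> k);

        String a = new String("keyword");
        String b = new String("keyword");
        System.out.println(a == b);                                           // false: two copies
        System.out.println(internalizer.apply(a) == internalizer.apply(b));   // true: one shared copy
    }
}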
@@ -1,19 +1,17 @@
package nu.marginalia.converting.processor.keywords;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.converting.processor.keywords.extractors.*;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.statistics.TermFrequencyDict;

import javax.inject.Inject;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class DocumentKeywordExtractor {
@@ -22,165 +20,56 @@ public class DocumentKeywordExtractor {
    private final KeywordCounter tfIdfCounter;
    private final NameCounter nameCounter;
    private final SubjectCounter subjectCounter;
    private final ArtifactKeywords artifactKeywords;

    private final SimpleKeywords simpleKeywords;
    private final DocumentKeywordPositionBitmaskExtractor keywordPositions;

    @Inject
    public DocumentKeywordExtractor(TermFrequencyDict dict) {
        keywordExtractor = new KeywordExtractor();

        keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
        artifactKeywords = new ArtifactKeywords();

        tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
        nameCounter = new NameCounter(keywordExtractor);
        subjectCounter = new SubjectCounter(keywordExtractor);
        simpleKeywords = new SimpleKeywords(keywordExtractor);
    }

    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData) {

        KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);

        List<WordRep> wordsTfIdf = tfIdfCounter.updateWordStatistics(keywordMetadata, documentLanguageData);

        List<WordRep> titleWords = extractTitleWords(documentLanguageData);

        getWordPositions(keywordMetadata, documentLanguageData);

        List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);

        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
        List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);

        List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);

        for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
        for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);

        List<String> artifacts = getArtifacts(documentLanguageData);
        DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();

        FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder();
        simpleKeywords.getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);

        createWords(wordsBuilder, keywordMetadata, titleWords, 0);
        createWords(wordsBuilder, keywordMetadata, wordsTfIdf, EdgePageWordFlags.TfIdfHigh.asBit());
        createWords(wordsBuilder, keywordMetadata, subjects, 0);
        createWords(wordsBuilder, keywordMetadata, wordsTfIdf);
        createWords(wordsBuilder, keywordMetadata, titleWords);
        createWords(wordsBuilder, keywordMetadata, subjects);

        getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);
        wordsBuilder.addAllSyntheticTerms(artifacts);

        artifacts.forEach(wordsBuilder::addWithBlankMetadata);

        return wordsBuilder.build();
        return wordsBuilder;
    }


    public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();

        for (var sent : dld.titleSentences) {
            int posBit = 1;

            for (var word : sent) {
                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }
        }

        int pos = 1;
        int line = 0;
        for (var sent : dld.sentences) {
            int posBit = (int)((1L << pos) & 0xFFFF_FFFFL);

            for (var word : sent) {
                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            if (pos < 4) pos ++;
            else if (pos < 8) {
                if (++line >= 2) {
                    pos++;
                    line = 0;
                }
            }
            else if (pos < 24) {
                if (++line >= 4) {
                    pos++;
                    line = 0;
                }
            }
            else if (pos < 64) {
                if (++line > 8) {
                    pos++;
                    line = 0;
                }
            }
            else {
                break;
            }
        }
    }

    private int bitwiseOr(int a, int b) {
        return a | b;
    }


    private void getSimpleWords(FilteringDocumentKeywordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {

        EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);

        for (var sent : documentLanguageData.sentences) {

            if (wordsBuilder.size() > 1500)
                break;

            for (var word : sent) {
                if (!word.isStopWord()) {
                    String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
                    if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
                        wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
                    }
                }
            }

            for (var names : keywordExtractor.getProperNames(sent)) {
                var rep = new WordRep(sent, names);
                String w = AsciiFlattener.flattenUnicode(rep.word);

                wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
            }
        }

    }

    private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
    private List<String> getArtifacts(DocumentLanguageData documentLanguageData) {
        Set<String> reps = new HashSet<>();

        for (var sent : documentLanguageData.sentences) {
            for (var word : sent) {
                String lc = word.wordLowerCase();
                if (lc.length() > 6
                        && lc.indexOf('@') > 0
                        && mailLikePattern.matcher(lc).matches()) {

                    reps.add(lc);

                    String domain = lc.substring(lc.indexOf('@'));
                    String user = lc.substring(0, lc.indexOf('@'));

                    if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
                        reps.add(domain);
                    }
                    if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
                        reps.add(user);
                    }

                }
            }
        }
        return new ArrayList<>(reps);
    }

    private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
        return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
                keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
@@ -188,43 +77,18 @@ public class DocumentKeywordExtractor {
                .collect(Collectors.toList());
    }

    public void createWords(FilteringDocumentKeywordsBuilder wordsBuilder,
    public void createWords(DocumentKeywordsBuilder wordsBuilder,
                            KeywordMetadata metadata,
                            Collection<WordRep> words,
                            long additionalMeta) {
                            Collection<WordRep> words) {

        for (var word : words) {

            String flatWord = AsciiFlattener.flattenUnicode(word.word);
            if (!WordPatterns.hasWordQualities(flatWord)) {
                continue;
            }

            wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
            if (WordPatterns.hasWordQualities(flatWord)) {
                wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed));
            }
        }
    }

    private static class FilteringDocumentKeywordsBuilder {
        private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder();
        private final Set<String> seen = new HashSet<>(1600);

        public void add(String word, long meta) {
            if (seen.add(word)) {
                words.add(word, meta);
            }
        }
        public void addWithBlankMetadata(String word) {
            if (seen.add(word)) {
                words.addJustNoMeta(word);
            }
        }

        public DocumentKeywordsBuilder build() {
            return words;
        }

        public int size() {
            return seen.size();
        }
    }
}
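Note: the net API change in this class is that extractKeywords no longer takes a caller-supplied KeywordMetadata; it derives one from the position-bitmask pass and returns the builder directly. A before/after sketch of the call site follows, mirroring the call sites changed elsewhere in this commit (the surrounding setup of keywordExtractor and dld is assumed).

// Before this commit, call sites constructed the metadata themselves:
//     DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, new KeywordMetadata());

// After this commit, the extractor derives KeywordMetadata internally from word positions:
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);

// build() then packs the word -> metadata map into the parallel String[]/long[] arrays of DocumentKeywords:
DocumentKeywords keywords = words.build();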
@@ -0,0 +1,45 @@
package nu.marginalia.converting.processor.keywords.extractors;

import nu.marginalia.language.model.DocumentLanguageData;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

public class ArtifactKeywords {

    private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");

    public List<String> getArtifactKeywords(DocumentLanguageData documentLanguageData) {
        Set<String> reps = new HashSet<>();

        for (var sent : documentLanguageData.sentences) {
            for (var word : sent) {
                String lc = word.wordLowerCase();
                if (lc.length() < 6
                        || lc.indexOf('@') < 0
                        || !mailLikePattern.matcher(lc).matches()) {
                    continue;
                }

                reps.add(lc);

                String domain = lc.substring(lc.indexOf('@'));
                String user = lc.substring(0, lc.indexOf('@'));

                if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
                    reps.add(domain);
                }
                if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
                    reps.add(user);
                }

            }
        }

        return new ArrayList<>(reps);
    }

}
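Note: ArtifactKeywords (extracted from the old DocumentKeywordExtractor.getArtifacts) indexes e-mail-like tokens and additionally their user and domain parts, skipping a few very common ones. A self-contained illustration of the same regex and splitting on a made-up address:

import java.util.regex.Pattern;

// Stand-alone illustration of the mail-like artifact splitting above; the address is hypothetical.
class ArtifactSketch {
    private static final Pattern mailLikePattern =
            Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");

    public static void main(String[] args) {
        String lc = "jane.doe@example.com";   // already lower-cased, as in the extractor

        if (lc.length() >= 6 && lc.indexOf('@') > 0 && mailLikePattern.matcher(lc).matches()) {
            String domain = lc.substring(lc.indexOf('@'));   // "@example.com"
            String user = lc.substring(0, lc.indexOf('@'));  // "jane.doe"
            System.out.println(lc + " " + domain + " " + user);
        }
    }
}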
@@ -0,0 +1,90 @@
package nu.marginalia.converting.processor.keywords.extractors;

import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;

/** Generates a position bitmask for each word in a document */
public class DocumentKeywordPositionBitmaskExtractor {
    private final KeywordExtractor keywordExtractor;

    @Inject
    public DocumentKeywordPositionBitmaskExtractor(KeywordExtractor keywordExtractor) {
        this.keywordExtractor = keywordExtractor;
    }

    public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
        final KeywordMetadata keywordMetadata = new KeywordMetadata();

        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();

        // Mark the title words as position 0
        for (var sent : dld.titleSentences) {
            int posBit = 1;

            for (var word : sent) {
                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }
        }

        // Mark subsequent sentences in subsequent positions, with increasing sentence step size
        LinePosition linePos = new LinePosition();
        for (var sent : dld.sentences) {

            int posBit = (int)((1L << linePos.pos()) & 0xFFFF_FFFFL);

            for (var word : sent) {
                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            linePos.next();
        }

        return keywordMetadata;
    }

    private int bitwiseOr(int a, int b) {
        return a | b;
    }

    private static class LinePosition {
        private int line = 0;
        private int pos = 1;

        public int pos() {
            return pos;
        }

        public void next() {
            if (pos < 4) pos ++;
            else if (pos < 8) {
                if (++line >= 2) {
                    pos++;
                    line = 0;
                }
            }
            else if (pos < 24) {
                if (++line >= 4) {
                    pos++;
                    line = 0;
                }
            }
            else if (pos < 64) {
                if (++line > 8) {
                    pos++;
                    line = 0;
                }
            }
        }
    }
}
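Note: the class above assigns each word a 32-bit position mask: bit 0 marks words that occur in the title, and body sentences are spread over the remaining lower bits with progressively more sentences sharing each bit (via LinePosition.next()). A stand-alone sketch that reproduces the stepping and prints which bit a given sentence index lands in:

// Reproduces the LinePosition stepping outside the class, for a quick feel of the scheme.
class PositionBitmaskSketch {
    public static void main(String[] args) {
        int pos = 1, line = 0;
        for (int sentence = 0; sentence < 40; sentence++) {
            int posBit = (int) ((1L << pos) & 0xFFFF_FFFFL);
            System.out.printf("sentence %2d -> bit %2d (mask %08x)%n", sentence, pos, posBit);

            // same stepping logic as LinePosition.next()
            if (pos < 4) pos++;
            else if (pos < 8)  { if (++line >= 2) { pos++; line = 0; } }
            else if (pos < 24) { if (++line >= 4) { pos++; line = 0; } }
            else if (pos < 64) { if (++line > 8)  { pos++; line = 0; } }
        }
    }
}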
@@ -1,10 +1,9 @@
package nu.marginalia.converting.processor.keywords;
package nu.marginalia.converting.processor.keywords.extractors;

import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordFrequencyData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
@@ -28,7 +27,7 @@ public class KeywordCounter {
        this.docCount = dict.docCount();
    }

    public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
    public List<WordRep> updateWordStatistics(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
        Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
        HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.keywords;
package nu.marginalia.converting.processor.keywords.extractors;

import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
@@ -0,0 +1,54 @@
package nu.marginalia.converting.processor.keywords.extractors;

import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.model.crawl.EdgePageWordFlags;

import java.util.EnumSet;

public class SimpleKeywords {
    private final KeywordExtractor keywordExtractor;

    public SimpleKeywords(KeywordExtractor keywordExtractor) {
        this.keywordExtractor = keywordExtractor;
    }

    public void getSimpleWords(DocumentKeywordsBuilder wordsBuilder,
                               KeywordMetadata metadata,
                               DocumentLanguageData documentLanguageData) {

        EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);

        for (var sent : documentLanguageData.sentences) {

            if (wordsBuilder.size() > 1500)
                break;

            for (var word : sent) {
                if (word.isStopWord()) {
                    continue;
                }

                String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
                if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
                    wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
                }
            }

            for (var names : keywordExtractor.getProperNames(sent)) {
                var rep = new WordRep(sent, names);
                String w = AsciiFlattener.flattenUnicode(rep.word);

                wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
            }
        }

    }

}
@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.keywords;
package nu.marginalia.converting.processor.keywords.extractors;

import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
@@ -115,8 +115,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
            throw new DisqualifiedException(DisqualificationReason.QUALITY);
        }

        KeywordMetadata keywordMetadata = new KeywordMetadata();

        ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
        ret.description = getDescription(doc);
        ret.hashCode = dld.localitySensitiveHashCode();
@@ -124,7 +122,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
        PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));

        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);

        new MetaTagsBuilder()
            .addDomainCrawlData(crawledDomain)
@@ -91,8 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
        ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));

        KeywordMetadata keywordMetadata = new KeywordMetadata();
        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);

        new MetaTagsBuilder()
            .addDomainCrawlData(crawledDomain)
@@ -1,8 +1,8 @@
package nu.marginalia.converting.tool;

import nu.marginalia.LanguageModels;
import nu.marginalia.converting.processor.keywords.KeywordCounter;
import nu.marginalia.converting.processor.keywords.NameCounter;
import nu.marginalia.converting.processor.keywords.extractors.KeywordCounter;
import nu.marginalia.converting.processor.keywords.extractors.NameCounter;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.model.DocumentSentence;
@@ -60,7 +60,7 @@ class SentenceExtractorTest {
            var doc = Jsoup.parse(Files.readString(file.toPath()));
            long start = System.currentTimeMillis();
            var dld = se.extractSentences(doc);
            documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata());
            documentKeywordExtractor.extractKeywords(dld);
            total += (System.currentTimeMillis() - start);
        }
        System.out.println(total);
@@ -114,40 +114,6 @@ class SentenceExtractorTest {
        System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
    }

    @Test
    void extractSentences() throws IOException {
        var data = WmsaHome.getHomePath().resolve("test-data/");

        System.out.println("Running");

        var dict = new TermFrequencyDict(lm);

        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
        long st = System.currentTimeMillis();
        for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
            var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
            var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata());

            var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
                    .sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
                    .limit(100)
                    .map(Pair::getKey)
                    .toArray(String[]::new);
            System.out.println(Arrays.toString(terms));

            var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
                    .sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
                    .filter(e -> e.getValue().hasFlag(EdgePageWordFlags.Subjects))
                    .limit(100)
                    .map(Pair::getKey)
                    .toArray(String[]::new);
            System.out.println(Arrays.toString(terms2));
            System.out.println("--");
        }
        System.out.println(System.currentTimeMillis() - st);

    }

    @SneakyThrows
    @Test
    @Disabled
@@ -156,16 +122,7 @@ class SentenceExtractorTest {
                Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));

        var dict = new TermFrequencyDict(lm);
        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new KeywordMetadata()));


//
//        var pke = new PositionKeywordExtractor(dict, new KeywordExtractor());
//        pke.count(result).stream().map(wr -> wr.word).distinct().forEach(System.out::println);
//        for (var sent : result.sentences) {
//            System.out.println(sent);
//        }

        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result));
    }

    @Test
@@ -5,7 +5,6 @@ import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.crawl.EdgePageWordFlags;

import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Objects;
@@ -34,6 +33,9 @@ public final class KeywordMetadata {
        int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
        EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();

        if (tfidf > 100)
            flags.add(EdgePageWordFlags.TfIdfHigh);

        if (subjectKeywords.contains(stemmed))
            flags.add(EdgePageWordFlags.Subjects);
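Note: with this change KeywordMetadata itself decides the TfIdfHigh and Subjects flags per word, instead of DocumentKeywordExtractor passing extra bits into createWords. The flag set is ultimately folded into the long word metadata through asBit(); the sketch below illustrates that packing with a hypothetical stand-in enum, since EdgePageWordFlags' actual bit layout is not part of this diff.

import java.util.EnumSet;

// Hypothetical stand-in for EdgePageWordFlags, illustrating how an EnumSet of
// flags collapses into bits of the long word metadata via asBit()-style values.
enum WordFlagSketch {
    TfIdfHigh, Subjects, Synthetic;

    long asBit() { return 1L << ordinal(); }
}

class FlagPackingSketch {
    public static void main(String[] args) {
        EnumSet<WordFlagSketch> flags = EnumSet.of(WordFlagSketch.TfIdfHigh, WordFlagSketch.Subjects);

        long meta = 0;
        for (var flag : flags) meta |= flag.asBit();

        System.out.println(Long.toBinaryString(meta)); // 11: both flag bits set
    }
}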