Refactor DocumentKeyword-related classes

Viktor Lofgren 2023-03-09 20:41:38 +01:00
parent efb46cc703
commit 2bc212d65c
16 changed files with 259 additions and 270 deletions

View File

@@ -24,6 +24,7 @@ dependencies {
implementation libs.notnull
implementation libs.trove
implementation libs.fastutil
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit

View File

@@ -9,11 +9,6 @@ public record DocumentKeywords(
String[] keywords,
long[] metadata) {
DocumentKeywords(DocumentKeywordsBuilder words) {
this(words.words.toArray(String[]::new),
words.metadata.toArray());
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();

View File

@@ -1,17 +1,15 @@
package nu.marginalia.converting.model;
import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import java.util.*;
import java.util.function.UnaryOperator;
@ToString @Getter
public class DocumentKeywordsBuilder {
public final ArrayList<String> words = new ArrayList<>();
public final TLongArrayList metadata = new TLongArrayList();
public final Object2LongLinkedOpenHashMap<String> words;
// |------64 letters is this long-------------------------------|
// granted, some of these words are word n-grams, but 64 ought to
@@ -23,59 +21,61 @@ public class DocumentKeywordsBuilder {
}
public DocumentKeywords build() {
return new DocumentKeywords(this);
final String[] wordArray = new String[words.size()];
final long[] meta = new long[words.size()];
var iter = words.object2LongEntrySet().fastIterator();
for (int i = 0; iter.hasNext(); i++) {
var entry = iter.next();
meta[i] = entry.getLongValue();
wordArray[i] = entry.getKey();
}
return new DocumentKeywords(wordArray, meta);
}
public DocumentKeywordsBuilder(int capacity) {
words.ensureCapacity(capacity);
metadata.ensureCapacity(capacity);
words = new Object2LongLinkedOpenHashMap<>(capacity);
}
public void add(String word, long meta) {
if (word.length() > MAX_WORD_LENGTH)
return;
words.add(word);
metadata.add(meta);
words.put(word, meta);
}
public void addJustNoMeta(String word) {
if (word.length() > MAX_WORD_LENGTH)
return;
words.add(word);
metadata.add(0);
words.putIfAbsent(word, 0);
}
public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
if (flagWords.isEmpty())
return;
for (int i = 0; i < words.size(); i++) {
if (flagWords.contains(words.get(i))) {
metadata.set(i, metadata.get(i) | flag.asBit());
}
}
flagWords.forEach(word ->
words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
);
}
public void addAllSyntheticTerms(Collection<String> newWords) {
words.ensureCapacity(words.size() + newWords.size());
metadata.ensureCapacity(metadata.size() + newWords.size());
long meta = EdgePageWordFlags.Synthetic.asBit();
for (var entry : newWords) {
words.add(entry);
metadata.add(meta);
}
newWords.forEach(word -> {
words.putIfAbsent(word, meta);
});
}
public List<String> getWordsWithAnyFlag(long flags) {
List<String> ret = new ArrayList<>();
for (int i = 0; i < words.size(); i++) {
if ((metadata.get(i) & flags) > 0) {
ret.add(words.get(i));
for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) {
var entry = iter.next();
if ((flags & entry.getLongValue()) != 0) {
ret.add(entry.getKey());
}
}
@@ -86,8 +86,4 @@ public class DocumentKeywordsBuilder {
return words.size();
}
public void internalize(UnaryOperator<String> internalizer) {
words.replaceAll(internalizer);
}
}
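
A minimal usage sketch of the builder after this refactor, assuming only the methods and flag names visible in the diff (the sample words and capacity are invented):

import java.util.List;
import java.util.Set;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.model.crawl.EdgePageWordFlags;

class DocumentKeywordsBuilderSketch {
    static void demo() {
        // One insertion-ordered word -> metadata map replaces the old parallel word/metadata lists
        DocumentKeywordsBuilder builder = new DocumentKeywordsBuilder(16);

        builder.add("marginalia", 0L);                        // explicit metadata bits
        builder.addJustNoMeta("search");                      // blank metadata, kept only if absent
        builder.addAllSyntheticTerms(List.of("sample-term")); // tagged with EdgePageWordFlags.Synthetic

        // OR:s the Subjects bit into the stored metadata via mergeLong
        builder.setFlagOnMetadataForWords(EdgePageWordFlags.Subjects, Set.of("marginalia"));

        // build() walks the map once and snapshots it into parallel String[]/long[] arrays
        var keywords = builder.build();
        System.out.println(builder.getWordsWithAnyFlag(EdgePageWordFlags.Subjects.asBit()));
    }
}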

View File

@@ -46,26 +46,15 @@ public class DomainProcessor {
fixBadCanonicalTags(crawledDomain.doc);
StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size());
for (var doc : crawledDomain.doc) {
var processedDoc = documentProcessor.process(doc, crawledDomain);
if (processedDoc.words != null) {
// The word data is extremely redundant, and may encompass something like
// 5,000,000 words per domain (and multiple domains are processed at the same time).
processedDoc.words.internalize(stringPool::internalize);
}
if (processedDoc.url != null) {
ret.documents.add(processedDoc);
}
}
stringPool.flush();
documentDeduplicator.deduplicate(ret.documents);
InternalLinkGraph internalLinkGraph = new InternalLinkGraph();

View File

@@ -1,19 +1,17 @@
package nu.marginalia.converting.processor.keywords;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.converting.processor.keywords.extractors.*;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.statistics.TermFrequencyDict;
import javax.inject.Inject;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class DocumentKeywordExtractor {
@@ -22,165 +20,56 @@ public class DocumentKeywordExtractor {
private final KeywordCounter tfIdfCounter;
private final NameCounter nameCounter;
private final SubjectCounter subjectCounter;
private final ArtifactKeywords artifactKeywords;
private final SimpleKeywords simpleKeywords;
private final DocumentKeywordPositionBitmaskExtractor keywordPositions;
@Inject
public DocumentKeywordExtractor(TermFrequencyDict dict) {
keywordExtractor = new KeywordExtractor();
keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
artifactKeywords = new ArtifactKeywords();
tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
nameCounter = new NameCounter(keywordExtractor);
subjectCounter = new SubjectCounter(keywordExtractor);
simpleKeywords = new SimpleKeywords(keywordExtractor);
}
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData) {
KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);
List<WordRep> wordsTfIdf = tfIdfCounter.updateWordStatistics(keywordMetadata, documentLanguageData);
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
getWordPositions(keywordMetadata, documentLanguageData);
List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
List<String> artifacts = getArtifacts(documentLanguageData);
DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder();
simpleKeywords.getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);
createWords(wordsBuilder, keywordMetadata, titleWords, 0);
createWords(wordsBuilder, keywordMetadata, wordsTfIdf, EdgePageWordFlags.TfIdfHigh.asBit());
createWords(wordsBuilder, keywordMetadata, subjects, 0);
createWords(wordsBuilder, keywordMetadata, wordsTfIdf);
createWords(wordsBuilder, keywordMetadata, titleWords);
createWords(wordsBuilder, keywordMetadata, subjects);
getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);
wordsBuilder.addAllSyntheticTerms(artifacts);
artifacts.forEach(wordsBuilder::addWithBlankMetadata);
return wordsBuilder.build();
return wordsBuilder;
}
public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();
for (var sent : dld.titleSentences) {
int posBit = 1;
for (var word : sent) {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
}
int pos = 1;
int line = 0;
for (var sent : dld.sentences) {
int posBit = (int)((1L << pos) & 0xFFFF_FFFFL);
for (var word : sent) {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
if (pos < 4) pos ++;
else if (pos < 8) {
if (++line >= 2) {
pos++;
line = 0;
}
}
else if (pos < 24) {
if (++line >= 4) {
pos++;
line = 0;
}
}
else if (pos < 64) {
if (++line > 8) {
pos++;
line = 0;
}
}
else {
break;
}
}
}
private int bitwiseOr(int a, int b) {
return a | b;
}
private void getSimpleWords(FilteringDocumentKeywordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {
EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
for (var sent : documentLanguageData.sentences) {
if (wordsBuilder.size() > 1500)
break;
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
}
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
}
}
}
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
private List<String> getArtifacts(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();
for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
if (lc.length() > 6
&& lc.indexOf('@') > 0
&& mailLikePattern.matcher(lc).matches()) {
reps.add(lc);
String domain = lc.substring(lc.indexOf('@'));
String user = lc.substring(0, lc.indexOf('@'));
if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
reps.add(domain);
}
if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
reps.add(user);
}
}
}
}
return new ArrayList<>(reps);
}
private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
@@ -188,43 +77,18 @@ public class DocumentKeywordExtractor {
.collect(Collectors.toList());
}
public void createWords(FilteringDocumentKeywordsBuilder wordsBuilder,
public void createWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
Collection<WordRep> words,
long additionalMeta) {
Collection<WordRep> words) {
for (var word : words) {
String flatWord = AsciiFlattener.flattenUnicode(word.word);
if (!WordPatterns.hasWordQualities(flatWord)) {
continue;
}
wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
if (WordPatterns.hasWordQualities(flatWord)) {
wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed));
}
}
}
private static class FilteringDocumentKeywordsBuilder {
private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder();
private final Set<String> seen = new HashSet<>(1600);
public void add(String word, long meta) {
if (seen.add(word)) {
words.add(word, meta);
}
}
public void addWithBlankMetadata(String word) {
if (seen.add(word)) {
words.addJustNoMeta(word);
}
}
public DocumentKeywordsBuilder build() {
return words;
}
public int size() {
return seen.size();
}
}
}
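
Since KeywordMetadata is now produced inside the extractor, callers only hand over the DocumentLanguageData. A sketch of the new call shape, modelled on the SentenceExtractorTest changes later in this commit (the LanguageModels and SentenceExtractor arguments are assumed to be set up as in that test):

import org.jsoup.Jsoup;
import nu.marginalia.LanguageModels;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.converting.processor.keywords.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;

class ExtractKeywordsSketch {
    static DocumentKeywordsBuilder extract(LanguageModels lm, SentenceExtractor se, String html) {
        var extractor = new DocumentKeywordExtractor(new TermFrequencyDict(lm));

        var dld = se.extractSentences(Jsoup.parse(html));

        // No KeywordMetadata parameter anymore; position masks and term statistics are built internally
        return extractor.extractKeywords(dld);
    }
}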

View File

@@ -0,0 +1,45 @@
package nu.marginalia.converting.processor.keywords.extractors;
import nu.marginalia.language.model.DocumentLanguageData;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
public class ArtifactKeywords {
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
public List<String> getArtifactKeywords(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();
for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
if (lc.length() < 6
|| lc.indexOf('@') < 0
|| !mailLikePattern.matcher(lc).matches()) {
continue;
}
reps.add(lc);
String domain = lc.substring(lc.indexOf('@'));
String user = lc.substring(0, lc.indexOf('@'));
if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
reps.add(domain);
}
if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
reps.add(user);
}
}
}
return new ArrayList<>(reps);
}
}
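
A standalone check of the mail-like pattern's behavior (the regex is the one above; the sample inputs are invented):

import java.util.regex.Pattern;

class MailLikePatternSketch {
    private static final Pattern mailLikePattern =
            Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");

    public static void main(String[] args) {
        System.out.println(mailLikePattern.matcher("alice@example.com").matches()); // true
        System.out.println(mailLikePattern.matcher("not an address").matches());    // false

        // For a match, the extractor above keeps the full address, and also "@example.com"
        // and "alice" unless they are caught by the freemail/generic-user filters.
    }
}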

View File

@@ -0,0 +1,90 @@
package nu.marginalia.converting.processor.keywords.extractors;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
/** Generates a position bitmask for each word in a document */
public class DocumentKeywordPositionBitmaskExtractor {
private final KeywordExtractor keywordExtractor;
@Inject
public DocumentKeywordPositionBitmaskExtractor(KeywordExtractor keywordExtractor) {
this.keywordExtractor = keywordExtractor;
}
public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
final KeywordMetadata keywordMetadata = new KeywordMetadata();
Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();
// Mark the title words as position 0
for (var sent : dld.titleSentences) {
int posBit = 1;
for (var word : sent) {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
}
// Mark subsequent sentences in subsequent positions, with increasing sentence step size
LinePosition linePos = new LinePosition();
for (var sent : dld.sentences) {
int posBit = (int)((1L << linePos.pos()) & 0xFFFF_FFFFL);
for (var word : sent) {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
linePos.next();
}
return keywordMetadata;
}
private int bitwiseOr(int a, int b) {
return a | b;
}
private static class LinePosition {
private int line = 0;
private int pos = 1;
public int pos() {
return pos;
}
public void next() {
if (pos < 4) pos ++;
else if (pos < 8) {
if (++line >= 2) {
pos++;
line = 0;
}
}
else if (pos < 24) {
if (++line >= 4) {
pos++;
line = 0;
}
}
else if (pos < 64) {
if (++line > 8) {
pos++;
line = 0;
}
}
}
}
}
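
To see how the stepping assigns sentences to bit positions, here is a standalone trace with the next() logic copied verbatim from LinePosition (the 20-sentence cap is arbitrary). The first few sentences each get their own bit; later sentences share a bit in progressively larger groups, and the 32-bit mask drops position information for very late sentences.

class LinePositionTrace {
    public static void main(String[] args) {
        int pos = 1, line = 0;
        for (int sentence = 1; sentence <= 20; sentence++) {
            long posBit = (1L << pos) & 0xFFFF_FFFFL;
            System.out.println("sentence " + sentence + " -> position bit " + pos + " (mask " + posBit + ")");

            // next(), copied from DocumentKeywordPositionBitmaskExtractor.LinePosition
            if (pos < 4) pos++;
            else if (pos < 8)  { if (++line >= 2) { pos++; line = 0; } }
            else if (pos < 24) { if (++line >= 4) { pos++; line = 0; } }
            else if (pos < 64) { if (++line > 8)  { pos++; line = 0; } }
        }
    }
}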

View File

@@ -1,10 +1,9 @@
package nu.marginalia.converting.processor.keywords;
package nu.marginalia.converting.processor.keywords.extractors;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordFrequencyData;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.statistics.TermFrequencyDict;
@@ -28,7 +27,7 @@ public class KeywordCounter {
this.docCount = dict.docCount();
}
public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
public List<WordRep> updateWordStatistics(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.keywords;
package nu.marginalia.converting.processor.keywords.extractors;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;

View File

@@ -0,0 +1,54 @@
package nu.marginalia.converting.processor.keywords.extractors;
import nu.marginalia.converting.model.DocumentKeywordsBuilder;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.language.encoding.AsciiFlattener;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;
import nu.marginalia.language.model.WordRep;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import java.util.EnumSet;
public class SimpleKeywords {
private final KeywordExtractor keywordExtractor;
public SimpleKeywords(KeywordExtractor keywordExtractor) {
this.keywordExtractor = keywordExtractor;
}
public void getSimpleWords(DocumentKeywordsBuilder wordsBuilder,
KeywordMetadata metadata,
DocumentLanguageData documentLanguageData) {
EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
for (var sent : documentLanguageData.sentences) {
if (wordsBuilder.size() > 1500)
break;
for (var word : sent) {
if (word.isStopWord()) {
continue;
}
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
}
}
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
}
}
}
}

View File

@@ -1,4 +1,4 @@
package nu.marginalia.converting.processor.keywords;
package nu.marginalia.converting.processor.keywords.extractors;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.KeywordMetadata;

View File

@@ -115,8 +115,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
KeywordMetadata keywordMetadata = new KeywordMetadata();
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
ret.description = getDescription(doc);
ret.hashCode = dld.localitySensitiveHashCode();
@@ -124,7 +122,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
new MetaTagsBuilder()
.addDomainCrawlData(crawledDomain)

View File

@@ -91,8 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
KeywordMetadata keywordMetadata = new KeywordMetadata();
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
new MetaTagsBuilder()
.addDomainCrawlData(crawledDomain)

View File

@@ -1,8 +1,8 @@
package nu.marginalia.converting.tool;
import nu.marginalia.LanguageModels;
import nu.marginalia.converting.processor.keywords.KeywordCounter;
import nu.marginalia.converting.processor.keywords.NameCounter;
import nu.marginalia.converting.processor.keywords.extractors.KeywordCounter;
import nu.marginalia.converting.processor.keywords.extractors.NameCounter;
import nu.marginalia.language.keywords.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.language.model.DocumentSentence;

View File

@@ -60,7 +60,7 @@ class SentenceExtractorTest {
var doc = Jsoup.parse(Files.readString(file.toPath()));
long start = System.currentTimeMillis();
var dld = se.extractSentences(doc);
documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata());
documentKeywordExtractor.extractKeywords(dld);
total += (System.currentTimeMillis() - start);
}
System.out.println(total);
@@ -114,40 +114,6 @@ class SentenceExtractorTest {
System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
}
@Test
void extractSentences() throws IOException {
var data = WmsaHome.getHomePath().resolve("test-data/");
System.out.println("Running");
var dict = new TermFrequencyDict(lm);
DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
long st = System.currentTimeMillis();
for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata());
var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
.sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
.limit(100)
.map(Pair::getKey)
.toArray(String[]::new);
System.out.println(Arrays.toString(terms));
var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
.sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
.filter(e -> e.getValue().hasFlag(EdgePageWordFlags.Subjects))
.limit(100)
.map(Pair::getKey)
.toArray(String[]::new);
System.out.println(Arrays.toString(terms2));
System.out.println("--");
}
System.out.println(System.currentTimeMillis() - st);
}
@SneakyThrows
@Test
@Disabled
@@ -156,16 +122,7 @@ class SentenceExtractorTest {
Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
var dict = new TermFrequencyDict(lm);
System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new KeywordMetadata()));
//
// var pke = new PositionKeywordExtractor(dict, new KeywordExtractor());
// pke.count(result).stream().map(wr -> wr.word).distinct().forEach(System.out::println);
// for (var sent : result.sentences) {
// System.out.println(sent);
// }
System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result));
}
@Test

View File

@@ -5,7 +5,6 @@ import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Objects;
@@ -34,6 +33,9 @@ public final class KeywordMetadata {
int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
if (tfidf > 100)
flags.add(EdgePageWordFlags.TfIdfHigh);
if (subjectKeywords.contains(stemmed))
flags.add(EdgePageWordFlags.Subjects);