diff --git a/code/crawl/converting-model/build.gradle b/code/crawl/converting-model/build.gradle
index 99ca5c95..24678acb 100644
--- a/code/crawl/converting-model/build.gradle
+++ b/code/crawl/converting-model/build.gradle
@@ -24,6 +24,7 @@ dependencies {
 
     implementation libs.notnull
     implementation libs.trove
+    implementation libs.fastutil
 
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
diff --git a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywords.java b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywords.java
index 8ea74606..61ff5e02 100644
--- a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywords.java
+++ b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywords.java
@@ -9,11 +9,6 @@ public record DocumentKeywords(
         String[] keywords,
         long[] metadata)
 {
-    DocumentKeywords(DocumentKeywordsBuilder words) {
-        this(words.words.toArray(String[]::new),
-             words.metadata.toArray());
-    }
-
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder();
diff --git a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywordsBuilder.java b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywordsBuilder.java
index a7eee4d7..fc8dcfea 100644
--- a/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywordsBuilder.java
+++ b/code/crawl/converting-model/src/main/java/nu/marginalia/converting/model/DocumentKeywordsBuilder.java
@@ -1,17 +1,15 @@
 package nu.marginalia.converting.model;
 
-import gnu.trove.list.array.TLongArrayList;
+import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
 import lombok.Getter;
 import lombok.ToString;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
 
 import java.util.*;
-import java.util.function.UnaryOperator;
 
 @ToString @Getter
 public class DocumentKeywordsBuilder {
-    public final ArrayList<String> words = new ArrayList<>();
-    public final TLongArrayList metadata = new TLongArrayList();
+    public final Object2LongLinkedOpenHashMap<String> words;
 
     // |------64 letters is this long-------------------------------|
     // granted, some of these words are word n-grams, but 64 ought to
@@ -23,59 +21,61 @@ public class DocumentKeywordsBuilder {
     }
 
     public DocumentKeywords build() {
-        return new DocumentKeywords(this);
+        final String[] wordArray = new String[words.size()];
+        final long[] meta = new long[words.size()];
+
+        var iter = words.object2LongEntrySet().fastIterator();
+
+        for (int i = 0; iter.hasNext(); i++) {
+            var entry = iter.next();
+
+            meta[i] = entry.getLongValue();
+            wordArray[i] = entry.getKey();
+        }
+
+        return new DocumentKeywords(wordArray, meta);
     }
 
     public DocumentKeywordsBuilder(int cacpacity) {
-        words.ensureCapacity(cacpacity);
-        metadata.ensureCapacity(cacpacity);
+        words = new Object2LongLinkedOpenHashMap<>(cacpacity);
     }
 
     public void add(String word, long meta) {
         if (word.length() > MAX_WORD_LENGTH)
             return;
 
-        words.add(word);
-        metadata.add(meta);
+        words.put(word, meta);
    }
 
    public void addJustNoMeta(String word) {
         if (word.length() > MAX_WORD_LENGTH)
             return;
 
-        words.add(word);
-        metadata.add(0);
+        words.putIfAbsent(word, 0);
    }
 
    public void setFlagOnMetadataForWords(EdgePageWordFlags flag, Set<String> flagWords) {
-        if (flagWords.isEmpty())
-            return;
-
-        for (int i = 0; i < words.size(); i++) {
-            if (flagWords.contains(words.get(i))) {
-                metadata.set(i, metadata.get(i) | flag.asBit());
-            }
-        }
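+        // mergeLong ORs the flag bit into the word's existing metadata,
+        // inserting the word with only the flag bit if it was not already present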
+        flagWords.forEach(word ->
+                words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
+        );
     }
 
     public void addAllSyntheticTerms(Collection<String> newWords) {
-        words.ensureCapacity(words.size() + newWords.size());
-        metadata.ensureCapacity(metadata.size() + newWords.size());
-
         long meta = EdgePageWordFlags.Synthetic.asBit();
 
-        for (var entry : newWords) {
-            words.add(entry);
-            metadata.add(meta);
-        }
+        newWords.forEach(word -> {
+            words.putIfAbsent(word, meta);
+        });
+
     }
 
     public List<String> getWordsWithAnyFlag(long flags) {
         List<String> ret = new ArrayList<>();
 
-        for (int i = 0; i < words.size(); i++) {
-            if ((metadata.get(i) & flags) > 0) {
-                ret.add(words.get(i));
+        for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) {
+            var entry = iter.next();
+            if ((flags & entry.getLongValue()) != 0) {
+                ret.add(entry.getKey());
             }
         }
@@ -86,8 +86,4 @@ public class DocumentKeywordsBuilder {
         return words.size();
     }
 
-    public void internalize(UnaryOperator<String> internalizer) {
-        words.replaceAll(internalizer);
-    }
-
 }
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
index c25d3440..c3c32130 100644
--- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
@@ -46,26 +46,15 @@ public class DomainProcessor {
 
             fixBadCanonicalTags(crawledDomain.doc);
 
-            StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size());
-
             for (var doc : crawledDomain.doc) {
                 var processedDoc = documentProcessor.process(doc, crawledDomain);
 
-                if (processedDoc.words != null) {
-                    // The word data is extremely redundant, and may encompass something like
-                    // 5,000,000 words per domain (and multiple domains are processed at the same time).
-
-                    processedDoc.words.internalize(stringPool::internalize);
-                }
-
                 if (processedDoc.url != null) {
                     ret.documents.add(processedDoc);
                 }
             }
 
-            stringPool.flush();
-
             documentDeduplicator.deduplicate(ret.documents);
 
             InternalLinkGraph internalLinkGraph = new InternalLinkGraph();
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java
index cc0bcb78..235d834b 100644
--- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/DocumentKeywordExtractor.java
@@ -1,19 +1,17 @@
 package nu.marginalia.converting.processor.keywords;
 
-import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
+import nu.marginalia.converting.processor.keywords.extractors.*;
 import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.encoding.AsciiFlattener;
 import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.KeywordMetadata;
 import nu.marginalia.language.model.WordRep;
-import nu.marginalia.model.crawl.EdgePageWordFlags;
 import nu.marginalia.converting.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.statistics.TermFrequencyDict;
 
 import javax.inject.Inject;
 import java.util.*;
-import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 public class DocumentKeywordExtractor {
@@ -22,165 +20,56 @@ public class DocumentKeywordExtractor {
 
     private final KeywordCounter tfIdfCounter;
     private final NameCounter nameCounter;
     private final SubjectCounter subjectCounter;
+    private final ArtifactKeywords artifactKeywords;
+
+    private final SimpleKeywords simpleKeywords;
+    private final DocumentKeywordPositionBitmaskExtractor keywordPositions;
 
     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
         keywordExtractor = new KeywordExtractor();
 
+        keywordPositions = new DocumentKeywordPositionBitmaskExtractor(keywordExtractor);
+        artifactKeywords = new ArtifactKeywords();
+
         tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
         nameCounter = new NameCounter(keywordExtractor);
         subjectCounter = new SubjectCounter(keywordExtractor);
+        simpleKeywords = new SimpleKeywords(keywordExtractor);
     }
 
-    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
+    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData documentLanguageData) {
+
+        KeywordMetadata keywordMetadata = keywordPositions.getWordPositions(documentLanguageData);
+
+        List<WordRep> wordsTfIdf = tfIdfCounter.updateWordStatistics(keywordMetadata, documentLanguageData);
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
-
-        getWordPositions(keywordMetadata, documentLanguageData);
-
-        List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
-
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
+        List<String> artifacts = artifactKeywords.getArtifactKeywords(documentLanguageData);
 
         for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
         for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
         for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
 
-        List<String> artifacts = getArtifacts(documentLanguageData);
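+        // The builder's backing map deduplicates words by itself, which is what
+        // the FilteringDocumentKeywordsBuilder wrapper (removed below) used to do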
+        DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder();
-        FilteringDocumentKeywordsBuilder wordsBuilder = new FilteringDocumentKeywordsBuilder();
+        simpleKeywords.getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);
 
-        createWords(wordsBuilder, keywordMetadata, titleWords, 0);
-        createWords(wordsBuilder, keywordMetadata, wordsTfIdf, EdgePageWordFlags.TfIdfHigh.asBit());
-        createWords(wordsBuilder, keywordMetadata, subjects, 0);
+        createWords(wordsBuilder, keywordMetadata, wordsTfIdf);
+        createWords(wordsBuilder, keywordMetadata, titleWords);
+        createWords(wordsBuilder, keywordMetadata, subjects);
 
-        getSimpleWords(wordsBuilder, keywordMetadata, documentLanguageData);
+        wordsBuilder.addAllSyntheticTerms(artifacts);
 
-        artifacts.forEach(wordsBuilder::addWithBlankMetadata);
-
-        return wordsBuilder.build();
+        return wordsBuilder;
     }
 
-    public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
-        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();
-
-        for (var sent : dld.titleSentences) {
-            int posBit = 1;
-
-            for (var word : sent) {
-                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
-            }
-
-            for (var span : keywordExtractor.getProperNames(sent)) {
-                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
-            }
-        }
-
-        int pos = 1;
-        int line = 0;
-        for (var sent : dld.sentences) {
-            int posBit = (int)((1L << pos) & 0xFFFF_FFFFL);
-
-            for (var word : sent) {
-                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
-            }
-
-            for (var span : keywordExtractor.getProperNames(sent)) {
-                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
-            }
-
-            if (pos < 4) pos ++;
-            else if (pos < 8) {
-                if (++line >= 2) {
-                    pos++;
-                    line = 0;
-                }
-            }
-            else if (pos < 24) {
-                if (++line >= 4) {
-                    pos++;
-                    line = 0;
-                }
-            }
-            else if (pos < 64) {
-                if (++line > 8) {
-                    pos++;
-                    line = 0;
-                }
-            }
-            else {
-                break;
-            }
-        }
-    }
-
-    private int bitwiseOr(int a, int b) {
-        return a | b;
-    }
-
-
-    private void getSimpleWords(FilteringDocumentKeywordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) {
-
-        EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
-
-        for (var sent : documentLanguageData.sentences) {
-
-            if (wordsBuilder.size() > 1500)
-                break;
-
-            for (var word : sent) {
-                if (!word.isStopWord()) {
-                    String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
-                    if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
-                        wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
-                    }
-                }
-            }
-
-            for (var names : keywordExtractor.getProperNames(sent)) {
-                var rep = new WordRep(sent, names);
-                String w = AsciiFlattener.flattenUnicode(rep.word);
-
-                wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
-            }
-        }
-
-    }
-
-    private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
-
-    private List<String> getArtifacts(DocumentLanguageData documentLanguageData) {
-        Set<String> reps = new HashSet<>();
-
-        for (var sent : documentLanguageData.sentences) {
-            for (var word : sent) {
-                String lc = word.wordLowerCase();
-                if (lc.length() > 6
-                        && lc.indexOf('@') > 0
-                        && mailLikePattern.matcher(lc).matches()) {
-
-                    reps.add(lc);
-
-                    String domain = lc.substring(lc.indexOf('@'));
-                    String user = lc.substring(0, lc.indexOf('@'));
-
-                    if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
-                        reps.add(domain);
-                    }
-                    if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
-                        reps.add(user);
-                    }
-
-                }
-            }
-        }
-        return new ArrayList<>(reps);
-    }
-
     private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
         return Arrays.stream(documentLanguageData.titleSentences).flatMap(sent ->
                 keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w)))
@@ -188,43 +77,18 @@ public class DocumentKeywordExtractor {
                 .collect(Collectors.toList());
     }
 
-    public void createWords(FilteringDocumentKeywordsBuilder wordsBuilder,
+    public void createWords(DocumentKeywordsBuilder wordsBuilder,
                             KeywordMetadata metadata,
-                            Collection<WordRep> words,
-                            long additionalMeta) {
+                            Collection<WordRep> words) {
 
         for (var word : words) {
 
             String flatWord = AsciiFlattener.flattenUnicode(word.word);
-            if (!WordPatterns.hasWordQualities(flatWord)) {
-                continue;
-            }
 
-            wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
+            if (WordPatterns.hasWordQualities(flatWord)) {
+                wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed));
+            }
         }
     }
 
-    private static class FilteringDocumentKeywordsBuilder {
-        private final DocumentKeywordsBuilder words = new DocumentKeywordsBuilder();
-        private final Set<String> seen = new HashSet<>(1600);
-
-        public void add(String word, long meta) {
-            if (seen.add(word)) {
-                words.add(word, meta);
-            }
-        }
-        public void addWithBlankMetadata(String word) {
-            if (seen.add(word)) {
-                words.addJustNoMeta(word);
-            }
-        }
-
-        public DocumentKeywordsBuilder build() {
-            return words;
-        }
-
-        public int size() {
-            return seen.size();
-        }
-    }
 }
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/ArtifactKeywords.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/ArtifactKeywords.java
new file mode 100644
index 00000000..4687dc2f
--- /dev/null
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/ArtifactKeywords.java
@@ -0,0 +1,45 @@
+package nu.marginalia.converting.processor.keywords.extractors;
+
+import nu.marginalia.language.model.DocumentLanguageData;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+public class ArtifactKeywords {
+
+    private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
+
+    public List<String> getArtifactKeywords(DocumentLanguageData documentLanguageData) {
+        Set<String> reps = new HashSet<>();
+
+        for (var sent : documentLanguageData.sentences) {
+            for (var word : sent) {
+                String lc = word.wordLowerCase();
+                if (lc.length() < 6
+                        || lc.indexOf('@') < 0
+                        || !mailLikePattern.matcher(lc).matches()) {
+                    continue;
+                }
+
+                reps.add(lc);
+
+                String domain = lc.substring(lc.indexOf('@'));
+                String user = lc.substring(0, lc.indexOf('@'));
+
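+                // Index the user and domain parts as separate keywords,
+                // unless they are too generic to be useful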
+                if (!domain.equals("@hotmail.com") && !domain.equals("@gmail.com") && !domain.equals("@paypal.com")) {
+                    reps.add(domain);
+                }
+                if (!user.equals("info") && !user.equals("legal") && !user.equals("contact") && !user.equals("donotreply")) {
+                    reps.add(user);
+                }
+
+            }
+        }
+
+        return new ArrayList<>(reps);
+    }
+
+}
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/DocumentKeywordPositionBitmaskExtractor.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/DocumentKeywordPositionBitmaskExtractor.java
new file mode 100644
index 00000000..79fe341c
--- /dev/null
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/DocumentKeywordPositionBitmaskExtractor.java
@@ -0,0 +1,90 @@
+package nu.marginalia.converting.processor.keywords.extractors;
+
+import com.google.inject.Inject;
+import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
+import nu.marginalia.language.keywords.KeywordExtractor;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.KeywordMetadata;
+
+/** Generates a position bitmask for each word in a document */
+public class DocumentKeywordPositionBitmaskExtractor {
+    private final KeywordExtractor keywordExtractor;
+
+    @Inject
+    public DocumentKeywordPositionBitmaskExtractor(KeywordExtractor keywordExtractor) {
+        this.keywordExtractor = keywordExtractor;
+    }
+
+    public KeywordMetadata getWordPositions(DocumentLanguageData dld) {
+        final KeywordMetadata keywordMetadata = new KeywordMetadata();
+
+        Object2IntOpenHashMap<String> ret = keywordMetadata.positionMask();
+
+        // Mark the title words as position 0
+        for (var sent : dld.titleSentences) {
+            int posBit = 1;
+
+            for (var word : sent) {
+                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
+            }
+
+            for (var span : keywordExtractor.getProperNames(sent)) {
+                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
+            }
+        }
+
+        // Mark subsequent sentences in subsequent positions, with increasing sentence step size
+        LinePosition linePos = new LinePosition();
+        for (var sent : dld.sentences) {
+
+            int posBit = (int)((1L << linePos.pos()) & 0xFFFF_FFFFL);
+
+            for (var word : sent) {
+                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
+            }
+
+            for (var span : keywordExtractor.getProperNames(sent)) {
+                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
+            }
+
+            linePos.next();
+        }
+
+        return keywordMetadata;
+    }
+
+    private int bitwiseOr(int a, int b) {
+        return a | b;
+    }
+
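+    /** Tracks which bit position a sentence maps to; advances one bit per
+     *  sentence at first, then one bit per 2, 4 and finally 9 sentences,
+     *  so positions early in the document are resolved more finely. */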
+    private static class LinePosition {
+        private int line = 0;
+        private int pos = 1;
+
+        public int pos() {
+            return pos;
+        }
+
+        public void next() {
+            if (pos < 4) pos ++;
+            else if (pos < 8) {
+                if (++line >= 2) {
+                    pos++;
+                    line = 0;
+                }
+            }
+            else if (pos < 24) {
+                if (++line >= 4) {
+                    pos++;
+                    line = 0;
+                }
+            }
+            else if (pos < 64) {
+                if (++line > 8) {
+                    pos++;
+                    line = 0;
+                }
+            }
+        }
+    }
+}
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/KeywordCounter.java
similarity index 93%
rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java
rename to code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/KeywordCounter.java
index 91846a9b..16faa7f0 100644
--- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/KeywordCounter.java
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/KeywordCounter.java
@@ -1,10 +1,9 @@
-package nu.marginalia.converting.processor.keywords;
+package nu.marginalia.converting.processor.keywords.extractors;
 
 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
 import nu.marginalia.language.WordPatterns;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.KeywordMetadata;
-import nu.marginalia.language.model.WordFrequencyData;
 import nu.marginalia.language.model.WordRep;
 import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.statistics.TermFrequencyDict;
@@ -28,7 +27,7 @@ public class KeywordCounter {
         this.docCount = dict.docCount();
     }
 
-    public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
+    public List<WordRep> updateWordStatistics(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
         Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f);
         HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/NameCounter.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/NameCounter.java
similarity index 96%
rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/NameCounter.java
rename to code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/NameCounter.java
index 22ce88ed..b09c27c0 100644
--- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/NameCounter.java
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/NameCounter.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.keywords;
+package nu.marginalia.converting.processor.keywords.extractors;
 
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SimpleKeywords.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SimpleKeywords.java
new file mode 100644
index 00000000..92658991
--- /dev/null
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SimpleKeywords.java
@@ -0,0 +1,54 @@
+package nu.marginalia.converting.processor.keywords.extractors;
+
+import nu.marginalia.converting.model.DocumentKeywordsBuilder;
+import nu.marginalia.language.WordPatterns;
+import nu.marginalia.language.encoding.AsciiFlattener;
+import nu.marginalia.language.keywords.KeywordExtractor;
+import nu.marginalia.language.model.DocumentLanguageData;
+import nu.marginalia.language.model.KeywordMetadata;
+import nu.marginalia.language.model.WordRep;
+import nu.marginalia.model.crawl.EdgePageWordFlags;
+
+import java.util.EnumSet;
+
+public class SimpleKeywords {
+    private final KeywordExtractor keywordExtractor;
+
+    public SimpleKeywords(KeywordExtractor keywordExtractor) {
+        this.keywordExtractor = keywordExtractor;
+    }
+
+    public void getSimpleWords(DocumentKeywordsBuilder wordsBuilder,
+                               KeywordMetadata metadata,
+                               DocumentLanguageData documentLanguageData) {
+
+        EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
+
+        for (var sent : documentLanguageData.sentences) {
+
+            if (wordsBuilder.size() > 1500)
+                break;
+
+            for (var word : sent) {
+                if (word.isStopWord()) {
+                    continue;
+                }
+
+                String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
+                if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
+                    wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
+                }
+            }
+
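+            // Also add proper name spans (e.g. multi-word names) from the sentence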
+            for (var names : keywordExtractor.getProperNames(sent)) {
+                var rep = new WordRep(sent, names);
+                String w = AsciiFlattener.flattenUnicode(rep.word);
+
+                wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
+            }
+        }
+
+    }
+
+
+}
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SubjectCounter.java
similarity index 98%
rename from code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java
rename to code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SubjectCounter.java
index cb77d526..c3deea28 100644
--- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/SubjectCounter.java
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/keywords/extractors/SubjectCounter.java
@@ -1,4 +1,4 @@
-package nu.marginalia.converting.processor.keywords;
+package nu.marginalia.converting.processor.keywords.extractors;
 
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.KeywordMetadata;
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index 344ec72f..a685a249 100644
--- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -115,8 +115,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }
 
-        KeywordMetadata keywordMetadata = new KeywordMetadata();
-
         ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
         ret.description = getDescription(doc);
         ret.hashCode = dld.localitySensitiveHashCode();
@@ -124,7 +122,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         PubDate pubDate = pubDateSniffer.getPubDate(crawledDocument.headers, url, doc, ret.standard, true);
         ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.noneOf(EdgePageDocumentFlags.class));
 
-        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
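+        // KeywordMetadata is now created inside the extractor, by the position bitmask pass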
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
 
         new MetaTagsBuilder()
                 .addDomainCrawlData(crawledDomain)
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
index ff2c0bb1..7ea1bc22 100644
--- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
@@ -91,8 +91,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 
         ret.metadata = new DocumentMetadata(url.depth(), pubDate.yearByte(), 0, (int) -ret.quality, EnumSet.of(EdgePageDocumentFlags.PlainText));
 
-        KeywordMetadata keywordMetadata = new KeywordMetadata();
-        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, keywordMetadata);
+        DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld);
 
         new MetaTagsBuilder()
                 .addDomainCrawlData(crawledDomain)
diff --git a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/tool/DocumentDebugger.java b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/tool/DocumentDebugger.java
index 0d2fb906..ae1a4ef3 100644
--- a/code/crawl/converting-process/src/main/java/nu/marginalia/converting/tool/DocumentDebugger.java
+++ b/code/crawl/converting-process/src/main/java/nu/marginalia/converting/tool/DocumentDebugger.java
@@ -1,8 +1,8 @@
 package nu.marginalia.converting.tool;
 
 import nu.marginalia.LanguageModels;
-import nu.marginalia.converting.processor.keywords.KeywordCounter;
-import nu.marginalia.converting.processor.keywords.NameCounter;
+import nu.marginalia.converting.processor.keywords.extractors.KeywordCounter;
+import nu.marginalia.converting.processor.keywords.extractors.NameCounter;
 import nu.marginalia.language.keywords.KeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.language.model.DocumentSentence;
diff --git a/code/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java b/code/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java
index fe6299e7..6777da5f 100644
--- a/code/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java
+++ b/code/crawl/converting-process/src/test/java/nu/marginalia/converting/processor/keywords/SentenceExtractorTest.java
@@ -60,7 +60,7 @@ class SentenceExtractorTest {
             var doc = Jsoup.parse(Files.readString(file.toPath()));
             long start = System.currentTimeMillis();
             var dld = se.extractSentences(doc);
-            documentKeywordExtractor.extractKeywords(dld, new KeywordMetadata());
+            documentKeywordExtractor.extractKeywords(dld);
             total += (System.currentTimeMillis() - start);
         }
         System.out.println(total);
@@ -114,40 +114,6 @@ class SentenceExtractorTest {
         System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
     }
 
-    @Test
-    void extractSentences() throws IOException {
-        var data = WmsaHome.getHomePath().resolve("test-data/");
-
-        System.out.println("Running");
-
-        var dict = new TermFrequencyDict(lm);
-
-        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
-        long st = System.currentTimeMillis();
-        for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
-            var newResult = newSe.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
-            var newRes = documentKeywordExtractor.extractKeywords(newResult, new KeywordMetadata());
-
-            var terms = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
-                    .sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
-                    .limit(100)
-                    .map(Pair::getKey)
-                    .toArray(String[]::new);
-            System.out.println(Arrays.toString(terms));
-
-            var terms2 = IntStream.range(0, newRes.size()).mapToObj(i -> Pair.of(newRes.words.get(i), new WordMetadata(newRes.metadata.get(i))))
-                    .sorted(Comparator.comparing(e -> -e.getValue().tfIdf()))
-                    .filter(e -> e.getValue().hasFlag(EdgePageWordFlags.Subjects))
-                    .limit(100)
-                    .map(Pair::getKey)
-                    .toArray(String[]::new);
-            System.out.println(Arrays.toString(terms2));
-            System.out.println("--");
-        }
-        System.out.println(System.currentTimeMillis() - st);
-
-    }
-
     @SneakyThrows
     @Test
     @Disabled
@@ -156,16 +122,7 @@ class SentenceExtractorTest {
                 Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
 
         var dict = new TermFrequencyDict(lm);
-        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new KeywordMetadata()));
-
-
-//
-//        var pke = new PositionKeywordExtractor(dict, new KeywordExtractor());
-//        pke.count(result).stream().map(wr -> wr.word).distinct().forEach(System.out::println);
-//        for (var sent : result.sentences) {
-//            System.out.println(sent);
-//        }
-
+        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result));
     }
 
     @Test
diff --git a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java
index 14ef7268..69d60efa 100644
--- a/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java
+++ b/code/libraries/language-processing/src/main/java/nu/marginalia/language/model/KeywordMetadata.java
@@ -5,7 +5,6 @@ import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.model.crawl.EdgePageWordFlags;
 
 import java.util.EnumSet;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Objects;
 
@@ -34,6 +33,9 @@ public final class KeywordMetadata {
         int tfidf = wordsTfIdf.getOrDefault(stemmed, 0);
         EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
 
+        if (tfidf > 100)
+            flags.add(EdgePageWordFlags.TfIdfHigh);
+
         if (subjectKeywords.contains(stemmed))
             flags.add(EdgePageWordFlags.Subjects);