diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index aaad9800..84395d0f 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -1,6 +1,6 @@ package nu.marginalia.keyword; -import nu.marginalia.segmentation.NgramLexicon; +import nu.marginalia.WmsaHome; import nu.marginalia.keyword.extractors.*; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.language.model.DocumentLanguageData; @@ -9,27 +9,32 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.model.EdgeUrl; import com.google.inject.Inject; + import java.util.*; import java.util.stream.Stream; + public class DocumentKeywordExtractor { private final KeywordExtractor keywordExtractor; private final TermFrequencyDict dict; - private final NgramLexicon ngramLexicon; @Inject - public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) { + public DocumentKeywordExtractor(TermFrequencyDict dict) { this.dict = dict; - this.ngramLexicon = ngramLexicon; + this.keywordExtractor = new KeywordExtractor(); + } + + // for tests + public DocumentKeywordExtractor() { + this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels()); this.keywordExtractor = new KeywordExtractor(); } public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, EdgeUrl url) { - var bitmask = new KeywordPositionBitmask(keywordExtractor, dld); var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld); var titleKeywords = new TitleKeywords(keywordExtractor, dld); @@ -39,7 +44,6 @@ public class DocumentKeywordExtractor { var urlKeywords = new UrlKeywords(url); var keywordMetadata = KeywordMetadata.builder() - .bitmask(bitmask) .tfIdfCounts(tfIdfCounts) .titleKeywords(titleKeywords) .nameLikeKeywords(nameLikeKeywords) @@ -51,14 +55,14 @@ public class DocumentKeywordExtractor { createSimpleWords(wordsBuilder, keywordMetadata, dld); - createWordsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts); - createWordsFromSet(wordsBuilder, keywordMetadata, titleKeywords); - createWordsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords); - createWordsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords); + createNGramTermsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts); + createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords); + createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords); + createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords); var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder); - wordsBuilder.addImportantWords(importantWords); + wordsBuilder.addImportantWords(importantWords); wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords()); return wordsBuilder; @@ -77,36 +81,30 @@ public class DocumentKeywordExtractor { .sorted(tfIdfCounts.reversed()) .limit(16) .filter(w -> tfIdfCounts.termFrequencyDictValue(w) > 100) - .sorted(Comparator.comparing(w -> tfIdfCounts.termFrequencyDictValue(w))) + .sorted(Comparator.comparing(tfIdfCounts::termFrequencyDictValue)) .limit(6) .map(w -> w.word) .toList(); } - private void createWordsFromSet(DocumentKeywordsBuilder wordsBuilder, - KeywordMetadata metadata, - WordReps words) { - + private void createNGramTermsFromSet(DocumentKeywordsBuilder wordsBuilder, + KeywordMetadata metadata, + WordReps words) { for (var rep : words.getReps()) { - var word = rep.word; if (!word.isBlank()) { long meta = metadata.getMetadataForWord(rep.stemmed); - - assert meta != 0L : "Missing meta for " + rep.word; - - wordsBuilder.add(word, meta); + wordsBuilder.addMeta(word, meta); } } } - - private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder, KeywordMetadata metadata, DocumentLanguageData documentLanguageData) { + int pos = 0; for (var sent : documentLanguageData.sentences) { if (wordsBuilder.size() > 1500) @@ -119,10 +117,11 @@ public class DocumentKeywordExtractor { String w = word.wordLowerCase(); if (matchesWordPattern(w)) { - long meta = metadata.getMetadataForWord(word.stemmed()); - assert meta != 0L : "Missing meta for " + word.word(); + /* Add information about term positions */ + wordsBuilder.addPos(word.wordLowerCase(), pos++); - wordsBuilder.add(w, meta); + /* Add metadata for word */ + wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); } } @@ -130,9 +129,8 @@ public class DocumentKeywordExtractor { var rep = new WordRep(sent, names); long meta = metadata.getMetadataForWord(rep.stemmed); - assert meta != 0L : "Missing meta for " + rep.word; - wordsBuilder.add(rep.word, meta); + wordsBuilder.addMeta(rep.word, meta); } for (int i = 0; i < sent.ngrams.length; i++) { @@ -140,9 +138,8 @@ public class DocumentKeywordExtractor { var ngramStemmed = sent.ngramStemmed[i]; long meta = metadata.getMetadataForWord(ngramStemmed); - assert meta != 0L : "Missing meta for " + ngram; - wordsBuilder.add(ngram, meta); + wordsBuilder.addMeta(ngram, meta); } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java index 7160eb04..4394936b 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java @@ -2,14 +2,10 @@ package nu.marginalia.keyword; import lombok.Builder; import nu.marginalia.keyword.extractors.*; -import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordFlags; -import java.util.EnumSet; - class KeywordMetadata { - private final KeywordPositionBitmask bitmask; private final TitleKeywords titleKeywords; private final NameLikeKeywords nameLikeKeywords; private final SubjectLikeKeywords subjectLikeKeywords; @@ -18,14 +14,12 @@ class KeywordMetadata { @Builder public KeywordMetadata( - KeywordPositionBitmask bitmask, TitleKeywords titleKeywords, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, UrlKeywords urlKeywords, - WordsTfIdfCounts tfIdfCounts) { - - this.bitmask = bitmask; + WordsTfIdfCounts tfIdfCounts) + { this.titleKeywords = titleKeywords; this.nameLikeKeywords = nameLikeKeywords; this.subjectLikeKeywords = subjectLikeKeywords; @@ -36,29 +30,33 @@ class KeywordMetadata { public long getMetadataForWord(String stemmed) { int tfidf = tfIdfCounts.getTfIdf(stemmed); - EnumSet flags = EnumSet.noneOf(WordFlags.class); + long flags = 0; - if (tfidf > 100) - flags.add(WordFlags.TfIdfHigh); + if (tfidf > 100) { + flags |= WordFlags.TfIdfHigh.asBit(); + } - if (subjectLikeKeywords.contains(stemmed)) - flags.add(WordFlags.Subjects); + if (subjectLikeKeywords.contains(stemmed)) { + flags |= WordFlags.Subjects.asBit(); + } - if (nameLikeKeywords.contains(stemmed)) - flags.add(WordFlags.NamesWords); + if (nameLikeKeywords.contains(stemmed)) { + flags |= WordFlags.NamesWords.asBit(); + } - if (titleKeywords.contains(stemmed)) - flags.add(WordFlags.Title); + if (titleKeywords.contains(stemmed)) { + flags |= WordFlags.Title.asBit(); + } - if (urlKeywords.containsUrl(stemmed)) - flags.add(WordFlags.UrlPath); + if (urlKeywords.containsUrl(stemmed)) { + flags |= WordFlags.UrlPath.asBit(); + } - if (urlKeywords.containsDomain(stemmed)) - flags.add(WordFlags.UrlDomain); + if (urlKeywords.containsDomain(stemmed)) { + flags |= WordFlags.UrlDomain.asBit(); + } - long positions = bitmask.get(stemmed); - - return new WordMetadata(positions, flags).encode(); + return flags; } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java deleted file mode 100644 index 230c895f..00000000 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java +++ /dev/null @@ -1,105 +0,0 @@ -package nu.marginalia.keyword.extractors; - -import com.google.inject.Inject; -import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; -import nu.marginalia.keyword.KeywordExtractor; -import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.model.idx.WordMetadata; - -/** Generates a position bitmask for each word in a document */ -public class KeywordPositionBitmask { - private final Object2LongOpenHashMap positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f); - private final static int positionWidth = WordMetadata.POSITIONS_COUNT; - private final static long positionBitmask = WordMetadata.POSITIONS_MASK; - private static final int unmodulatedPortion = 16; - - @Inject - public KeywordPositionBitmask(KeywordExtractor keywordExtractor, - DocumentLanguageData dld) - { - - // Mark the title words as position 0 - for (var sent : dld.titleSentences) { - int posBit = 1; - - for (var word : sent) { - positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); - } - - for (var ngram : sent.ngramStemmed) { - positionMask.merge(ngram, posBit, this::bitwiseOr); - } - - for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { - positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); - } - - for (var span : keywordExtractor.getProperNames(sent)) { - positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); - } - } - - // Mark subsequent sentences in subsequent positions, with increasing sentence step size - LinePosition linePos = new LinePosition(); - for (var sent : dld.sentences) { - - long posBit = (1L << linePos.pos()) & positionBitmask; - - for (var word : sent) { - positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); - } - - for (var ngram : sent.ngramStemmed) { - positionMask.merge(ngram, posBit, this::bitwiseOr); - } - - for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { - positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); - } - - for (var span : keywordExtractor.getProperNames(sent)) { - positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); - } - - linePos.next(sent.length()); - } - } - - public long get(String stemmed) { - return positionMask.getOrDefault(stemmed, 0); - } - - private long bitwiseOr(long a, long b) { - return a | b; - } - - private static class LinePosition { - private int lineLengthCtr = 0; - private int bitMaskPos = 1; - - public int pos() { - if (bitMaskPos < unmodulatedPortion) { - return bitMaskPos; - } - else { - return unmodulatedPortion + ((bitMaskPos - unmodulatedPortion) % (positionWidth - unmodulatedPortion)); - } - } - - public void next(int sentenceLength) - { - if (sentenceLength > 10) { - lineLengthCtr = 0; - ++bitMaskPos; - } - - lineLengthCtr += sentenceLength; - if (lineLengthCtr > 15) { - lineLengthCtr = 0; - ++bitMaskPos; - } - - } - - } -} diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index 4d2b6d79..414813a8 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -4,12 +4,14 @@ import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; import lombok.Getter; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; +import org.roaringbitmap.RoaringBitmap; import java.util.*; @Getter public class DocumentKeywordsBuilder { - public final Object2LongLinkedOpenHashMap words; + public final Object2LongLinkedOpenHashMap wordToMeta; + public final HashMap wordToPos; /** These ware keywords that had signals of high relevance */ public final Set importantWords = new HashSet<>(); @@ -24,46 +26,53 @@ public class DocumentKeywordsBuilder { } public DocumentKeywords build() { - final String[] wordArray = new String[words.size()]; - final long[] meta = new long[words.size()]; + final String[] wordArray = new String[wordToMeta.size()]; + final long[] meta = new long[wordToMeta.size()]; + final RoaringBitmap[] positions = new RoaringBitmap[wordToMeta.size()]; - var iter = words.object2LongEntrySet().fastIterator(); + var iter = wordToMeta.object2LongEntrySet().fastIterator(); for (int i = 0; iter.hasNext(); i++) { var entry = iter.next(); meta[i] = entry.getLongValue(); wordArray[i] = entry.getKey(); + positions[i] = wordToPos.get(entry.getKey()); + if (positions[i] == null) { + positions[i] = new RoaringBitmap(); + } } - return new DocumentKeywords(wordArray, meta, null); + + return new DocumentKeywords(wordArray, meta, positions); } public DocumentKeywordsBuilder(int capacity) { - words = new Object2LongLinkedOpenHashMap<>(capacity); + wordToMeta = new Object2LongLinkedOpenHashMap<>(capacity); + wordToPos = new HashMap<>(capacity); } - public void add(String word, long meta) { + public void addMeta(String word, long meta) { if (word.length() > MAX_WORD_LENGTH) return; - words.put(word, meta); + wordToMeta.put(word, meta); + } + + public void addPos(String word, int pos) { + if (word.length() > MAX_WORD_LENGTH) + return; + + wordToPos.computeIfAbsent(word, k -> new RoaringBitmap()).add(pos); } public void addImportantWords(Collection words) { importantWords.addAll(words); } - public void addJustNoMeta(String word) { - if (word.length() > MAX_WORD_LENGTH) - return; - - words.putIfAbsent(word, 0); - } - public void setFlagOnMetadataForWords(WordFlags flag, Collection flagWords) { flagWords.forEach(word -> - words.mergeLong(word, flag.asBit(), (a, b) -> a|b) + wordToMeta.mergeLong(word, flag.asBit(), (a, b) -> a|b) ); } @@ -72,7 +81,7 @@ public class DocumentKeywordsBuilder { // Only add the synthetic flag if the words aren't already present - newWords.forEach(word -> words.putIfAbsent(word, meta)); + newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta)); } public void addAnchorTerms(Map keywords) { @@ -82,11 +91,11 @@ public class DocumentKeywordsBuilder { keywords.forEach((word, count) -> { if (count > 5) { - words.mergeLong(word, flagC, (a, b) -> a|b); + wordToMeta.mergeLong(word, flagC, (a, b) -> a|b); } else if (count > 2) { - words.mergeLong(word, flagB, (a, b) -> a|b); + wordToMeta.mergeLong(word, flagB, (a, b) -> a|b); } else { - words.mergeLong(word, flagA, (a, b) -> a|b); + wordToMeta.mergeLong(word, flagA, (a, b) -> a|b); } }); } @@ -94,7 +103,7 @@ public class DocumentKeywordsBuilder { public List getWordsWithAnyFlag(long flags) { List ret = new ArrayList<>(); - for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) { + for (var iter = wordToMeta.object2LongEntrySet().fastIterator(); iter.hasNext();) { var entry = iter.next(); if ((flags & entry.getLongValue()) != 0) { ret.add(entry.getKey()); @@ -105,18 +114,18 @@ public class DocumentKeywordsBuilder { } public int size() { - return words.size(); + return Math.max(wordToMeta.size(), wordToPos.size()); } public WordMetadata getMetaForWord(String word) { - return new WordMetadata(words.getLong(word)); + return new WordMetadata(wordToMeta.getLong(word)); } + @Override public String toString() { StringBuilder sb = new StringBuilder("[ "); - words.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' ')); + wordToMeta.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' ')); return sb.append(']').toString(); - } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java index 2bc068d9..181be165 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java @@ -1,5 +1,7 @@ package nu.marginalia.keyword.model; +import org.roaringbitmap.RoaringBitmap; + /** Pointer into a {@see DocumentKeywords}. It starts out before the first position, * forward with advancePointer(). * */ @@ -27,6 +29,11 @@ public class DocumentKeywordsPointer { return keywords.metadata[pos]; } + /** Return the positions associated with the current position */ + public RoaringBitmap getPositions() { + return keywords.positions[pos]; + } + /** Advance the current position, * returns false if this was the * last position */ diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java index 54577f80..fe60a0f1 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java @@ -10,6 +10,7 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import org.roaringbitmap.RoaringBitmap; import java.io.IOException; import java.net.URISyntaxException; @@ -21,10 +22,8 @@ import java.util.Set; class DocumentKeywordExtractorTest { - DocumentKeywordExtractor extractor = new DocumentKeywordExtractor( - new TermFrequencyDict(WmsaHome.getLanguageModels()), - new NgramLexicon(WmsaHome.getLanguageModels())); - SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(); + static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); @Test public void testWordPattern() { @@ -41,24 +40,6 @@ class DocumentKeywordExtractorTest { Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse")); } - - @Test - public void testEmptyMetadata() throws URISyntaxException { - var dld = se.extractSentences(""" - Some sample text, I'm not sure what even triggers this - """, "A title perhaps?"); - var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid")); - var keywords = keywordBuilder.build(); - - var pointer = keywords.newPointer(); - while (pointer.advancePointer()) { - if (pointer.getMetadata() == 0L) { - System.out.println("Aha! " + pointer.getKeyword()); - } - } - - } - @Test public void testKeyboards2() throws IOException, URISyntaxException { var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), @@ -69,7 +50,7 @@ class DocumentKeywordExtractorTest { var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/")); - keywords.getWords().forEach((k, v) -> { + keywords.getWordToMeta().forEach((k, v) -> { if (k.contains("_")) { System.out.println(k + " " + new WordMetadata(v)); } @@ -112,21 +93,22 @@ class DocumentKeywordExtractorTest { var keywordsBuilt = keywords.build(); var ptr = keywordsBuilt.newPointer(); - Map dirtyAndBlues = new HashMap<>(); + Map flags = new HashMap<>(); + Map positions = new HashMap<>(); while (ptr.advancePointer()) { + System.out.println(ptr.getKeyword() + " " + ptr.getMetadata() + " " + ptr.getPositions()); if (Set.of("dirty", "blues").contains(ptr.getKeyword())) { - Assertions.assertNull( - dirtyAndBlues.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata())) - ); + flags.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata())); + positions.put(ptr.getKeyword(), ptr.getPositions()); } } - Assertions.assertTrue(dirtyAndBlues.containsKey("dirty")); - Assertions.assertTrue(dirtyAndBlues.containsKey("blues")); + Assertions.assertTrue(flags.containsKey("dirty")); + Assertions.assertTrue(flags.containsKey("blues")); Assertions.assertNotEquals( - dirtyAndBlues.get("dirty"), - dirtyAndBlues.get("blues") + positions.get("dirty"), + positions.get("blues") ); } @@ -139,8 +121,7 @@ class DocumentKeywordExtractorTest { doc.filter(new DomPruningFilter(0.5)); DocumentKeywordExtractor extractor = new DocumentKeywordExtractor( - new TermFrequencyDict(WmsaHome.getLanguageModels()), - new NgramLexicon(WmsaHome.getLanguageModels())); + new TermFrequencyDict(WmsaHome.getLanguageModels())); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online")); diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java index bfc78a9c..34b1b7af 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java @@ -23,7 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; class SentenceExtractorTest { static final LanguageModels lm = TestLanguageModels.getLanguageModels(); - static NgramLexicon ngramLexicon = new NgramLexicon(lm); static SentenceExtractor se = new SentenceExtractor(lm); @SneakyThrows @@ -36,7 +35,7 @@ class SentenceExtractorTest { var dict = new TermFrequencyDict(lm); var url = new EdgeUrl("https://memex.marginalia.nu/"); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); for (;;) { long total = 0; diff --git a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java b/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java index cabe558f..f11eb304 100644 --- a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java +++ b/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java @@ -26,9 +26,7 @@ class SummaryExtractorTest { @BeforeEach public void setUp() { - keywordExtractor = new DocumentKeywordExtractor( - new TermFrequencyDict(WmsaHome.getLanguageModels()), - new NgramLexicon(WmsaHome.getLanguageModels())); + keywordExtractor = new DocumentKeywordExtractor(); setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels()); summaryExtractor = new SummaryExtractor(255, diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 32a0ec62..43ae0d81 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -69,7 +69,7 @@ public class SideloaderProcessing { ret.words = details.words(); for (String keyword : extraKeywords) - ret.words.add(keyword, WordFlags.Subjects.asBit()); + ret.words.addMeta(keyword, WordFlags.Subjects.asBit()); if (type == GeneratorType.WIKI) { ret.words.addAllSyntheticTerms(List.of("generator:wiki")); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java index d564b308..7f5c8b4b 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java @@ -166,7 +166,7 @@ public class RedditSideloader implements SideloadSource { } for (var keyword : extraKeywords) { - doc.words.add(keyword, WordFlags.Subjects.asBit()); + doc.words.addMeta(keyword, WordFlags.Subjects.asBit()); } // Insert topology information diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java index dde7a106..639bb4bf 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java @@ -22,10 +22,8 @@ import java.nio.file.Path; public class SentenceStatisticsExperiment extends LegacyExperiment { - NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels()); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor( - new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); Path filename; PrintWriter writer;