(keywords) Add position information to keywords

Viktor Lofgren 2024-05-28 16:54:53 +02:00
parent 0894822b68
commit 619392edf9
11 changed files with 109 additions and 227 deletions
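In short: per-word positions move out of the packed word-metadata long (the old KeywordPositionBitmask) and into an explicit RoaringBitmap of token offsets per keyword. A minimal sketch of the builder API after this change, using only signatures visible in the diff below (the sample word and flag are made up for illustration):

    // Hypothetical usage; DocumentKeywordsBuilder, addMeta, addPos and build()
    // are from this commit, the sample data is not.
    DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder(100);

    wordsBuilder.addMeta("keyboard", WordFlags.Title.asBit()); // flags-only metadata long
    wordsBuilder.addPos("keyboard", 0);   // first token of the document
    wordsBuilder.addPos("keyboard", 17);  // another occurrence

    DocumentKeywords keywords = wordsBuilder.build(); // words[], meta[], RoaringBitmap[] positions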

DocumentKeywordExtractor.java

@@ -1,6 +1,6 @@
 package nu.marginalia.keyword;
 
-import nu.marginalia.segmentation.NgramLexicon;
+import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
@@ -9,27 +9,32 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.model.EdgeUrl;
 import com.google.inject.Inject;
 
 import java.util.*;
 import java.util.stream.Stream;
 
 public class DocumentKeywordExtractor {
 
     private final KeywordExtractor keywordExtractor;
     private final TermFrequencyDict dict;
-    private final NgramLexicon ngramLexicon;
 
     @Inject
-    public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) {
+    public DocumentKeywordExtractor(TermFrequencyDict dict) {
         this.dict = dict;
-        this.ngramLexicon = ngramLexicon;
+        this.keywordExtractor = new KeywordExtractor();
+    }
+
+    // for tests
+    public DocumentKeywordExtractor() {
+        this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
         this.keywordExtractor = new KeywordExtractor();
     }
 
     public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, EdgeUrl url) {
-        var bitmask = new KeywordPositionBitmask(keywordExtractor, dld);
         var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);
 
         var titleKeywords = new TitleKeywords(keywordExtractor, dld);
@@ -39,7 +44,6 @@ public class DocumentKeywordExtractor {
         var urlKeywords = new UrlKeywords(url);
 
         var keywordMetadata = KeywordMetadata.builder()
-                .bitmask(bitmask)
                 .tfIdfCounts(tfIdfCounts)
                 .titleKeywords(titleKeywords)
                 .nameLikeKeywords(nameLikeKeywords)
@@ -51,14 +55,14 @@ public class DocumentKeywordExtractor {
 
         createSimpleWords(wordsBuilder, keywordMetadata, dld);
 
-        createWordsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
-        createWordsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
-        createWordsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
-        createWordsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
+        createNGramTermsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
+        createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
+        createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
+        createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
 
         var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
-        wordsBuilder.addImportantWords(importantWords);
+        wordsBuilder.addImportantWords(importantWords);
 
         wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());
 
         return wordsBuilder;
@@ -77,36 +81,30 @@ public class DocumentKeywordExtractor {
                 .sorted(tfIdfCounts.reversed())
                 .limit(16)
                 .filter(w -> tfIdfCounts.termFrequencyDictValue(w) > 100)
-                .sorted(Comparator.comparing(w -> tfIdfCounts.termFrequencyDictValue(w)))
+                .sorted(Comparator.comparing(tfIdfCounts::termFrequencyDictValue))
                 .limit(6)
                 .map(w -> w.word)
                 .toList();
     }
 
-    private void createWordsFromSet(DocumentKeywordsBuilder wordsBuilder,
+    private void createNGramTermsFromSet(DocumentKeywordsBuilder wordsBuilder,
                                     KeywordMetadata metadata,
                                     WordReps words) {
         for (var rep : words.getReps()) {
             var word = rep.word;
 
             if (!word.isBlank()) {
                 long meta = metadata.getMetadataForWord(rep.stemmed);
-
-                assert meta != 0L : "Missing meta for " + rep.word;
-
-                wordsBuilder.add(word, meta);
+                wordsBuilder.addMeta(word, meta);
             }
         }
     }
 
     private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
                                    KeywordMetadata metadata,
                                    DocumentLanguageData documentLanguageData)
     {
+        int pos = 0;
         for (var sent : documentLanguageData.sentences) {
             if (wordsBuilder.size() > 1500)
@@ -119,10 +117,11 @@ public class DocumentKeywordExtractor {
                 String w = word.wordLowerCase();
 
                 if (matchesWordPattern(w)) {
-                    long meta = metadata.getMetadataForWord(word.stemmed());
-                    assert meta != 0L : "Missing meta for " + word.word();
-                    wordsBuilder.add(w, meta);
+                    /* Add information about term positions */
+                    wordsBuilder.addPos(word.wordLowerCase(), pos++);
+
+                    /* Add metadata for word */
+                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
                 }
             }
@@ -130,9 +129,8 @@ public class DocumentKeywordExtractor {
                 var rep = new WordRep(sent, names);
 
                 long meta = metadata.getMetadataForWord(rep.stemmed);
-                assert meta != 0L : "Missing meta for " + rep.word;
 
-                wordsBuilder.add(rep.word, meta);
+                wordsBuilder.addMeta(rep.word, meta);
             }
 
             for (int i = 0; i < sent.ngrams.length; i++) {
@@ -140,9 +138,8 @@ public class DocumentKeywordExtractor {
                 var ngramStemmed = sent.ngramStemmed[i];
 
                 long meta = metadata.getMetadataForWord(ngramStemmed);
-                assert meta != 0L : "Missing meta for " + ngram;
 
-                wordsBuilder.add(ngram, meta);
+                wordsBuilder.addMeta(ngram, meta);
             }
         }
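Note how the new position counter in createSimpleWords works: there is a single running token index per document, and it advances only for words that pass matchesWordPattern. A condensed sketch of that loop (names are as in the diff above; the surrounding guards are omitted):

    int pos = 0;
    for (var sent : documentLanguageData.sentences) {
        for (var word : sent) {
            String w = word.wordLowerCase();
            if (matchesWordPattern(w)) {
                wordsBuilder.addPos(w, pos++); // counter advances only for accepted words
                wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
            }
        }
    }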

KeywordMetadata.java

@@ -2,14 +2,10 @@ package nu.marginalia.keyword;
 
 import lombok.Builder;
 import nu.marginalia.keyword.extractors.*;
-import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.model.idx.WordFlags;
 
-import java.util.EnumSet;
-
 class KeywordMetadata {
 
-    private final KeywordPositionBitmask bitmask;
     private final TitleKeywords titleKeywords;
     private final NameLikeKeywords nameLikeKeywords;
     private final SubjectLikeKeywords subjectLikeKeywords;
@@ -18,14 +14,12 @@ class KeywordMetadata {
 
     @Builder
     public KeywordMetadata(
-            KeywordPositionBitmask bitmask,
             TitleKeywords titleKeywords,
             NameLikeKeywords nameLikeKeywords,
             SubjectLikeKeywords subjectLikeKeywords,
             UrlKeywords urlKeywords,
-            WordsTfIdfCounts tfIdfCounts) {
-        this.bitmask = bitmask;
+            WordsTfIdfCounts tfIdfCounts)
+    {
         this.titleKeywords = titleKeywords;
         this.nameLikeKeywords = nameLikeKeywords;
         this.subjectLikeKeywords = subjectLikeKeywords;
@@ -36,29 +30,33 @@ class KeywordMetadata {
 
     public long getMetadataForWord(String stemmed) {
 
         int tfidf = tfIdfCounts.getTfIdf(stemmed);
-        EnumSet<WordFlags> flags = EnumSet.noneOf(WordFlags.class);
+        long flags = 0;
 
-        if (tfidf > 100)
-            flags.add(WordFlags.TfIdfHigh);
+        if (tfidf > 100) {
+            flags |= WordFlags.TfIdfHigh.asBit();
+        }
 
-        if (subjectLikeKeywords.contains(stemmed))
-            flags.add(WordFlags.Subjects);
+        if (subjectLikeKeywords.contains(stemmed)) {
+            flags |= WordFlags.Subjects.asBit();
+        }
 
-        if (nameLikeKeywords.contains(stemmed))
-            flags.add(WordFlags.NamesWords);
+        if (nameLikeKeywords.contains(stemmed)) {
+            flags |= WordFlags.NamesWords.asBit();
+        }
 
-        if (titleKeywords.contains(stemmed))
-            flags.add(WordFlags.Title);
+        if (titleKeywords.contains(stemmed)) {
+            flags |= WordFlags.Title.asBit();
+        }
 
-        if (urlKeywords.containsUrl(stemmed))
-            flags.add(WordFlags.UrlPath);
+        if (urlKeywords.containsUrl(stemmed)) {
+            flags |= WordFlags.UrlPath.asBit();
+        }
 
-        if (urlKeywords.containsDomain(stemmed))
-            flags.add(WordFlags.UrlDomain);
+        if (urlKeywords.containsDomain(stemmed)) {
+            flags |= WordFlags.UrlDomain.asBit();
+        }
 
-        long positions = bitmask.get(stemmed);
-
-        return new WordMetadata(positions, flags).encode();
+        return flags;
     }
 }
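getMetadataForWord thus shrinks from a full WordMetadata encoding (positions plus flags) to a bare OR of WordFlags bits. A hedged sketch of producing and testing such a flags long (asBit() appears in the diff above; the mask test is an assumption based on ordinary bit-flag usage, not code from this commit):

    long flags = 0;
    flags |= WordFlags.TfIdfHigh.asBit();
    flags |= WordFlags.Title.asBit();

    // Presumably consumers test flags with a mask, e.g.:
    boolean isTitleWord = (flags & WordFlags.Title.asBit()) != 0;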

KeywordPositionBitmask.java (deleted)

@@ -1,105 +0,0 @@
-package nu.marginalia.keyword.extractors;
-
-import com.google.inject.Inject;
-import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
-import nu.marginalia.keyword.KeywordExtractor;
-import nu.marginalia.language.model.DocumentLanguageData;
-import nu.marginalia.model.idx.WordMetadata;
-
-/** Generates a position bitmask for each word in a document */
-public class KeywordPositionBitmask {
-    private final Object2LongOpenHashMap<String> positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f);
-    private final static int positionWidth = WordMetadata.POSITIONS_COUNT;
-    private final static long positionBitmask = WordMetadata.POSITIONS_MASK;
-    private static final int unmodulatedPortion = 16;
-
-    @Inject
-    public KeywordPositionBitmask(KeywordExtractor keywordExtractor,
-                                  DocumentLanguageData dld)
-    {
-        // Mark the title words as position 0
-        for (var sent : dld.titleSentences) {
-            int posBit = 1;
-
-            for (var word : sent) {
-                positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
-            }
-
-            for (var ngram : sent.ngramStemmed) {
-                positionMask.merge(ngram, posBit, this::bitwiseOr);
-            }
-
-            for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
-                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
-            }
-
-            for (var span : keywordExtractor.getProperNames(sent)) {
-                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
-            }
-        }
-
-        // Mark subsequent sentences in subsequent positions, with increasing sentence step size
-        LinePosition linePos = new LinePosition();
-        for (var sent : dld.sentences) {
-            long posBit = (1L << linePos.pos()) & positionBitmask;
-
-            for (var word : sent) {
-                positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
-            }
-
-            for (var ngram : sent.ngramStemmed) {
-                positionMask.merge(ngram, posBit, this::bitwiseOr);
-            }
-
-            for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
-                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
-            }
-
-            for (var span : keywordExtractor.getProperNames(sent)) {
-                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
-            }
-
-            linePos.next(sent.length());
-        }
-    }
-
-    public long get(String stemmed) {
-        return positionMask.getOrDefault(stemmed, 0);
-    }
-
-    private long bitwiseOr(long a, long b) {
-        return a | b;
-    }
-
-    private static class LinePosition {
-        private int lineLengthCtr = 0;
-        private int bitMaskPos = 1;
-
-        public int pos() {
-            if (bitMaskPos < unmodulatedPortion) {
-                return bitMaskPos;
-            }
-            else {
-                return unmodulatedPortion + ((bitMaskPos - unmodulatedPortion) % (positionWidth - unmodulatedPortion));
-            }
-        }
-
-        public void next(int sentenceLength)
-        {
-            if (sentenceLength > 10) {
-                lineLengthCtr = 0;
-                ++bitMaskPos;
-            }
-
-            lineLengthCtr += sentenceLength;
-            if (lineLengthCtr > 15) {
-                lineLengthCtr = 0;
-                ++bitMaskPos;
-            }
-        }
-    }
-}

DocumentKeywordsBuilder.java

@@ -4,12 +4,14 @@ import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
 import lombok.Getter;
 import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
+import org.roaringbitmap.RoaringBitmap;
 
 import java.util.*;
 
 @Getter
 public class DocumentKeywordsBuilder {
-    public final Object2LongLinkedOpenHashMap<String> words;
+    public final Object2LongLinkedOpenHashMap<String> wordToMeta;
+    public final HashMap<String, RoaringBitmap> wordToPos;
 
     /** These ware keywords that had signals of high relevance */
     public final Set<String> importantWords = new HashSet<>();
@@ -24,46 +26,53 @@ public class DocumentKeywordsBuilder {
     }
 
     public DocumentKeywords build() {
-        final String[] wordArray = new String[words.size()];
-        final long[] meta = new long[words.size()];
+        final String[] wordArray = new String[wordToMeta.size()];
+        final long[] meta = new long[wordToMeta.size()];
+        final RoaringBitmap[] positions = new RoaringBitmap[wordToMeta.size()];
 
-        var iter = words.object2LongEntrySet().fastIterator();
+        var iter = wordToMeta.object2LongEntrySet().fastIterator();
 
         for (int i = 0; iter.hasNext(); i++) {
             var entry = iter.next();
 
             meta[i] = entry.getLongValue();
             wordArray[i] = entry.getKey();
+            positions[i] = wordToPos.get(entry.getKey());
+            if (positions[i] == null) {
+                positions[i] = new RoaringBitmap();
+            }
         }
 
-        return new DocumentKeywords(wordArray, meta, null);
+        return new DocumentKeywords(wordArray, meta, positions);
     }
 
     public DocumentKeywordsBuilder(int capacity) {
-        words = new Object2LongLinkedOpenHashMap<>(capacity);
+        wordToMeta = new Object2LongLinkedOpenHashMap<>(capacity);
+        wordToPos = new HashMap<>(capacity);
     }
 
-    public void add(String word, long meta) {
+    public void addMeta(String word, long meta) {
         if (word.length() > MAX_WORD_LENGTH)
            return;
 
-        words.put(word, meta);
+        wordToMeta.put(word, meta);
+    }
+
+    public void addPos(String word, int pos) {
+        if (word.length() > MAX_WORD_LENGTH)
+            return;
+
+        wordToPos.computeIfAbsent(word, k -> new RoaringBitmap()).add(pos);
     }
 
     public void addImportantWords(Collection<String> words) {
         importantWords.addAll(words);
     }
 
-    public void addJustNoMeta(String word) {
-        if (word.length() > MAX_WORD_LENGTH)
-            return;
-
-        words.putIfAbsent(word, 0);
-    }
-
     public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
         flagWords.forEach(word ->
-                words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
+                wordToMeta.mergeLong(word, flag.asBit(), (a, b) -> a|b)
         );
     }
 
@@ -72,7 +81,7 @@ public class DocumentKeywordsBuilder {
 
         // Only add the synthetic flag if the words aren't already present
-        newWords.forEach(word -> words.putIfAbsent(word, meta));
+        newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta));
     }
 
     public void addAnchorTerms(Map<String, Integer> keywords) {
@@ -82,11 +91,11 @@ public class DocumentKeywordsBuilder {
         keywords.forEach((word, count) -> {
             if (count > 5) {
-                words.mergeLong(word, flagC, (a, b) -> a|b);
+                wordToMeta.mergeLong(word, flagC, (a, b) -> a|b);
             } else if (count > 2) {
-                words.mergeLong(word, flagB, (a, b) -> a|b);
+                wordToMeta.mergeLong(word, flagB, (a, b) -> a|b);
             } else {
-                words.mergeLong(word, flagA, (a, b) -> a|b);
+                wordToMeta.mergeLong(word, flagA, (a, b) -> a|b);
             }
         });
     }
@@ -94,7 +103,7 @@ public class DocumentKeywordsBuilder {
     public List<String> getWordsWithAnyFlag(long flags) {
         List<String> ret = new ArrayList<>();
 
-        for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) {
+        for (var iter = wordToMeta.object2LongEntrySet().fastIterator(); iter.hasNext();) {
             var entry = iter.next();
             if ((flags & entry.getLongValue()) != 0) {
                 ret.add(entry.getKey());
@@ -105,18 +114,18 @@ public class DocumentKeywordsBuilder {
     }
 
     public int size() {
-        return words.size();
+        return Math.max(wordToMeta.size(), wordToPos.size());
     }
 
     public WordMetadata getMetaForWord(String word) {
-        return new WordMetadata(words.getLong(word));
+        return new WordMetadata(wordToMeta.getLong(word));
     }
 
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder("[ ");
-        words.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' '));
+        wordToMeta.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' '));
         return sb.append(']').toString();
     }
 }
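Because wordToMeta and wordToPos are now independent maps, a keyword may carry metadata but no recorded positions; build() covers that case with an empty RoaringBitmap rather than a null entry. A quick sketch of the RoaringBitmap calls the new code leans on (org.roaringbitmap is a real library; add, contains and getCardinality are its standard API, the sample offsets are invented):

    RoaringBitmap positions = new RoaringBitmap();
    positions.add(3);     // word seen at token offset 3
    positions.add(3);     // duplicate adds are idempotent
    positions.add(250);

    positions.contains(3);      // true
    positions.getCardinality(); // 2 distinct positions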

DocumentKeywordsPointer.java

@@ -1,5 +1,7 @@
 package nu.marginalia.keyword.model;
 
+import org.roaringbitmap.RoaringBitmap;
+
 /** Pointer into a {@see DocumentKeywords}. It starts out before the first position,
  * forward with advancePointer().
  * */
@@ -27,6 +29,11 @@ public class DocumentKeywordsPointer {
         return keywords.metadata[pos];
     }
 
+    /** Return the positions associated with the current position */
+    public RoaringBitmap getPositions() {
+        return keywords.positions[pos];
+    }
+
     /** Advance the current position,
      * returns false if this was the
      * last position */
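A usage sketch of the extended pointer, stitched together from calls that appear elsewhere in this commit's test code (the loop body is illustrative):

    var ptr = keywords.newPointer();
    while (ptr.advancePointer()) {
        String word = ptr.getKeyword();
        long meta = ptr.getMetadata();
        RoaringBitmap positions = ptr.getPositions(); // new in this commit
        // ...
    }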

DocumentKeywordExtractorTest.java

@@ -10,6 +10,7 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
+import org.roaringbitmap.RoaringBitmap;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
@@ -21,10 +22,8 @@ import java.util.Set;
 
 class DocumentKeywordExtractorTest {
 
-    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
-            new TermFrequencyDict(WmsaHome.getLanguageModels()),
-            new NgramLexicon(WmsaHome.getLanguageModels()));
-    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+    static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
+    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
     @Test
     public void testWordPattern() {
@@ -41,24 +40,6 @@ class DocumentKeywordExtractorTest {
         Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
     }
 
-    @Test
-    public void testEmptyMetadata() throws URISyntaxException {
-        var dld = se.extractSentences("""
-                Some sample text, I'm not sure what even triggers this
-                """, "A title perhaps?");
-        var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid"));
-        var keywords = keywordBuilder.build();
-
-        var pointer = keywords.newPointer();
-        while (pointer.advancePointer()) {
-            if (pointer.getMetadata() == 0L) {
-                System.out.println("Aha! " + pointer.getKeyword());
-            }
-        }
-    }
-
     @Test
     public void testKeyboards2() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
@@ -69,7 +50,7 @@ class DocumentKeywordExtractorTest {
 
         var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
 
-        keywords.getWords().forEach((k, v) -> {
+        keywords.getWordToMeta().forEach((k, v) -> {
             if (k.contains("_")) {
                 System.out.println(k + " " + new WordMetadata(v));
             }
@@ -112,21 +93,22 @@ class DocumentKeywordExtractorTest {
         var keywordsBuilt = keywords.build();
         var ptr = keywordsBuilt.newPointer();
 
-        Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
+        Map<String, WordMetadata> flags = new HashMap<>();
+        Map<String, RoaringBitmap> positions = new HashMap<>();
 
         while (ptr.advancePointer()) {
+            System.out.println(ptr.getKeyword() + " " + ptr.getMetadata() + " " + ptr.getPositions());
             if (Set.of("dirty", "blues").contains(ptr.getKeyword())) {
-                Assertions.assertNull(
-                        dirtyAndBlues.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata()))
-                );
+                flags.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata()));
+                positions.put(ptr.getKeyword(), ptr.getPositions());
             }
         }
 
-        Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
-        Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
+        Assertions.assertTrue(flags.containsKey("dirty"));
+        Assertions.assertTrue(flags.containsKey("blues"));
         Assertions.assertNotEquals(
-                dirtyAndBlues.get("dirty"),
-                dirtyAndBlues.get("blues")
+                positions.get("dirty"),
+                positions.get("blues")
         );
     }
 
@@ -139,8 +121,7 @@ class DocumentKeywordExtractorTest {
         doc.filter(new DomPruningFilter(0.5));
 
         DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
-                new TermFrequencyDict(WmsaHome.getLanguageModels()),
-                new NgramLexicon(WmsaHome.getLanguageModels()));
+                new TermFrequencyDict(WmsaHome.getLanguageModels()));
         SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));

SentenceExtractorTest.java

@@ -23,7 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 class SentenceExtractorTest {
     static final LanguageModels lm = TestLanguageModels.getLanguageModels();
 
-    static NgramLexicon ngramLexicon = new NgramLexicon(lm);
     static SentenceExtractor se = new SentenceExtractor(lm);
 
     @SneakyThrows
@@ -36,7 +35,7 @@ class SentenceExtractorTest {
         var dict = new TermFrequencyDict(lm);
         var url = new EdgeUrl("https://memex.marginalia.nu/");
 
-        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon);
+        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
 
         for (;;) {
             long total = 0;

SummaryExtractorTest.java

@@ -26,9 +26,7 @@ class SummaryExtractorTest {
 
     @BeforeEach
     public void setUp() {
-        keywordExtractor = new DocumentKeywordExtractor(
-                new TermFrequencyDict(WmsaHome.getLanguageModels()),
-                new NgramLexicon(WmsaHome.getLanguageModels()));
+        keywordExtractor = new DocumentKeywordExtractor();
         setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         summaryExtractor = new SummaryExtractor(255,

SideloaderProcessing.java

@@ -69,7 +69,7 @@ public class SideloaderProcessing {
         ret.words = details.words();
 
         for (String keyword : extraKeywords)
-            ret.words.add(keyword, WordFlags.Subjects.asBit());
+            ret.words.addMeta(keyword, WordFlags.Subjects.asBit());
 
         if (type == GeneratorType.WIKI) {
             ret.words.addAllSyntheticTerms(List.of("generator:wiki"));

RedditSideloader.java

@@ -166,7 +166,7 @@ public class RedditSideloader implements SideloadSource {
         }
 
         for (var keyword : extraKeywords) {
-            doc.words.add(keyword, WordFlags.Subjects.asBit());
+            doc.words.addMeta(keyword, WordFlags.Subjects.asBit());
         }
 
         // Insert topology information

SentenceStatisticsExperiment.java

@@ -22,10 +22,8 @@ import java.nio.file.Path;
 public class SentenceStatisticsExperiment extends LegacyExperiment {
 
-    NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels());
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(
-            new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon);
+    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
 
     Path filename;
     PrintWriter writer;