Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)
(keywords) Add position information to keywords
commit 619392edf9
parent 0894822b68
@@ -1,6 +1,6 @@
package nu.marginalia.keyword;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
@@ -9,27 +9,32 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.model.EdgeUrl;

import com.google.inject.Inject;

import java.util.*;
import java.util.stream.Stream;

public class DocumentKeywordExtractor {

    private final KeywordExtractor keywordExtractor;
    private final TermFrequencyDict dict;
    private final NgramLexicon ngramLexicon;

    @Inject
    public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) {
    public DocumentKeywordExtractor(TermFrequencyDict dict) {
        this.dict = dict;
        this.ngramLexicon = ngramLexicon;
        this.keywordExtractor = new KeywordExtractor();
    }

    // for tests
    public DocumentKeywordExtractor() {
        this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
        this.keywordExtractor = new KeywordExtractor();
    }

    public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, EdgeUrl url) {

        var bitmask = new KeywordPositionBitmask(keywordExtractor, dld);
        var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld);

        var titleKeywords = new TitleKeywords(keywordExtractor, dld);
@@ -39,7 +44,6 @@ public class DocumentKeywordExtractor {
        var urlKeywords = new UrlKeywords(url);

        var keywordMetadata = KeywordMetadata.builder()
                .bitmask(bitmask)
                .tfIdfCounts(tfIdfCounts)
                .titleKeywords(titleKeywords)
                .nameLikeKeywords(nameLikeKeywords)
@@ -51,14 +55,14 @@ public class DocumentKeywordExtractor {

        createSimpleWords(wordsBuilder, keywordMetadata, dld);

        createWordsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
        createWordsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
        createWordsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
        createWordsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);
        createNGramTermsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts);
        createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords);
        createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords);
        createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords);

        var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder);
        wordsBuilder.addImportantWords(importantWords);

        wordsBuilder.addImportantWords(importantWords);
        wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords());

        return wordsBuilder;
@@ -77,36 +81,30 @@ public class DocumentKeywordExtractor {
                .sorted(tfIdfCounts.reversed())
                .limit(16)
                .filter(w -> tfIdfCounts.termFrequencyDictValue(w) > 100)
                .sorted(Comparator.comparing(w -> tfIdfCounts.termFrequencyDictValue(w)))
                .sorted(Comparator.comparing(tfIdfCounts::termFrequencyDictValue))
                .limit(6)
                .map(w -> w.word)
                .toList();
    }

    private void createWordsFromSet(DocumentKeywordsBuilder wordsBuilder,
                                    KeywordMetadata metadata,
                                    WordReps words) {
    private void createNGramTermsFromSet(DocumentKeywordsBuilder wordsBuilder,
                                         KeywordMetadata metadata,
                                         WordReps words) {
        for (var rep : words.getReps()) {

            var word = rep.word;

            if (!word.isBlank()) {
                long meta = metadata.getMetadataForWord(rep.stemmed);

                assert meta != 0L : "Missing meta for " + rep.word;

                wordsBuilder.add(word, meta);
                wordsBuilder.addMeta(word, meta);
            }
        }
    }

    private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder,
                                   KeywordMetadata metadata,
                                   DocumentLanguageData documentLanguageData)
    {
        int pos = 0;
        for (var sent : documentLanguageData.sentences) {

            if (wordsBuilder.size() > 1500)
@@ -119,10 +117,11 @@ public class DocumentKeywordExtractor {

                String w = word.wordLowerCase();
                if (matchesWordPattern(w)) {
                    long meta = metadata.getMetadataForWord(word.stemmed());
                    assert meta != 0L : "Missing meta for " + word.word();
                    /* Add information about term positions */
                    wordsBuilder.addPos(word.wordLowerCase(), pos++);

                    wordsBuilder.add(w, meta);
                    /* Add metadata for word */
                    wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed()));
                }
            }

@@ -130,9 +129,8 @@ public class DocumentKeywordExtractor {
                var rep = new WordRep(sent, names);

                long meta = metadata.getMetadataForWord(rep.stemmed);
                assert meta != 0L : "Missing meta for " + rep.word;

                wordsBuilder.add(rep.word, meta);
                wordsBuilder.addMeta(rep.word, meta);
            }

            for (int i = 0; i < sent.ngrams.length; i++) {
@@ -140,9 +138,8 @@ public class DocumentKeywordExtractor {
                var ngramStemmed = sent.ngramStemmed[i];

                long meta = metadata.getMetadataForWord(ngramStemmed);
                assert meta != 0L : "Missing meta for " + ngram;

                wordsBuilder.add(ngram, meta);
                wordsBuilder.addMeta(ngram, meta);
            }

        }

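As an aside on the hunk above: createSimpleWords now keeps a single running token counter for the whole document and records each accepted word's position through addPos, while flag metadata goes through addMeta. Below is a minimal standalone sketch of that split, using plain maps in place of DocumentKeywordsBuilder; the class and field names are illustrative only, not the project's API.

    import org.roaringbitmap.RoaringBitmap;

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    class PositionSketch {
        public static void main(String[] args) {
            Map<String, RoaringBitmap> wordToPos = new HashMap<>();
            Map<String, Long> wordToMeta = new HashMap<>();

            int pos = 0;  // one running counter across the whole document
            for (String w : List.of("keyboard", "mechanical", "keyboard")) {
                // every accepted token advances the counter and records its position
                wordToPos.computeIfAbsent(w, k -> new RoaringBitmap()).add(pos++);
                // flag bits are tracked separately from positions
                wordToMeta.merge(w, 1L, (a, b) -> a | b);
            }

            System.out.println(wordToPos.get("keyboard")); // {0,2}
        }
    }
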
@@ -2,14 +2,10 @@ package nu.marginalia.keyword;

import lombok.Builder;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.WordFlags;

import java.util.EnumSet;

class KeywordMetadata {

    private final KeywordPositionBitmask bitmask;
    private final TitleKeywords titleKeywords;
    private final NameLikeKeywords nameLikeKeywords;
    private final SubjectLikeKeywords subjectLikeKeywords;
@@ -18,14 +14,12 @@ class KeywordMetadata {

    @Builder
    public KeywordMetadata(
            KeywordPositionBitmask bitmask,
            TitleKeywords titleKeywords,
            NameLikeKeywords nameLikeKeywords,
            SubjectLikeKeywords subjectLikeKeywords,
            UrlKeywords urlKeywords,
            WordsTfIdfCounts tfIdfCounts) {

        this.bitmask = bitmask;
            WordsTfIdfCounts tfIdfCounts)
    {
        this.titleKeywords = titleKeywords;
        this.nameLikeKeywords = nameLikeKeywords;
        this.subjectLikeKeywords = subjectLikeKeywords;
@@ -36,29 +30,33 @@ class KeywordMetadata {
    public long getMetadataForWord(String stemmed) {

        int tfidf = tfIdfCounts.getTfIdf(stemmed);
        EnumSet<WordFlags> flags = EnumSet.noneOf(WordFlags.class);
        long flags = 0;

        if (tfidf > 100)
            flags.add(WordFlags.TfIdfHigh);
        if (tfidf > 100) {
            flags |= WordFlags.TfIdfHigh.asBit();
        }

        if (subjectLikeKeywords.contains(stemmed))
            flags.add(WordFlags.Subjects);
        if (subjectLikeKeywords.contains(stemmed)) {
            flags |= WordFlags.Subjects.asBit();
        }

        if (nameLikeKeywords.contains(stemmed))
            flags.add(WordFlags.NamesWords);
        if (nameLikeKeywords.contains(stemmed)) {
            flags |= WordFlags.NamesWords.asBit();
        }

        if (titleKeywords.contains(stemmed))
            flags.add(WordFlags.Title);
        if (titleKeywords.contains(stemmed)) {
            flags |= WordFlags.Title.asBit();
        }

        if (urlKeywords.containsUrl(stemmed))
            flags.add(WordFlags.UrlPath);
        if (urlKeywords.containsUrl(stemmed)) {
            flags |= WordFlags.UrlPath.asBit();
        }

        if (urlKeywords.containsDomain(stemmed))
            flags.add(WordFlags.UrlDomain);
        if (urlKeywords.containsDomain(stemmed)) {
            flags |= WordFlags.UrlDomain.asBit();
        }

        long positions = bitmask.get(stemmed);

        return new WordMetadata(positions, flags).encode();
        return flags;
    }

}

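Note that getMetadataForWord now returns bare flag bits instead of a packed WordMetadata that also carried positions. A small sketch of this style of flag handling follows; the enum is a stand-in for the project's WordFlags, and the ordinal-based asBit() mapping is an assumption for illustration.

    class FlagBitsSketch {
        // Stand-in for nu.marginalia.model.idx.WordFlags; the real mapping may differ.
        enum Flag {
            TfIdfHigh, Subjects, NamesWords, Title, UrlPath, UrlDomain;

            long asBit() {
                return 1L << ordinal(); // one bit of a long per flag (assumed)
            }
        }

        public static void main(String[] args) {
            long flags = 0;
            flags |= Flag.TfIdfHigh.asBit();
            flags |= Flag.Title.asBit();

            // flags are tested with plain bitwise ops
            boolean isTitleWord = (flags & Flag.Title.asBit()) != 0;    // true
            boolean isSubject   = (flags & Flag.Subjects.asBit()) != 0; // false
            System.out.println(isTitleWord + " " + isSubject);
        }
    }
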
@@ -1,105 +0,0 @@
package nu.marginalia.keyword.extractors;

import com.google.inject.Inject;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.model.idx.WordMetadata;

/** Generates a position bitmask for each word in a document */
public class KeywordPositionBitmask {
    private final Object2LongOpenHashMap<String> positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f);
    private final static int positionWidth = WordMetadata.POSITIONS_COUNT;
    private final static long positionBitmask = WordMetadata.POSITIONS_MASK;
    private static final int unmodulatedPortion = 16;

    @Inject
    public KeywordPositionBitmask(KeywordExtractor keywordExtractor,
                                  DocumentLanguageData dld)
    {
        // Mark the title words as position 0
        for (var sent : dld.titleSentences) {
            int posBit = 1;

            for (var word : sent) {
                positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var ngram : sent.ngramStemmed) {
                positionMask.merge(ngram, posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }
        }

        // Mark subsequent sentences in subsequent positions, with increasing sentence step size
        LinePosition linePos = new LinePosition();
        for (var sent : dld.sentences) {

            long posBit = (1L << linePos.pos()) & positionBitmask;

            for (var word : sent) {
                positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
            }

            for (var ngram : sent.ngramStemmed) {
                positionMask.merge(ngram, posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            for (var span : keywordExtractor.getProperNames(sent)) {
                positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
            }

            linePos.next(sent.length());
        }
    }

    public long get(String stemmed) {
        return positionMask.getOrDefault(stemmed, 0);
    }

    private long bitwiseOr(long a, long b) {
        return a | b;
    }

    private static class LinePosition {
        private int lineLengthCtr = 0;
        private int bitMaskPos = 1;

        public int pos() {
            if (bitMaskPos < unmodulatedPortion) {
                return bitMaskPos;
            }
            else {
                return unmodulatedPortion + ((bitMaskPos - unmodulatedPortion) % (positionWidth - unmodulatedPortion));
            }
        }

        public void next(int sentenceLength)
        {
            if (sentenceLength > 10) {
                lineLengthCtr = 0;
                ++bitMaskPos;
            }

            lineLengthCtr += sentenceLength;
            if (lineLengthCtr > 15) {
                lineLengthCtr = 0;
                ++bitMaskPos;
            }

        }
    }
}

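The class deleted above compressed word positions into a single long, one bit per coarse document region, rather than storing exact token positions. A toy sketch of that idea, for contrast with the exact-position approach introduced elsewhere in this commit; the 56-bit width is an assumption standing in for WordMetadata.POSITIONS_COUNT.

    import java.util.HashMap;
    import java.util.Map;

    class CoarseMaskSketch {
        static final int POSITION_WIDTH = 56;                 // assumed field width
        static final long POSITION_MASK = (1L << POSITION_WIDTH) - 1;

        public static void main(String[] args) {
            Map<String, Long> positionMask = new HashMap<>();

            // "foo" occurs in regions 1 and 3, "bar" only in region 3;
            // each occurrence ORs one region bit into the word's mask
            positionMask.merge("foo", (1L << 1) & POSITION_MASK, (a, b) -> a | b);
            positionMask.merge("bar", (1L << 3) & POSITION_MASK, (a, b) -> a | b);
            positionMask.merge("foo", (1L << 3) & POSITION_MASK, (a, b) -> a | b);

            System.out.println(Long.toBinaryString(positionMask.get("foo"))); // 1010
        }
    }
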
@@ -4,12 +4,14 @@ import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import lombok.Getter;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import org.roaringbitmap.RoaringBitmap;

import java.util.*;

@Getter
public class DocumentKeywordsBuilder {
    public final Object2LongLinkedOpenHashMap<String> words;
    public final Object2LongLinkedOpenHashMap<String> wordToMeta;
    public final HashMap<String, RoaringBitmap> wordToPos;

    /** These ware keywords that had signals of high relevance */
    public final Set<String> importantWords = new HashSet<>();
@@ -24,46 +26,53 @@ public class DocumentKeywordsBuilder {
    }

    public DocumentKeywords build() {
        final String[] wordArray = new String[words.size()];
        final long[] meta = new long[words.size()];
        final String[] wordArray = new String[wordToMeta.size()];
        final long[] meta = new long[wordToMeta.size()];
        final RoaringBitmap[] positions = new RoaringBitmap[wordToMeta.size()];

        var iter = words.object2LongEntrySet().fastIterator();
        var iter = wordToMeta.object2LongEntrySet().fastIterator();

        for (int i = 0; iter.hasNext(); i++) {
            var entry = iter.next();

            meta[i] = entry.getLongValue();
            wordArray[i] = entry.getKey();
            positions[i] = wordToPos.get(entry.getKey());
            if (positions[i] == null) {
                positions[i] = new RoaringBitmap();
            }
        }

        return new DocumentKeywords(wordArray, meta, null);

        return new DocumentKeywords(wordArray, meta, positions);
    }

    public DocumentKeywordsBuilder(int capacity) {
        words = new Object2LongLinkedOpenHashMap<>(capacity);
        wordToMeta = new Object2LongLinkedOpenHashMap<>(capacity);
        wordToPos = new HashMap<>(capacity);
    }

    public void add(String word, long meta) {
    public void addMeta(String word, long meta) {
        if (word.length() > MAX_WORD_LENGTH)
            return;

        words.put(word, meta);
        wordToMeta.put(word, meta);
    }

    public void addPos(String word, int pos) {
        if (word.length() > MAX_WORD_LENGTH)
            return;

        wordToPos.computeIfAbsent(word, k -> new RoaringBitmap()).add(pos);
    }

    public void addImportantWords(Collection<String> words) {
        importantWords.addAll(words);
    }

    public void addJustNoMeta(String word) {
        if (word.length() > MAX_WORD_LENGTH)
            return;

        words.putIfAbsent(word, 0);
    }

    public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
        flagWords.forEach(word ->
                words.mergeLong(word, flag.asBit(), (a, b) -> a|b)
                wordToMeta.mergeLong(word, flag.asBit(), (a, b) -> a|b)
        );
    }

@@ -72,7 +81,7 @@ public class DocumentKeywordsBuilder {

        // Only add the synthetic flag if the words aren't already present

        newWords.forEach(word -> words.putIfAbsent(word, meta));
        newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta));
    }

    public void addAnchorTerms(Map<String, Integer> keywords) {
@@ -82,11 +91,11 @@ public class DocumentKeywordsBuilder {

        keywords.forEach((word, count) -> {
            if (count > 5) {
                words.mergeLong(word, flagC, (a, b) -> a|b);
                wordToMeta.mergeLong(word, flagC, (a, b) -> a|b);
            } else if (count > 2) {
                words.mergeLong(word, flagB, (a, b) -> a|b);
                wordToMeta.mergeLong(word, flagB, (a, b) -> a|b);
            } else {
                words.mergeLong(word, flagA, (a, b) -> a|b);
                wordToMeta.mergeLong(word, flagA, (a, b) -> a|b);
            }
        });
    }
@@ -94,7 +103,7 @@ public class DocumentKeywordsBuilder {
    public List<String> getWordsWithAnyFlag(long flags) {
        List<String> ret = new ArrayList<>();

        for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) {
        for (var iter = wordToMeta.object2LongEntrySet().fastIterator(); iter.hasNext();) {
            var entry = iter.next();
            if ((flags & entry.getLongValue()) != 0) {
                ret.add(entry.getKey());
@@ -105,18 +114,18 @@ public class DocumentKeywordsBuilder {
    }

    public int size() {
        return words.size();
        return Math.max(wordToMeta.size(), wordToPos.size());
    }

    public WordMetadata getMetaForWord(String word) {
        return new WordMetadata(words.getLong(word));
        return new WordMetadata(wordToMeta.getLong(word));
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("[ ");
        words.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' '));
        wordToMeta.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' '));
        return sb.append(']').toString();

    }

}

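The builder now tracks exact positions per word in a RoaringBitmap, built into a positions array parallel to the word and metadata arrays. For reference, the handful of RoaringBitmap operations the new code leans on, shown standalone with the standard org.roaringbitmap API:

    import org.roaringbitmap.RoaringBitmap;

    class RoaringSketch {
        public static void main(String[] args) {
            RoaringBitmap positions = new RoaringBitmap();
            positions.add(3);
            positions.add(17);
            positions.add(3);                              // duplicates are a no-op

            System.out.println(positions.contains(17));    // true
            System.out.println(positions.getCardinality()); // 2

            // materialize the sorted positions when needed
            for (int p : positions.toArray()) {
                System.out.println("position " + p);
            }
        }
    }
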
@@ -1,5 +1,7 @@
package nu.marginalia.keyword.model;

import org.roaringbitmap.RoaringBitmap;

/** Pointer into a {@see DocumentKeywords}. It starts out before the first position,
 * forward with advancePointer().
 * */
@@ -27,6 +29,11 @@ public class DocumentKeywordsPointer {
        return keywords.metadata[pos];
    }

    /** Return the positions associated with the current position */
    public RoaringBitmap getPositions() {
        return keywords.positions[pos];
    }

    /** Advance the current position,
     * returns false if this was the
     * last position */

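With getPositions() added, a consumer walks keyword, metadata and positions in lockstep through the pointer, the same shape the updated tests below use. A short sketch assuming the builder and pointer classes as they appear in this diff:

    import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
    import org.roaringbitmap.RoaringBitmap;

    class PointerSketch {
        public static void main(String[] args) {
            var builder = new DocumentKeywordsBuilder(16); // capacity constructor shown above
            builder.addMeta("example", 1L);
            builder.addPos("example", 0);

            var keywords = builder.build();
            var ptr = keywords.newPointer();
            while (ptr.advancePointer()) {
                RoaringBitmap positions = ptr.getPositions();
                System.out.println(ptr.getKeyword() + " " + ptr.getMetadata() + " " + positions);
            }
        }
    }
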
@@ -10,6 +10,7 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.roaringbitmap.RoaringBitmap;

import java.io.IOException;
import java.net.URISyntaxException;
@@ -21,10 +22,8 @@ import java.util.Set;

class DocumentKeywordExtractorTest {

    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
            new TermFrequencyDict(WmsaHome.getLanguageModels()),
            new NgramLexicon(WmsaHome.getLanguageModels()));
    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor();
    static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

    @Test
    public void testWordPattern() {
@@ -41,24 +40,6 @@ class DocumentKeywordExtractorTest {
        Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
    }

    @Test
    public void testEmptyMetadata() throws URISyntaxException {
        var dld = se.extractSentences("""
                Some sample text, I'm not sure what even triggers this
                """, "A title perhaps?");
        var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid"));
        var keywords = keywordBuilder.build();

        var pointer = keywords.newPointer();
        while (pointer.advancePointer()) {
            if (pointer.getMetadata() == 0L) {
                System.out.println("Aha! " + pointer.getKeyword());
            }
        }

    }

    @Test
    public void testKeyboards2() throws IOException, URISyntaxException {
        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
@@ -69,7 +50,7 @@ class DocumentKeywordExtractorTest {

        var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));

        keywords.getWords().forEach((k, v) -> {
        keywords.getWordToMeta().forEach((k, v) -> {
            if (k.contains("_")) {
                System.out.println(k + " " + new WordMetadata(v));
            }
@@ -112,21 +93,22 @@ class DocumentKeywordExtractorTest {
        var keywordsBuilt = keywords.build();
        var ptr = keywordsBuilt.newPointer();

        Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
        Map<String, WordMetadata> flags = new HashMap<>();
        Map<String, RoaringBitmap> positions = new HashMap<>();

        while (ptr.advancePointer()) {
            System.out.println(ptr.getKeyword() + " " + ptr.getMetadata() + " " + ptr.getPositions());
            if (Set.of("dirty", "blues").contains(ptr.getKeyword())) {
                Assertions.assertNull(
                        dirtyAndBlues.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata()))
                );
                flags.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata()));
                positions.put(ptr.getKeyword(), ptr.getPositions());
            }
        }

        Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
        Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
        Assertions.assertTrue(flags.containsKey("dirty"));
        Assertions.assertTrue(flags.containsKey("blues"));
        Assertions.assertNotEquals(
                dirtyAndBlues.get("dirty"),
                dirtyAndBlues.get("blues")
                positions.get("dirty"),
                positions.get("blues")
        );
    }

@@ -139,8 +121,7 @@ class DocumentKeywordExtractorTest {
        doc.filter(new DomPruningFilter(0.5));

        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
                new TermFrequencyDict(WmsaHome.getLanguageModels()),
                new NgramLexicon(WmsaHome.getLanguageModels()));
                new TermFrequencyDict(WmsaHome.getLanguageModels()));
        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());

        var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));

@@ -23,7 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
class SentenceExtractorTest {
    static final LanguageModels lm = TestLanguageModels.getLanguageModels();

    static NgramLexicon ngramLexicon = new NgramLexicon(lm);
    static SentenceExtractor se = new SentenceExtractor(lm);

    @SneakyThrows
@@ -36,7 +35,7 @@ class SentenceExtractorTest {

        var dict = new TermFrequencyDict(lm);
        var url = new EdgeUrl("https://memex.marginalia.nu/");
        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon);
        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);

        for (;;) {
            long total = 0;

@@ -26,9 +26,7 @@ class SummaryExtractorTest {

    @BeforeEach
    public void setUp() {
        keywordExtractor = new DocumentKeywordExtractor(
                new TermFrequencyDict(WmsaHome.getLanguageModels()),
                new NgramLexicon(WmsaHome.getLanguageModels()));
        keywordExtractor = new DocumentKeywordExtractor();
        setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());

        summaryExtractor = new SummaryExtractor(255,

@@ -69,7 +69,7 @@ public class SideloaderProcessing {
        ret.words = details.words();

        for (String keyword : extraKeywords)
            ret.words.add(keyword, WordFlags.Subjects.asBit());
            ret.words.addMeta(keyword, WordFlags.Subjects.asBit());

        if (type == GeneratorType.WIKI) {
            ret.words.addAllSyntheticTerms(List.of("generator:wiki"));

@@ -166,7 +166,7 @@ public class RedditSideloader implements SideloadSource {
        }

        for (var keyword : extraKeywords) {
            doc.words.add(keyword, WordFlags.Subjects.asBit());
            doc.words.addMeta(keyword, WordFlags.Subjects.asBit());
        }

        // Insert topology information

@@ -22,10 +22,8 @@ import java.nio.file.Path;

public class SentenceStatisticsExperiment extends LegacyExperiment {

    NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels());
    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(
            new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon);
    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
    Path filename;
    PrintWriter writer;