(convert) Initial integration of segmentation data into the converter's keyword extraction logic

Viktor Lofgren 2024-03-19 14:28:42 +01:00
parent d8f4e7d72b
commit 0bd3365c24
22 changed files with 192 additions and 48 deletions

View File

@@ -1,7 +1,10 @@
package nu.marginalia;
+import lombok.Builder;
import java.nio.file.Path;
+@Builder
public class LanguageModels {
public final Path termFrequencies;
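Note: with Lombok's @Builder, call sites can construct LanguageModels fluently instead of through a positional constructor; the NgramExporterMain hunk further down relies on this to supply only the segments path. A minimal sketch, assuming Lombok's default builder semantics (unset fields remain null) and placeholder paths that are not from the commit:

    import java.nio.file.Path;
    import nu.marginalia.LanguageModels;

    class LanguageModelsExample {
        static LanguageModels minimal() {
            // Placeholder paths; any field not set on the builder stays null,
            // so this instance is only usable by code that reads these two files.
            return LanguageModels.builder()
                    .termFrequencies(Path.of("/models/tfreq.bin"))
                    .segments(Path.of("/models/ngram-counts.bin"))
                    .build();
        }
    }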

View File

@@ -32,6 +32,7 @@ dependencies {
implementation project(':third-party:commons-codec')
implementation project(':code:libraries:message-queue')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:functions:link-graph:api')
implementation project(':code:functions:search-query')

View File

@@ -5,7 +5,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.functions.searchquery.segmentation.NgramExtractorMain;
+import nu.marginalia.segmentation.NgramExtractorMain;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageType;
import org.slf4j.Logger;

View File

@@ -19,6 +19,7 @@ dependencies {
implementation project(':code:common:process')
implementation project(':code:features-convert:keyword-extraction')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation libs.bundles.slf4j

View File

@@ -5,6 +5,7 @@ import nu.marginalia.keyword.KeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.util.TestLanguageModels;
import org.junit.jupiter.api.Test;

View File

@@ -1,5 +1,6 @@
package nu.marginalia.keyword;
+import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.keyword.extractors.*;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.language.model.DocumentLanguageData;
@@ -15,11 +16,13 @@ public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor;
private final TermFrequencyDict dict;
+private final NgramLexicon ngramLexicon;
@Inject
-public DocumentKeywordExtractor(TermFrequencyDict dict) {
+public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) {
this.dict = dict;
+this.ngramLexicon = ngramLexicon;
this.keywordExtractor = new KeywordExtractor();
}
@@ -131,6 +134,17 @@ public class DocumentKeywordExtractor {
wordsBuilder.add(rep.word, meta);
}
+for (int i = 0; i < sent.ngrams.length; i++) {
+var ngram = sent.ngrams[i];
+var ngramStemmed = sent.ngramStemmed[i];
+long meta = metadata.getMetadataForWord(ngramStemmed);
+assert meta != 0L : "Missing meta for " + ngram;
+wordsBuilder.add(ngram, meta);
+}
}
}
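Note: the new loop treats sent.ngrams and sent.ngramStemmed as parallel arrays (the SentenceExtractor hunks below populate them together), keying the metadata lookup on the stemmed form while emitting the surface form. A minimal usage sketch of the widened constructor; the HTML snippet and URL here are placeholders, not from the commit:

    import nu.marginalia.WmsaHome;
    import nu.marginalia.keyword.DocumentKeywordExtractor;
    import nu.marginalia.language.sentence.SentenceExtractor;
    import nu.marginalia.model.EdgeUrl;
    import nu.marginalia.segmentation.NgramLexicon;
    import nu.marginalia.term_frequency_dict.TermFrequencyDict;
    import org.jsoup.Jsoup;

    class ExtractKeywordsDemo {
        public static void main(String[] args) throws Exception {
            var models = WmsaHome.getLanguageModels();

            // Both collaborators are now constructor-injected
            var extractor = new DocumentKeywordExtractor(
                    new TermFrequencyDict(models),
                    new NgramLexicon(models));
            var se = new SentenceExtractor(models);

            var doc = Jsoup.parse("<html><title>Mechanical Keyboards</title>"
                    + "<body><p>Mechanical keyboards are loud but satisfying.</p></body></html>");

            var keywords = extractor.extractKeywords(se.extractSentences(doc),
                    new EdgeUrl("https://example.com/"));

            // Underscore-joined ngrams such as "mechanical_keyboards" now show up
            // next to the ordinary single-word keywords.
            keywords.getWords().forEach((k, v) -> System.out.println(k));
        }
    }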

View File

@@ -14,7 +14,9 @@ public class KeywordPositionBitmask {
private static final int unmodulatedPortion = 16;
@Inject
-public KeywordPositionBitmask(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
+public KeywordPositionBitmask(KeywordExtractor keywordExtractor,
+DocumentLanguageData dld)
+{
// Mark the title words as position 0
for (var sent : dld.titleSentences) {
@@ -24,6 +26,10 @@ public class KeywordPositionBitmask {
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
}
+for (var ngram : sent.ngramStemmed) {
+positionMask.merge(ngram, posBit, this::bitwiseOr);
+}
for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
@@ -43,6 +49,10 @@ public class KeywordPositionBitmask {
positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
}
+for (var ngram : sent.ngramStemmed) {
+positionMask.merge(ngram, posBit, this::bitwiseOr);
+}
for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}

View File

@@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Assertions;
@@ -20,7 +21,9 @@ import java.util.Set;
class DocumentKeywordExtractorTest {
-DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
+new TermFrequencyDict(WmsaHome.getLanguageModels()),
+new NgramLexicon(WmsaHome.getLanguageModels()));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
@Test
@@ -56,6 +59,22 @@ class DocumentKeywordExtractorTest {
}
+@Test
+public void testKeyboards2() throws IOException, URISyntaxException {
+var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
+"Could not load word frequency table");
+String html = new String(resource.readAllBytes(), Charset.defaultCharset());
+var doc = Jsoup.parse(html);
+doc.filter(new DomPruningFilter(0.5));
+var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
+keywords.getWords().forEach((k, v) -> {
+if (k.contains("_")) {
+System.out.println(k + " " + new WordMetadata(v));
+}
+});
+}
@Test
public void testKeyboards() throws IOException, URISyntaxException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
@@ -119,7 +138,9 @@ class DocumentKeywordExtractorTest {
var doc = Jsoup.parse(html);
doc.filter(new DomPruningFilter(0.5));
-DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
+new TermFrequencyDict(WmsaHome.getLanguageModels()),
+new NgramLexicon(WmsaHome.getLanguageModels()));
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));

View File

@@ -3,6 +3,7 @@ package nu.marginalia.keyword;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.EdgeUrl;
@@ -20,9 +21,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
@Tag("slow")
class SentenceExtractorTest {
-final LanguageModels lm = TestLanguageModels.getLanguageModels();
+static final LanguageModels lm = TestLanguageModels.getLanguageModels();
-SentenceExtractor se = new SentenceExtractor(lm);
+static NgramLexicon ngramLexicon = new NgramLexicon(lm);
+static SentenceExtractor se = new SentenceExtractor(lm);
@SneakyThrows
public static void main(String... args) throws IOException {
@@ -32,11 +34,9 @@ class SentenceExtractorTest {
System.out.println("Running");
-SentenceExtractor se = new SentenceExtractor(lm);
var dict = new TermFrequencyDict(lm);
var url = new EdgeUrl("https://memex.marginalia.nu/");
-DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
+DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon);
for (;;) {
long total = 0;

View File

@@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.summary.heuristic.*;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;
@@ -25,7 +26,9 @@ class SummaryExtractorTest {
@BeforeEach
public void setUp() {
-keywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+keywordExtractor = new DocumentKeywordExtractor(
+new TermFrequencyDict(WmsaHome.getLanguageModels()),
+new NgramLexicon(WmsaHome.getLanguageModels()));
setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
summaryExtractor = new SummaryExtractor(255,

View File

@@ -4,7 +4,7 @@ import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
-import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
+import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.apache.commons.lang3.StringUtils;

View File

@@ -16,12 +16,24 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
public final String[] wordsLowerCase;
public final String[] posTags;
public final String[] stemmedWords;
+public final String[] ngrams;
+public final String[] ngramStemmed;
private final BitSet isStopWord;
public SoftReference<WordSpan[]> keywords;
-public DocumentSentence(String originalSentence, String[] words, int[] separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords) {
+public DocumentSentence(String originalSentence,
+String[] words,
+int[] separators,
+String[] wordsLowerCase,
+String[] posTags,
+String[] stemmedWords,
+String[] ngrams,
+String[] ngramsStemmed
+)
+{
this.originalSentence = originalSentence;
this.words = words;
this.separators = separators;
@@ -31,6 +43,9 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
isStopWord = new BitSet(words.length);
+this.ngrams = ngrams;
+this.ngramStemmed = ngramsStemmed;
for (int i = 0; i < words.length; i++) {
if (WordPatterns.isStopWord(words[i]))
isStopWord.set(i);
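Note: the first six constructor arguments are parallel per-word arrays, while ngrams and ngramsStemmed are parallel to each other but independent of the word count. A construction sketch; all values here are illustrative, and the separator encoding and POS tags are assumptions, not taken from the commit:

    import nu.marginalia.language.model.DocumentSentence;

    class DocumentSentenceDemo {
        static DocumentSentence example() {
            return new DocumentSentence(
                    "Mechanical keyboards are loud.",
                    new String[] { "Mechanical", "keyboards", "are", "loud" },
                    new int[] { 1, 1, 1, 0 },  // separator codes, assumed for illustration
                    new String[] { "mechanical", "keyboards", "are", "loud" },
                    new String[] { "JJ", "NNS", "VBP", "JJ" },
                    new String[] { "mechan", "keyboard", "ar", "loud" },
                    new String[] { "mechanical_keyboards" },  // ngrams, underscore-joined
                    new String[] { "mechan_keyboard" }        // ngramsStemmed, parallel to ngrams
            );
        }
    }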

View File

@@ -4,6 +4,7 @@ import com.github.datquocnguyen.RDRPOSTagger;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
+import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence;
import opennlp.tools.sentdetect.SentenceDetectorME;
@@ -32,6 +33,8 @@ public class SentenceExtractor {
private SentenceDetectorME sentenceDetector;
private static RDRPOSTagger rdrposTagger;
+private static NgramLexicon ngramLexicon = null;
private final PorterStemmer porterStemmer = new PorterStemmer();
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
@@ -45,7 +48,8 @@ public class SentenceExtractor {
private static final int MAX_TEXT_LENGTH = 65536;
@SneakyThrows @Inject
-public SentenceExtractor(LanguageModels models) {
+public SentenceExtractor(LanguageModels models)
+{
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
var sentenceModel = new SentenceModel(modelIn);
sentenceDetector = new SentenceDetectorME(sentenceModel);
@@ -55,7 +59,9 @@ public class SentenceExtractor {
logger.error("Could not initialize sentence detector", ex);
}
-synchronized (RDRPOSTagger.class) {
+synchronized (this) {
+ngramLexicon = new NgramLexicon(models);
try {
rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
}
try {
rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
}
@@ -128,8 +134,34 @@ public class SentenceExtractor {
var seps = wordsAndSeps.separators;
var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
+List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, words);
+String[] ngramsWords = new String[ngrams.size()];
+String[] ngramsStemmedWords = new String[ngrams.size()];
+for (int i = 0; i < ngrams.size(); i++) {
+String[] ngram = ngrams.get(i);
+StringJoiner ngramJoiner = new StringJoiner("_");
+StringJoiner stemmedJoiner = new StringJoiner("_");
+for (String s : ngram) {
+ngramJoiner.add(s);
+stemmedJoiner.add(porterStemmer.stem(s));
+}
+ngramsWords[i] = ngramJoiner.toString();
+ngramsStemmedWords[i] = stemmedJoiner.toString();
+}
return new DocumentSentence(
-SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
+SentenceExtractorStringUtils.sanitizeString(text),
+words,
+seps,
+lc,
+rdrposTagger.tagsForEnSentence(words),
+stemSentence(lc),
+ngramsWords,
+ngramsStemmedWords
);
}
@@ -195,7 +227,35 @@ public class SentenceExtractor {
fullString = "";
}
-ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
+List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]);
+String[] ngramsWords = new String[ngrams.size()];
+String[] ngramsStemmedWords = new String[ngrams.size()];
+for (int j = 0; j < ngrams.size(); j++) {
+String[] ngram = ngrams.get(j);
+StringJoiner ngramJoiner = new StringJoiner("_");
+StringJoiner stemmedJoiner = new StringJoiner("_");
+for (String s : ngram) {
+ngramJoiner.add(s);
+stemmedJoiner.add(porterStemmer.stem(s));
+}
+ngramsWords[j] = ngramJoiner.toString();
+ngramsStemmedWords[j] = stemmedJoiner.toString();
+}
+ret[i] = new DocumentSentence(fullString,
+tokens[i],
+separators[i],
+tokensLc[i],
+posTags[i],
+stemmedWords[i],
+ngramsWords,
+ngramsStemmedWords
+);
}
return ret;
}
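Note: the join-and-stem step above is now duplicated across the two DocumentSentence construction paths. A shared helper that both call sites could use, mirroring the document's own porterStemmer.stem calls (a refactoring sketch, not code from the commit):

    import java.util.List;
    import java.util.StringJoiner;
    import ca.rmen.porterstemmer.PorterStemmer;

    final class NgramForms {
        /** Returns { surface forms, stemmed forms }, both underscore-joined. */
        static String[][] joinAndStem(List<String[]> ngrams, PorterStemmer porterStemmer) {
            String[] words = new String[ngrams.size()];
            String[] stemmed = new String[ngrams.size()];

            for (int i = 0; i < ngrams.size(); i++) {
                StringJoiner ngramJoiner = new StringJoiner("_");
                StringJoiner stemmedJoiner = new StringJoiner("_");
                for (String s : ngrams.get(i)) {
                    ngramJoiner.add(s);
                    stemmedJoiner.add(porterStemmer.stem(s));
                }
                words[i] = ngramJoiner.toString();
                stemmed[i] = stemmedJoiner.toString();
            }
            return new String[][] { words, stemmed };
        }
    }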

View File

@@ -16,6 +16,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
dependencies {
implementation project(':third-party:rdrpostagger')
implementation project(':third-party:porterstemmer')
+implementation project(':third-party:commons-codec')
+implementation project(':third-party:openzim')
implementation project(':third-party:monkey-patch-opennlp')
implementation project(':code:common:model')
implementation project(':code:common:config')

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
import ca.rmen.porterstemmer.PorterStemmer;
import org.apache.commons.lang3.StringUtils;

View File

@@ -1,11 +1,11 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
import nu.marginalia.hash.MurmurHash3_128;
/** A group of hash functions that can be used to hash a sequence of strings,
* that also has an inverse operation that can be used to remove a previously applied
* string from the sequence. */
-sealed interface HasherGroup {
+public sealed interface HasherGroup {
/** Apply a hash to the accumulator */
long apply(long acc, long add);

View File

@@ -1,7 +1,6 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
import nu.marginalia.WmsaHome;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.LanguageModels;
import java.io.IOException;
import java.nio.file.Path;
@@ -15,10 +14,11 @@ public class NgramExporterMain {
}
static void trial() throws IOException {
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-NgramLexicon lexicon = new NgramLexicon();
-lexicon.loadCounts(Path.of("/home/vlofgren/ngram-counts.bin"));
+NgramLexicon lexicon = new NgramLexicon(
+LanguageModels.builder()
+.segments(Path.of("/home/vlofgren/ngram-counts.bin"))
+.build()
+);
System.out.println("Loaded!");

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.hash.MurmurHash3_128;

View File

@@ -1,11 +1,13 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
import it.unimi.dsi.fastutil.longs.LongHash;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import nu.marginalia.LanguageModels;
+import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
@@ -16,11 +18,9 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@Singleton
public class NgramLexicon {
-private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap(
-100_000_000,
-new KeyIsAlreadyHashStrategy()
-);
+private final Long2IntOpenCustomHashMap counts;
private final LongOpenHashSet permutations = new LongOpenHashSet();
private static final HasherGroup orderedHasher = HasherGroup.ordered();
@@ -28,17 +28,35 @@ public class NgramLexicon {
@Inject
public NgramLexicon(LanguageModels models) {
-try {
-loadCounts(models.segments);
+try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(models.segments)))) {
+long size = dis.readInt();
+counts = new Long2IntOpenCustomHashMap(
+(int) size,
+new KeyIsAlreadyHashStrategy()
+);
+for (int i = 0; i < size; i++) {
+counts.put(dis.readLong(), dis.readInt());
+}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
+public NgramLexicon() {
+counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy());
+}
+public List<String[]> findSegmentsStrings(int minLength, int maxLength, String... parts) {
+List<SentenceSegment> segments = new ArrayList<>();
+for (int i = minLength; i <= maxLength; i++) {
+segments.addAll(findSegments(i, parts));
+}
+return segments.stream().map(seg -> seg.project(parts)).toList();
+}
public List<SentenceSegment> findSegments(int length, String... parts) {
// Don't look for ngrams longer than the sentence
if (parts.length < length) return List.of();
@@ -96,15 +114,6 @@ public class NgramLexicon {
permutations.add(hashUnordered);
}
-public void loadCounts(Path path) throws IOException {
-try (var dis = new DataInputStream(Files.newInputStream(path))) {
-long size = dis.readInt();
-for (int i = 0; i < size; i++) {
-counts.put(dis.readLong(), dis.readInt());
-}
-}
-}
public void loadPermutations(Path path) throws IOException {
try (var dis = new DataInputStream(Files.newInputStream(path))) {

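Note: the constructor's reading loop implies a simple binary layout for models.segments: one int for the entry count, followed by (long key, int count) pairs, where the keys are already-hashed ngrams (hence KeyIsAlreadyHashStrategy). A writer sketch matching that layout, as assumed from the reader; it is not part of the commit:

    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.Map;

    class NgramCountsWriter {
        static void write(Path path, Map<Long, Integer> counts) throws IOException {
            try (var dos = new DataOutputStream(Files.newOutputStream(path))) {
                dos.writeInt(counts.size());          // read back via dis.readInt()
                for (var entry : counts.entrySet()) {
                    dos.writeLong(entry.getKey());    // pre-hashed ngram key
                    dos.writeInt(entry.getValue());   // occurrence count
                }
            }
        }
    }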
View File

@@ -1,5 +1,6 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
+import nu.marginalia.segmentation.HasherGroup;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;

View File

@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

View File

@@ -8,6 +8,7 @@ import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.tools.LegacyExperiment;
import org.jsoup.Jsoup;
@@ -21,8 +22,10 @@ import java.nio.file.Path;
public class SentenceStatisticsExperiment extends LegacyExperiment {
+NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels());
SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(
+new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon);
Path filename;
PrintWriter writer;