Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)

(convert) Initial integration of segmentation data into the converter's keyword extraction logic

commit 0bd3365c24, parent d8f4e7d72b
LanguageModels.java

@@ -1,7 +1,10 @@
 package nu.marginalia;
 
+import lombok.Builder;
+
 import java.nio.file.Path;
 
+@Builder
 public class LanguageModels {
     public final Path termFrequencies;
 
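Lombok's @Builder on LanguageModels generates a fluent builder, which is what lets the NgramExporterMain change further down in this diff construct a model object carrying only the segments path. A minimal sketch of the generated API (the segments field is assumed from its use later in this commit):

    // Sketch of the Lombok-generated builder; only fields that are set
    // get non-null values, the rest stay null.
    LanguageModels lm = LanguageModels.builder()
            .segments(Path.of("/home/vlofgren/ngram-counts.bin"))
            .build();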
build.gradle

@@ -32,6 +32,7 @@ dependencies {
     implementation project(':third-party:commons-codec')
 
     implementation project(':code:libraries:message-queue')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation project(':code:functions:link-graph:api')
     implementation project(':code:functions:search-query')
@@ -5,7 +5,7 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.functions.searchquery.segmentation.NgramExtractorMain;
+import nu.marginalia.segmentation.NgramExtractorMain;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
build.gradle

@@ -19,6 +19,7 @@ dependencies {
     implementation project(':code:common:process')
     implementation project(':code:features-convert:keyword-extraction')
     implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
 
 
     implementation libs.bundles.slf4j
@@ -5,6 +5,7 @@ import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.util.TestLanguageModels;
 import org.junit.jupiter.api.Test;
 
DocumentKeywordExtractor.java

@@ -1,5 +1,6 @@
 package nu.marginalia.keyword;
 
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
@@ -15,11 +16,13 @@ public class DocumentKeywordExtractor {
 
     private final KeywordExtractor keywordExtractor;
     private final TermFrequencyDict dict;
+    private final NgramLexicon ngramLexicon;
 
 
     @Inject
-    public DocumentKeywordExtractor(TermFrequencyDict dict) {
+    public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) {
         this.dict = dict;
+        this.ngramLexicon = ngramLexicon;
         this.keywordExtractor = new KeywordExtractor();
     }
 
@@ -131,6 +134,17 @@ public class DocumentKeywordExtractor {
 
                 wordsBuilder.add(rep.word, meta);
             }
 
+            for (int i = 0; i < sent.ngrams.length; i++) {
+                var ngram = sent.ngrams[i];
+                var ngramStemmed = sent.ngramStemmed[i];
+
+                long meta = metadata.getMetadataForWord(ngramStemmed);
+                assert meta != 0L : "Missing meta for " + ngram;
+
+                wordsBuilder.add(ngram, meta);
+            }
+
         }
     }
 
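The constructor change ripples through every call site: a DocumentKeywordExtractor now has to be handed an NgramLexicon next to the TermFrequencyDict, and the test and experiment updates below all follow the same wiring. A sketch of the new construction pattern, mirroring those call sites:

    var lm = WmsaHome.getLanguageModels();
    var extractor = new DocumentKeywordExtractor(
            new TermFrequencyDict(lm),
            new NgramLexicon(lm));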
KeywordPositionBitmask.java

@@ -14,7 +14,9 @@ public class KeywordPositionBitmask {
     private static final int unmodulatedPortion = 16;
 
     @Inject
-    public KeywordPositionBitmask(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
+    public KeywordPositionBitmask(KeywordExtractor keywordExtractor,
+                                  DocumentLanguageData dld)
+    {
 
         // Mark the title words as position 0
         for (var sent : dld.titleSentences) {
@@ -24,6 +26,10 @@ public class KeywordPositionBitmask {
                 positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
             }
 
+            for (var ngram : sent.ngramStemmed) {
+                positionMask.merge(ngram, posBit, this::bitwiseOr);
+            }
+
             for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                 positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
             }
@@ -43,6 +49,10 @@ public class KeywordPositionBitmask {
                 positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
             }
 
+            for (var ngram : sent.ngramStemmed) {
+                positionMask.merge(ngram, posBit, this::bitwiseOr);
+            }
+
             for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                 positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
             }
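Ngrams are folded into the position bitmask exactly like single words: Map.merge with a bitwise OR accumulates one bit per position where the term was sighted, rather than overwriting earlier sightings. A standalone illustration of the idiom (class name and bit values here are illustrative, not from the codebase):

    import java.util.HashMap;
    import java.util.Map;

    public class MergeOrDemo {
        public static void main(String[] args) {
            Map<String, Long> positionMask = new HashMap<>();

            // First call inserts the bit; the second ORs another bit in.
            positionMask.merge("spider_man", 1L << 2, (a, b) -> a | b);
            positionMask.merge("spider_man", 1L << 5, (a, b) -> a | b);

            // Prints 100100: both positions are recorded in one long.
            System.out.println(Long.toBinaryString(positionMask.get("spider_man")));
        }
    }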
DocumentKeywordExtractorTest.java

@@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Assertions;
@@ -20,7 +21,9 @@ import java.util.Set;
 
 class DocumentKeywordExtractorTest {
 
-    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
+            new TermFrequencyDict(WmsaHome.getLanguageModels()),
+            new NgramLexicon(WmsaHome.getLanguageModels()));
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
     @Test
@@ -56,6 +59,22 @@ class DocumentKeywordExtractorTest {
 
     }
 
+    @Test
+    public void testKeyboards2() throws IOException, URISyntaxException {
+        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
+                "Could not load word frequency table");
+        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
+        var doc = Jsoup.parse(html);
+        doc.filter(new DomPruningFilter(0.5));
+
+        var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
+
+        keywords.getWords().forEach((k, v) -> {
+            if (k.contains("_")) {
+                System.out.println(k + " " + new WordMetadata(v));
+            }
+        });
+    }
     @Test
     public void testKeyboards() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
@@ -119,7 +138,9 @@ class DocumentKeywordExtractorTest {
         var doc = Jsoup.parse(html);
         doc.filter(new DomPruningFilter(0.5));
 
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
+                new TermFrequencyDict(WmsaHome.getLanguageModels()),
+                new NgramLexicon(WmsaHome.getLanguageModels()));
         SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));
SentenceExtractorTest.java

@@ -3,6 +3,7 @@ package nu.marginalia.keyword;
 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.model.EdgeUrl;
@@ -20,9 +21,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 @Tag("slow")
 class SentenceExtractorTest {
-    final LanguageModels lm = TestLanguageModels.getLanguageModels();
+    static final LanguageModels lm = TestLanguageModels.getLanguageModels();
 
-    SentenceExtractor se = new SentenceExtractor(lm);
+    static NgramLexicon ngramLexicon = new NgramLexicon(lm);
+    static SentenceExtractor se = new SentenceExtractor(lm);
 
     @SneakyThrows
     public static void main(String... args) throws IOException {
@@ -32,11 +34,9 @@ class SentenceExtractorTest {
 
         System.out.println("Running");
 
-        SentenceExtractor se = new SentenceExtractor(lm);
-
         var dict = new TermFrequencyDict(lm);
         var url = new EdgeUrl("https://memex.marginalia.nu/");
-        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
+        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon);
 
         for (;;) {
             long total = 0;
SummaryExtractorTest.java

@@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.summary.heuristic.*;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.jsoup.Jsoup;
@@ -25,7 +26,9 @@ class SummaryExtractorTest {
 
     @BeforeEach
     public void setUp() {
-        keywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+        keywordExtractor = new DocumentKeywordExtractor(
+                new TermFrequencyDict(WmsaHome.getLanguageModels()),
+                new NgramLexicon(WmsaHome.getLanguageModels()));
         setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         summaryExtractor = new SummaryExtractor(255,
@@ -4,7 +4,7 @@ import ca.rmen.porterstemmer.PorterStemmer;
 import com.google.inject.Inject;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
-import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.apache.commons.lang3.StringUtils;
 
DocumentSentence.java

@@ -16,12 +16,24 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
     public final String[] wordsLowerCase;
     public final String[] posTags;
     public final String[] stemmedWords;
+    public final String[] ngrams;
+    public final String[] ngramStemmed;
 
     private final BitSet isStopWord;
 
 
     public SoftReference<WordSpan[]> keywords;
 
-    public DocumentSentence(String originalSentence, String[] words, int[] separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords) {
+    public DocumentSentence(String originalSentence,
+                            String[] words,
+                            int[] separators,
+                            String[] wordsLowerCase,
+                            String[] posTags,
+                            String[] stemmedWords,
+                            String[] ngrams,
+                            String[] ngramsStemmed
+                            )
+    {
         this.originalSentence = originalSentence;
         this.words = words;
         this.separators = separators;
@@ -31,6 +43,9 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
 
         isStopWord = new BitSet(words.length);
 
+        this.ngrams = ngrams;
+        this.ngramStemmed = ngramsStemmed;
+
         for (int i = 0; i < words.length; i++) {
             if (WordPatterns.isStopWord(words[i]))
                 isStopWord.set(i);
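DocumentSentence now exposes the recognized segments as two parallel arrays: ngrams[i] holds the underscore-joined surface form and ngramStemmed[i] its stemmed counterpart, which is the key the keyword extractor uses for its metadata lookup. A sketch of the contract the consumers above rely on (sentence stands in for any DocumentSentence instance):

    // Parallel-array contract assumed by DocumentKeywordExtractor above.
    assert sentence.ngrams.length == sentence.ngramStemmed.length;
    for (int i = 0; i < sentence.ngrams.length; i++) {
        String surface = sentence.ngrams[i];        // e.g. "mechanical_keyboards"
        String stemmed = sentence.ngramStemmed[i];  // key for metadata lookup
    }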
SentenceExtractor.java

@@ -4,6 +4,7 @@ import com.github.datquocnguyen.RDRPOSTagger;
 import gnu.trove.map.hash.TObjectIntHashMap;
 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import opennlp.tools.sentdetect.SentenceDetectorME;
@@ -32,6 +33,8 @@ public class SentenceExtractor {
     private SentenceDetectorME sentenceDetector;
     private static RDRPOSTagger rdrposTagger;
 
+    private static NgramLexicon ngramLexicon = null;
+
     private final PorterStemmer porterStemmer = new PorterStemmer();
     private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
 
@@ -45,7 +48,8 @@ public class SentenceExtractor {
     private static final int MAX_TEXT_LENGTH = 65536;
 
     @SneakyThrows @Inject
-    public SentenceExtractor(LanguageModels models) {
+    public SentenceExtractor(LanguageModels models)
+    {
         try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
             var sentenceModel = new SentenceModel(modelIn);
             sentenceDetector = new SentenceDetectorME(sentenceModel);
@@ -55,7 +59,9 @@ public class SentenceExtractor {
             logger.error("Could not initialize sentence detector", ex);
         }
 
-        synchronized (RDRPOSTagger.class) {
+        synchronized (this) {
+            ngramLexicon = new NgramLexicon(models);
+
             try {
                 rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
             }
@@ -128,8 +134,34 @@ public class SentenceExtractor {
         var seps = wordsAndSeps.separators;
         var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
 
+        List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, words);
+
+        String[] ngramsWords = new String[ngrams.size()];
+        String[] ngramsStemmedWords = new String[ngrams.size()];
+        for (int i = 0; i < ngrams.size(); i++) {
+            String[] ngram = ngrams.get(i);
+
+            StringJoiner ngramJoiner = new StringJoiner("_");
+            StringJoiner stemmedJoiner = new StringJoiner("_");
+            for (String s : ngram) {
+                ngramJoiner.add(s);
+                stemmedJoiner.add(porterStemmer.stem(s));
+            }
+
+            ngramsWords[i] = ngramJoiner.toString();
+            ngramsStemmedWords[i] = stemmedJoiner.toString();
+        }
+
+
         return new DocumentSentence(
-            SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
+            SentenceExtractorStringUtils.sanitizeString(text),
+            words,
+            seps,
+            lc,
+            rdrposTagger.tagsForEnSentence(words),
+            stemSentence(lc),
+            ngramsWords,
+            ngramsStemmedWords
         );
     }
 
@@ -195,7 +227,35 @@ public class SentenceExtractor {
                 fullString = "";
             }
 
-            ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
+            List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]);
+
+            String[] ngramsWords = new String[ngrams.size()];
+            String[] ngramsStemmedWords = new String[ngrams.size()];
+
+            for (int j = 0; j < ngrams.size(); j++) {
+                String[] ngram = ngrams.get(j);
+
+                StringJoiner ngramJoiner = new StringJoiner("_");
+                StringJoiner stemmedJoiner = new StringJoiner("_");
+                for (String s : ngram) {
+                    ngramJoiner.add(s);
+                    stemmedJoiner.add(porterStemmer.stem(s));
+                }
+
+                ngramsWords[j] = ngramJoiner.toString();
+                ngramsStemmedWords[j] = stemmedJoiner.toString();
+            }
+
+
+            ret[i] = new DocumentSentence(fullString,
+                tokens[i],
+                separators[i],
+                tokensLc[i],
+                posTags[i],
+                stemmedWords[i],
+                ngramsWords,
+                ngramsStemmedWords
+            );
         }
         return ret;
     }
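Both construction paths above run the same joining step: each segment the lexicon recognizes becomes an underscore-joined surface keyword plus a stemmed twin. A self-contained sketch of that step, assuming the stem(String) call used in the diff above:

    import ca.rmen.porterstemmer.PorterStemmer;
    import java.util.StringJoiner;

    public class NgramJoinDemo {
        public static void main(String[] args) {
            PorterStemmer porterStemmer = new PorterStemmer();
            String[] ngram = { "mechanical", "keyboards" };

            // Join the raw tokens and their stemmed forms with underscores,
            // producing the two parallel representations DocumentSentence stores.
            StringJoiner ngramJoiner = new StringJoiner("_");
            StringJoiner stemmedJoiner = new StringJoiner("_");
            for (String s : ngram) {
                ngramJoiner.add(s);
                stemmedJoiner.add(porterStemmer.stem(s));
            }

            System.out.println(ngramJoiner);   // mechanical_keyboards
            System.out.println(stemmedJoiner); // stemmed form, e.g. mechan_keyboard
        }
    }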
build.gradle

@@ -16,6 +16,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
 dependencies {
     implementation project(':third-party:rdrpostagger')
     implementation project(':third-party:porterstemmer')
+    implementation project(':third-party:commons-codec')
+    implementation project(':third-party:openzim')
     implementation project(':third-party:monkey-patch-opennlp')
     implementation project(':code:common:model')
     implementation project(':code:common:config')
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import org.apache.commons.lang3.StringUtils;
HasherGroup.java

@@ -1,11 +1,11 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import nu.marginalia.hash.MurmurHash3_128;
 
 /** A group of hash functions that can be used to hash a sequence of strings,
  * that also has an inverse operation that can be used to remove a previously applied
  * string from the sequence. */
-sealed interface HasherGroup {
+public sealed interface HasherGroup {
     /** Apply a hash to the accumulator */
     long apply(long acc, long add);
 
NgramExporterMain.java

@@ -1,7 +1,6 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
-import nu.marginalia.WmsaHome;
-import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.LanguageModels;
 
 import java.io.IOException;
 import java.nio.file.Path;
@@ -15,10 +14,11 @@ public class NgramExporterMain {
     }
 
     static void trial() throws IOException {
-        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-
-        NgramLexicon lexicon = new NgramLexicon();
-        lexicon.loadCounts(Path.of("/home/vlofgren/ngram-counts.bin"));
+        NgramLexicon lexicon = new NgramLexicon(
+                LanguageModels.builder()
+                        .segments(Path.of("/home/vlofgren/ngram-counts.bin"))
+                        .build()
+        );
 
         System.out.println("Loaded!");
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import it.unimi.dsi.fastutil.longs.*;
 import nu.marginalia.hash.MurmurHash3_128;
NgramLexicon.java

@@ -1,11 +1,13 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
 import it.unimi.dsi.fastutil.longs.LongHash;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import nu.marginalia.LanguageModels;
 
+import java.io.BufferedInputStream;
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
@@ -16,11 +18,9 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
+@Singleton
 public class NgramLexicon {
-    private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap(
-            100_000_000,
-            new KeyIsAlreadyHashStrategy()
-    );
+    private final Long2IntOpenCustomHashMap counts;
     private final LongOpenHashSet permutations = new LongOpenHashSet();
 
     private static final HasherGroup orderedHasher = HasherGroup.ordered();
@@ -28,15 +28,33 @@ public class NgramLexicon {
 
     @Inject
     public NgramLexicon(LanguageModels models) {
-        try {
-            loadCounts(models.segments);
+        try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(models.segments)))) {
+            long size = dis.readInt();
+            counts = new Long2IntOpenCustomHashMap(
+                    (int) size,
+                    new KeyIsAlreadyHashStrategy()
+            );
+
+            for (int i = 0; i < size; i++) {
+                counts.put(dis.readLong(), dis.readInt());
+            }
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
     }
 
     public NgramLexicon() {
+        counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy());
+    }
+
+    public List<String[]> findSegmentsStrings(int minLength, int maxLength, String... parts) {
+        List<SentenceSegment> segments = new ArrayList<>();
+
+        for (int i = minLength; i <= maxLength; i++) {
+            segments.addAll(findSegments(i, parts));
+        }
+
+        return segments.stream().map(seg -> seg.project(parts)).toList();
     }
 
     public List<SentenceSegment> findSegments(int length, String... parts) {
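With this change the lexicon is fully initialized at injection time: the counts table is sized from the file header and streamed in through a BufferedInputStream (loadCounts is deleted in the next hunk), and the new findSegmentsStrings wrapper sweeps every segment length in one call. A usage sketch mirroring the call sites elsewhere in this commit:

    NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels());

    // All known segments of 2..12 tokens, projected back onto the input words:
    List<String[]> segments = lexicon.findSegmentsStrings(
            2, 12, "all", "you", "need", "to", "know");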
@@ -96,15 +114,6 @@ public class NgramLexicon {
         permutations.add(hashUnordered);
     }
 
-    public void loadCounts(Path path) throws IOException {
-        try (var dis = new DataInputStream(Files.newInputStream(path))) {
-            long size = dis.readInt();
-
-            for (int i = 0; i < size; i++) {
-                counts.put(dis.readLong(), dis.readInt());
-            }
-        }
-    }
-
     public void loadPermutations(Path path) throws IOException {
         try (var dis = new DataInputStream(Files.newInputStream(path))) {
@@ -1,5 +1,6 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
+import nu.marginalia.segmentation.HasherGroup;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
SentenceStatisticsExperiment.java

@@ -8,6 +8,7 @@ import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.tools.LegacyExperiment;
 import org.jsoup.Jsoup;
@@ -21,8 +22,10 @@ import java.nio.file.Path;
 
 public class SentenceStatisticsExperiment extends LegacyExperiment {
 
+    NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels());
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(
+            new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon);
     Path filename;
     PrintWriter writer;
 