Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)

(convert) Initial integration of segmentation data into the converter's keyword extraction logic

parent d8f4e7d72b
commit 0bd3365c24
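
In short: DocumentKeywordExtractor now takes an NgramLexicon alongside the TermFrequencyDict, SentenceExtractor attaches underscore-joined n-gram terms to every DocumentSentence, and the segmentation classes move from nu.marginalia.functions.searchquery.segmentation to nu.marginalia.segmentation. A minimal wiring sketch pieced together from the test changes in this commit (the class name, example document, and URL are illustrative, not part of the commit):

    import nu.marginalia.WmsaHome;
    import nu.marginalia.keyword.DocumentKeywordExtractor;
    import nu.marginalia.language.sentence.SentenceExtractor;
    import nu.marginalia.model.EdgeUrl;
    import nu.marginalia.segmentation.NgramLexicon;
    import nu.marginalia.term_frequency_dict.TermFrequencyDict;
    import org.jsoup.Jsoup;

    class SegmentationWiringSketch {
        public static void main(String[] args) throws Exception {
            var models = WmsaHome.getLanguageModels();

            // Both the term frequency dictionary and the new n-gram lexicon
            // are built from the same LanguageModels configuration.
            var extractor = new DocumentKeywordExtractor(
                    new TermFrequencyDict(models),
                    new NgramLexicon(models));

            var se = new SentenceExtractor(models);
            var doc = Jsoup.parse("<html><title>Mechanical keyboards</title><body>...</body></html>");

            var keywords = extractor.extractKeywords(se.extractSentences(doc),
                    new EdgeUrl("https://www.example.com/"));

            // Segmented n-grams surface as underscore-joined terms, e.g. "mechanical_keyboards"
            keywords.getWords().forEach((k, v) -> System.out.println(k + " " + v));
        }
    }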
@@ -1,7 +1,10 @@
 package nu.marginalia;
 
+import lombok.Builder;
+
 import java.nio.file.Path;
 
+@Builder
 public class LanguageModels {
     public final Path termFrequencies;
@@ -32,6 +32,7 @@ dependencies {
     implementation project(':third-party:commons-codec')
 
     implementation project(':code:libraries:message-queue')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation project(':code:functions:link-graph:api')
     implementation project(':code:functions:search-query')
@@ -5,7 +5,7 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.functions.searchquery.segmentation.NgramExtractorMain;
+import nu.marginalia.segmentation.NgramExtractorMain;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
@@ -19,6 +19,7 @@ dependencies {
     implementation project(':code:common:process')
     implementation project(':code:features-convert:keyword-extraction')
     implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation libs.bundles.slf4j
@@ -5,6 +5,7 @@ import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.util.TestLanguageModels;
 import org.junit.jupiter.api.Test;
@@ -1,5 +1,6 @@
 package nu.marginalia.keyword;
 
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
@@ -15,11 +16,13 @@ public class DocumentKeywordExtractor {
 
     private final KeywordExtractor keywordExtractor;
     private final TermFrequencyDict dict;
+    private final NgramLexicon ngramLexicon;
 
     @Inject
-    public DocumentKeywordExtractor(TermFrequencyDict dict) {
+    public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) {
         this.dict = dict;
+        this.ngramLexicon = ngramLexicon;
         this.keywordExtractor = new KeywordExtractor();
     }
@@ -131,6 +134,17 @@ public class DocumentKeywordExtractor {
 
                 wordsBuilder.add(rep.word, meta);
             }
+
+            for (int i = 0; i < sent.ngrams.length; i++) {
+                var ngram = sent.ngrams[i];
+                var ngramStemmed = sent.ngramStemmed[i];
+
+                long meta = metadata.getMetadataForWord(ngramStemmed);
+                assert meta != 0L : "Missing meta for " + ngram;
+
+                wordsBuilder.add(ngram, meta);
+            }
         }
     }
@@ -14,7 +14,9 @@ public class KeywordPositionBitmask {
     private static final int unmodulatedPortion = 16;
 
     @Inject
-    public KeywordPositionBitmask(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
+    public KeywordPositionBitmask(KeywordExtractor keywordExtractor,
+                                  DocumentLanguageData dld)
+    {
 
         // Mark the title words as position 0
         for (var sent : dld.titleSentences) {
@@ -24,6 +26,10 @@ public class KeywordPositionBitmask {
                 positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
             }
 
+            for (var ngram : sent.ngramStemmed) {
+                positionMask.merge(ngram, posBit, this::bitwiseOr);
+            }
+
             for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                 positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
             }
@@ -43,6 +49,10 @@ public class KeywordPositionBitmask {
                 positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
            }
 
+            for (var ngram : sent.ngramStemmed) {
+                positionMask.merge(ngram, posBit, this::bitwiseOr);
+            }
+
             for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                 positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
             }
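
The pattern in both hunks above is the same: positionMask maps a stemmed term to a 64-bit mask, and merge(key, posBit, this::bitwiseOr) ORs in one bit per (modulated) sentence position, with ngrams now treated exactly like single words. A toy illustration of that accumulation outside the real KeywordPositionBitmask (the bit values here are made up):

    import java.util.HashMap;
    import java.util.Map;

    class PositionMaskSketch {
        public static void main(String[] args) {
            Map<String, Long> positionMask = new HashMap<>();

            long titleBit = 1L;      // position 0 is reserved for title words
            long bodyBit = 1L << 3;  // some later position in the body

            // merge() with bitwise OR accumulates every position a term was seen at
            positionMask.merge("mechanical_keyboard", titleBit, (a, b) -> a | b);
            positionMask.merge("mechanical_keyboard", bodyBit, (a, b) -> a | b);

            // prints 1001: the ngram occurred in the title and at body position 3
            System.out.println(Long.toBinaryString(positionMask.get("mechanical_keyboard")));
        }
    }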
@@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Assertions;
@@ -20,7 +21,9 @@ import java.util.Set;
 
 class DocumentKeywordExtractorTest {
 
-    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
+            new TermFrequencyDict(WmsaHome.getLanguageModels()),
+            new NgramLexicon(WmsaHome.getLanguageModels()));
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
     @Test
@@ -56,6 +59,22 @@ class DocumentKeywordExtractorTest {
 
     }
 
+    @Test
+    public void testKeyboards2() throws IOException, URISyntaxException {
+        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
+                "Could not load word frequency table");
+        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
+        var doc = Jsoup.parse(html);
+        doc.filter(new DomPruningFilter(0.5));
+
+        var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
+
+        keywords.getWords().forEach((k, v) -> {
+            if (k.contains("_")) {
+                System.out.println(k + " " + new WordMetadata(v));
+            }
+        });
+    }
+
     @Test
     public void testKeyboards() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
@@ -119,7 +138,9 @@ class DocumentKeywordExtractorTest {
         var doc = Jsoup.parse(html);
         doc.filter(new DomPruningFilter(0.5));
 
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
+                new TermFrequencyDict(WmsaHome.getLanguageModels()),
+                new NgramLexicon(WmsaHome.getLanguageModels()));
         SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));
@@ -3,6 +3,7 @@ package nu.marginalia.keyword;
 
 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.model.EdgeUrl;
@@ -20,9 +21,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 @Tag("slow")
 class SentenceExtractorTest {
-    final LanguageModels lm = TestLanguageModels.getLanguageModels();
+    static final LanguageModels lm = TestLanguageModels.getLanguageModels();
 
-    SentenceExtractor se = new SentenceExtractor(lm);
+    static NgramLexicon ngramLexicon = new NgramLexicon(lm);
+    static SentenceExtractor se = new SentenceExtractor(lm);
 
     @SneakyThrows
     public static void main(String... args) throws IOException {
@@ -32,11 +34,9 @@ class SentenceExtractorTest {
 
         System.out.println("Running");
 
-        SentenceExtractor se = new SentenceExtractor(lm);
-
         var dict = new TermFrequencyDict(lm);
         var url = new EdgeUrl("https://memex.marginalia.nu/");
-        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
+        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon);
 
         for (;;) {
            long total = 0;
@@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.summary.heuristic.*;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.jsoup.Jsoup;
@@ -25,7 +26,9 @@ class SummaryExtractorTest {
 
     @BeforeEach
     public void setUp() {
-        keywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+        keywordExtractor = new DocumentKeywordExtractor(
+                new TermFrequencyDict(WmsaHome.getLanguageModels()),
+                new NgramLexicon(WmsaHome.getLanguageModels()));
         setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         summaryExtractor = new SummaryExtractor(255,
@@ -4,7 +4,7 @@ import ca.rmen.porterstemmer.PorterStemmer;
 import com.google.inject.Inject;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
-import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.apache.commons.lang3.StringUtils;
@@ -16,12 +16,24 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
     public final String[] wordsLowerCase;
     public final String[] posTags;
     public final String[] stemmedWords;
+    public final String[] ngrams;
+    public final String[] ngramStemmed;
 
     private final BitSet isStopWord;
 
     public SoftReference<WordSpan[]> keywords;
 
-    public DocumentSentence(String originalSentence, String[] words, int[] separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords) {
+    public DocumentSentence(String originalSentence,
+                            String[] words,
+                            int[] separators,
+                            String[] wordsLowerCase,
+                            String[] posTags,
+                            String[] stemmedWords,
+                            String[] ngrams,
+                            String[] ngramsStemmed
+                            )
+    {
         this.originalSentence = originalSentence;
         this.words = words;
         this.separators = separators;
@@ -31,6 +43,9 @@ public class DocumentSentence implements Iterable<DocumentSentence.SentencePos>{
 
         isStopWord = new BitSet(words.length);
 
+        this.ngrams = ngrams;
+        this.ngramStemmed = ngramsStemmed;
+
         for (int i = 0; i < words.length; i++) {
             if (WordPatterns.isStopWord(words[i]))
                 isStopWord.set(i);
@@ -4,6 +4,7 @@ import com.github.datquocnguyen.RDRPOSTagger;
 import gnu.trove.map.hash.TObjectIntHashMap;
 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import opennlp.tools.sentdetect.SentenceDetectorME;
@@ -32,6 +33,8 @@ public class SentenceExtractor {
     private SentenceDetectorME sentenceDetector;
     private static RDRPOSTagger rdrposTagger;
 
+    private static NgramLexicon ngramLexicon = null;
+
     private final PorterStemmer porterStemmer = new PorterStemmer();
     private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
 
@@ -45,7 +48,8 @@ public class SentenceExtractor {
     private static final int MAX_TEXT_LENGTH = 65536;
 
     @SneakyThrows @Inject
-    public SentenceExtractor(LanguageModels models) {
+    public SentenceExtractor(LanguageModels models)
+    {
         try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
             var sentenceModel = new SentenceModel(modelIn);
             sentenceDetector = new SentenceDetectorME(sentenceModel);
@@ -55,7 +59,9 @@ public class SentenceExtractor {
             logger.error("Could not initialize sentence detector", ex);
         }
 
-        synchronized (RDRPOSTagger.class) {
+        synchronized (this) {
+            ngramLexicon = new NgramLexicon(models);
+
             try {
                 rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
             }
@@ -128,8 +134,34 @@
         var seps = wordsAndSeps.separators;
         var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
 
+        List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, words);
+
+        String[] ngramsWords = new String[ngrams.size()];
+        String[] ngramsStemmedWords = new String[ngrams.size()];
+        for (int i = 0; i < ngrams.size(); i++) {
+            String[] ngram = ngrams.get(i);
+
+            StringJoiner ngramJoiner = new StringJoiner("_");
+            StringJoiner stemmedJoiner = new StringJoiner("_");
+            for (String s : ngram) {
+                ngramJoiner.add(s);
+                stemmedJoiner.add(porterStemmer.stem(s));
+            }
+
+            ngramsWords[i] = ngramJoiner.toString();
+            ngramsStemmedWords[i] = stemmedJoiner.toString();
+        }
+
         return new DocumentSentence(
-                SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
+                SentenceExtractorStringUtils.sanitizeString(text),
+                words,
+                seps,
+                lc,
+                rdrposTagger.tagsForEnSentence(words),
+                stemSentence(lc),
+                ngramsWords,
+                ngramsStemmedWords
         );
     }
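
The joining convention above is what makes n-grams indexable as ordinary terms: each segment's tokens are joined with "_" for the display form, and the Porter-stemmed tokens are joined the same way for the metadata lookup. A standalone sketch of just that step, reusing the same porterStemmer.stem call as the hunk (assuming the same PorterStemmer type SentenceExtractor already uses; the example tokens are illustrative):

    import ca.rmen.porterstemmer.PorterStemmer;
    import java.util.StringJoiner;

    class NgramJoinSketch {
        public static void main(String[] args) {
            var porterStemmer = new PorterStemmer();
            String[] ngram = { "mechanical", "keyboards" };

            StringJoiner ngramJoiner = new StringJoiner("_");
            StringJoiner stemmedJoiner = new StringJoiner("_");
            for (String s : ngram) {
                ngramJoiner.add(s);                     // display form of the token
                stemmedJoiner.add(porterStemmer.stem(s)); // stemmed form, for metadata lookup
            }

            System.out.println(ngramJoiner);   // mechanical_keyboards
            System.out.println(stemmedJoiner); // the stemmed variant, joined the same way
        }
    }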
@@ -195,7 +227,35 @@
                 fullString = "";
             }
 
-            ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
+            List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]);
+
+            String[] ngramsWords = new String[ngrams.size()];
+            String[] ngramsStemmedWords = new String[ngrams.size()];
+
+            for (int j = 0; j < ngrams.size(); j++) {
+                String[] ngram = ngrams.get(j);
+
+                StringJoiner ngramJoiner = new StringJoiner("_");
+                StringJoiner stemmedJoiner = new StringJoiner("_");
+                for (String s : ngram) {
+                    ngramJoiner.add(s);
+                    stemmedJoiner.add(porterStemmer.stem(s));
+                }
+
+                ngramsWords[j] = ngramJoiner.toString();
+                ngramsStemmedWords[j] = stemmedJoiner.toString();
+            }
+
+            ret[i] = new DocumentSentence(fullString,
+                    tokens[i],
+                    separators[i],
+                    tokensLc[i],
+                    posTags[i],
+                    stemmedWords[i],
+                    ngramsWords,
+                    ngramsStemmedWords
+            );
         }
         return ret;
     }
@@ -16,6 +16,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
 
 dependencies {
     implementation project(':third-party:rdrpostagger')
+    implementation project(':third-party:porterstemmer')
     implementation project(':third-party:commons-codec')
+    implementation project(':third-party:openzim')
     implementation project(':third-party:monkey-patch-opennlp')
     implementation project(':code:common:model')
     implementation project(':code:common:config')
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import org.apache.commons.lang3.StringUtils;
@@ -1,11 +1,11 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import nu.marginalia.hash.MurmurHash3_128;
 
 /** A group of hash functions that can be used to hash a sequence of strings,
  * that also has an inverse operation that can be used to remove a previously applied
  * string from the sequence. */
-sealed interface HasherGroup {
+public sealed interface HasherGroup {
     /** Apply a hash to the accumulator */
     long apply(long acc, long add);
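
HasherGroup's javadoc describes a rolling construction: token hashes are folded into an accumulator, and an inverse operation removes the oldest token so a window can slide over a sentence without rehashing it. The actual implementation (built on MurmurHash3_128) is not shown in this hunk, so here is a deliberately simple stand-in, an ordered polynomial hash in the Rabin-Karp style, purely to illustrate the apply/remove contract:

    class RollingWindowHashSketch {
        // stand-in for a real string hash such as MurmurHash3_128
        static long tokenHash(String s) {
            return s.hashCode() * 0x9E3779B97F4A7C15L;
        }

        /** Apply a token hash to the accumulator (order-sensitive). */
        static long apply(long acc, long add) {
            return acc * 31 + add;
        }

        /** Inverse: remove a token that was applied windowSize steps ago. */
        static long remove(long acc, long oldest, int windowSize) {
            long factor = 1;
            for (int i = 1; i < windowSize; i++) factor *= 31;
            return acc - oldest * factor;
        }

        public static void main(String[] args) {
            // hash ["a","b","c"], then slide the window to ["b","c","d"]
            long acc = apply(apply(apply(0, tokenHash("a")), tokenHash("b")), tokenHash("c"));
            acc = apply(remove(acc, tokenHash("a"), 3), tokenHash("d"));

            long fresh = apply(apply(apply(0, tokenHash("b")), tokenHash("c")), tokenHash("d"));
            System.out.println(acc == fresh); // true: removal inverted the oldest apply
        }
    }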
@@ -1,7 +1,6 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import nu.marginalia.WmsaHome;
-import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.LanguageModels;
 
 import java.io.IOException;
 import java.nio.file.Path;
@@ -15,10 +14,11 @@ public class NgramExporterMain {
     }
 
     static void trial() throws IOException {
-        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-
-        NgramLexicon lexicon = new NgramLexicon();
-        lexicon.loadCounts(Path.of("/home/vlofgren/ngram-counts.bin"));
+        NgramLexicon lexicon = new NgramLexicon(
+                LanguageModels.builder()
+                        .segments(Path.of("/home/vlofgren/ngram-counts.bin"))
+                        .build()
+        );
 
         System.out.println("Loaded!");
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import it.unimi.dsi.fastutil.longs.*;
 import nu.marginalia.hash.MurmurHash3_128;
@@ -1,11 +1,13 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
 import it.unimi.dsi.fastutil.longs.LongHash;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import nu.marginalia.LanguageModels;
 
+import java.io.BufferedInputStream;
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
@@ -16,11 +18,9 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
 @Singleton
 public class NgramLexicon {
-    private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap(
-            100_000_000,
-            new KeyIsAlreadyHashStrategy()
-    );
+    private final Long2IntOpenCustomHashMap counts;
     private final LongOpenHashSet permutations = new LongOpenHashSet();
 
     private static final HasherGroup orderedHasher = HasherGroup.ordered();
@@ -28,17 +28,35 @@
 
     @Inject
     public NgramLexicon(LanguageModels models) {
-        try {
-            loadCounts(models.segments);
+        try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(models.segments)))) {
+            long size = dis.readInt();
+            counts = new Long2IntOpenCustomHashMap(
+                    (int) size,
+                    new KeyIsAlreadyHashStrategy()
+            );
+
+            for (int i = 0; i < size; i++) {
+                counts.put(dis.readLong(), dis.readInt());
+            }
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
     }
 
+    public NgramLexicon() {
+        counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy());
+    }
+
+    public List<String[]> findSegmentsStrings(int minLength, int maxLength, String... parts) {
+        List<SentenceSegment> segments = new ArrayList<>();
+
+        for (int i = minLength; i <= maxLength; i++) {
+            segments.addAll(findSegments(i, parts));
+        }
+
+        return segments.stream().map(seg -> seg.project(parts)).toList();
+    }
+
     public List<SentenceSegment> findSegments(int length, String... parts) {
         // Don't look for ngrams longer than the sentence
         if (parts.length < length) return List.of();
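
Two things are worth noting in this hunk. First, findSegmentsStrings(2, 12, words) is the entry point SentenceExtractor now calls: it collects every known segment of 2 to 12 tokens and projects each back onto the original words as a String[]. Second, the constructor pins down the on-disk format of the segments file: one int holding the entry count, followed by that many (long ngram-hash, int count) records; reading the count first lets the hash map be sized exactly instead of the 100_000_000-slot table the no-arg constructor still allocates. The commit only shows the reading side here; a minimal writer sketch consistent with that reader (the class and method below are assumptions, not part of the commit):

    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.Map;

    class NgramCountsWriterSketch {
        // Writes the format NgramLexicon's constructor reads back:
        // an int entry count, then (long hash, int count) pairs.
        static void write(Path path, Map<Long, Integer> counts) throws IOException {
            try (var dos = new DataOutputStream(Files.newOutputStream(path))) {
                dos.writeInt(counts.size());
                for (var e : counts.entrySet()) {
                    dos.writeLong(e.getKey());
                    dos.writeInt(e.getValue());
                }
            }
        }
    }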
@@ -96,15 +114,6 @@
         permutations.add(hashUnordered);
     }
 
-    public void loadCounts(Path path) throws IOException {
-        try (var dis = new DataInputStream(Files.newInputStream(path))) {
-            long size = dis.readInt();
-
-            for (int i = 0; i < size; i++) {
-                counts.put(dis.readLong(), dis.readInt());
-            }
-        }
-    }
-
     public void loadPermutations(Path path) throws IOException {
         try (var dis = new DataInputStream(Files.newInputStream(path))) {
@@ -1,5 +1,6 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
+import nu.marginalia.segmentation.HasherGroup;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -8,6 +8,7 @@ import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.tools.LegacyExperiment;
 import org.jsoup.Jsoup;
@@ -21,8 +22,10 @@ import java.nio.file.Path;
 
 public class SentenceStatisticsExperiment extends LegacyExperiment {
 
+    NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels());
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(
+            new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon);
     Path filename;
     PrintWriter writer;