diff --git a/code/common/config/java/nu/marginalia/LanguageModels.java b/code/common/config/java/nu/marginalia/LanguageModels.java
index ca7fde45..d1854963 100644
--- a/code/common/config/java/nu/marginalia/LanguageModels.java
+++ b/code/common/config/java/nu/marginalia/LanguageModels.java
@@ -1,7 +1,10 @@
 package nu.marginalia;
 
+import lombok.Builder;
+
 import java.nio.file.Path;
 
+@Builder
 public class LanguageModels {
     public final Path termFrequencies;
 
diff --git a/code/execution/build.gradle b/code/execution/build.gradle
index fa455167..57de7320 100644
--- a/code/execution/build.gradle
+++ b/code/execution/build.gradle
@@ -32,6 +32,7 @@ dependencies {
     implementation project(':third-party:commons-codec')
 
     implementation project(':code:libraries:message-queue')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation project(':code:functions:link-graph:api')
     implementation project(':code:functions:search-query')
diff --git a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java
index 4cc4ca76..90baf009 100644
--- a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java
+++ b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java
@@ -5,7 +5,7 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorStep;
-import nu.marginalia.functions.searchquery.segmentation.NgramExtractorMain;
+import nu.marginalia.segmentation.NgramExtractorMain;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageType;
 import org.slf4j.Logger;
diff --git a/code/features-convert/anchor-keywords/build.gradle b/code/features-convert/anchor-keywords/build.gradle
index 23e415b9..3541b5ec 100644
--- a/code/features-convert/anchor-keywords/build.gradle
+++ b/code/features-convert/anchor-keywords/build.gradle
@@ -19,6 +19,7 @@ dependencies {
     implementation project(':code:common:process')
     implementation project(':code:features-convert:keyword-extraction')
     implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
 
     implementation libs.bundles.slf4j
 
diff --git a/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java b/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java
index ee555ca5..17443c51 100644
--- a/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java
+++ b/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java
@@ -5,6 +5,7 @@ import nu.marginalia.keyword.KeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.util.TestLanguageModels;
 import org.junit.jupiter.api.Test;
 
diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
index 8feb5fd8..aaad9800 100644
--- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
+++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java
@@ -1,5 +1,6 @@
 package nu.marginalia.keyword;
 
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.keyword.extractors.*;
 import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
 import nu.marginalia.language.model.DocumentLanguageData;
@@ -15,11 +16,13 @@ public class DocumentKeywordExtractor {
 
     private final KeywordExtractor keywordExtractor;
     private final TermFrequencyDict dict;
+    private final NgramLexicon ngramLexicon;
 
     @Inject
-    public DocumentKeywordExtractor(TermFrequencyDict dict) {
+    public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) {
         this.dict = dict;
+        this.ngramLexicon = ngramLexicon;
         this.keywordExtractor = new KeywordExtractor();
     }
 
 
@@ -131,6 +134,17 @@ public class DocumentKeywordExtractor {
 
                 wordsBuilder.add(rep.word, meta);
             }
+
+            for (int i = 0; i < sent.ngrams.length; i++) {
+                var ngram = sent.ngrams[i];
+                var ngramStemmed = sent.ngramStemmed[i];
+
+                long meta = metadata.getMetadataForWord(ngramStemmed);
+                assert meta != 0L : "Missing meta for " + ngram;
+
+                wordsBuilder.add(ngram, meta);
+            }
+
         }
     }
 
diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java
index b402c9f6..230c895f 100644
--- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java
+++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java
@@ -14,7 +14,9 @@ public class KeywordPositionBitmask {
     private static final int unmodulatedPortion = 16;
 
     @Inject
-    public KeywordPositionBitmask(KeywordExtractor keywordExtractor, DocumentLanguageData dld) {
+    public KeywordPositionBitmask(KeywordExtractor keywordExtractor,
+                                  DocumentLanguageData dld)
+    {
 
         // Mark the title words as position 0
         for (var sent : dld.titleSentences) {
@@ -24,6 +26,10 @@ public class KeywordPositionBitmask {
                 positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
             }
 
+            for (var ngram : sent.ngramStemmed) {
+                positionMask.merge(ngram, posBit, this::bitwiseOr);
+            }
+
             for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                 positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
             }
@@ -43,6 +49,10 @@ public class KeywordPositionBitmask {
                 positionMask.merge(word.stemmed(), posBit, this::bitwiseOr);
             }
 
+            for (var ngram : sent.ngramStemmed) {
+                positionMask.merge(ngram, posBit, this::bitwiseOr);
+            }
+
             for (var span : keywordExtractor.getKeywordsFromSentence(sent)) {
                 positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
             }
diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
index 8a4f3b6b..54577f80 100644
--- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
+++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java
@@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Assertions;
@@ -20,7 +21,9 @@ import java.util.Set;
 
 class DocumentKeywordExtractorTest {
 
-    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
+            new TermFrequencyDict(WmsaHome.getLanguageModels()),
+            new NgramLexicon(WmsaHome.getLanguageModels()));
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
     @Test
@@ -56,6 +59,22 @@ class DocumentKeywordExtractorTest {
 
     }
 
+    @Test
+    public void testKeyboards2() throws IOException, URISyntaxException {
+        var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
+                "Could not load word frequency table");
+        String html = new String(resource.readAllBytes(), Charset.defaultCharset());
+        var doc = Jsoup.parse(html);
+        doc.filter(new DomPruningFilter(0.5));
+
+        var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
+
+        keywords.getWords().forEach((k, v) -> {
+            if (k.contains("_")) {
+                System.out.println(k + " " + new WordMetadata(v));
+            }
+        });
+    }
     @Test
     public void testKeyboards() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),
@@ -119,7 +138,9 @@ class DocumentKeywordExtractorTest {
         var doc = Jsoup.parse(html);
         doc.filter(new DomPruningFilter(0.5));
 
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(
+                new TermFrequencyDict(WmsaHome.getLanguageModels()),
+                new NgramLexicon(WmsaHome.getLanguageModels()));
         SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online"));
diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java
index dabad6d1..bfc78a9c 100644
--- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java
+++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java
@@ -3,6 +3,7 @@ package nu.marginalia.keyword;
 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
 import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.model.EdgeUrl;
@@ -20,9 +21,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 @Tag("slow")
 class SentenceExtractorTest {
-    final LanguageModels lm = TestLanguageModels.getLanguageModels();
+    static final LanguageModels lm = TestLanguageModels.getLanguageModels();
 
-    SentenceExtractor se = new SentenceExtractor(lm);
+    static NgramLexicon ngramLexicon = new NgramLexicon(lm);
+    static SentenceExtractor se = new SentenceExtractor(lm);
 
     @SneakyThrows
     public static void main(String... args) throws IOException {
@@ -32,11 +34,9 @@ class SentenceExtractorTest {
 
         System.out.println("Running");
 
-        SentenceExtractor se = new SentenceExtractor(lm);
-
         var dict = new TermFrequencyDict(lm);
         var url = new EdgeUrl("https://memex.marginalia.nu/");
-        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict);
+        DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon);
 
         for (;;) {
             long total = 0;
diff --git a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java b/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java
index c1a326da..cabe558f 100644
--- a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java
+++ b/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java
@@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.summary.heuristic.*;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.jsoup.Jsoup;
@@ -25,7 +26,9 @@ class SummaryExtractorTest {
 
     @BeforeEach
     public void setUp() {
-        keywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+        keywordExtractor = new DocumentKeywordExtractor(
+                new TermFrequencyDict(WmsaHome.getLanguageModels()),
+                new NgramLexicon(WmsaHome.getLanguageModels()));
         setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         summaryExtractor = new SummaryExtractor(255,
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java
index eac2988d..820a9022 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java
@@ -4,7 +4,7 @@ import ca.rmen.porterstemmer.PorterStemmer;
 import com.google.inject.Inject;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
-import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.apache.commons.lang3.StringUtils;
 
diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java
index ef5bc0a9..b9b4abce 100644
--- a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java
+++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java
@@ -16,12 +16,24 @@ public class DocumentSentence implements Iterable {
     public final String[] wordsLowerCase;
     public final String[] posTags;
     public final String[] stemmedWords;
+    public final String[] ngrams;
+    public final String[] ngramStemmed;
 
     private final BitSet isStopWord;
+
     public SoftReference keywords;
 
-    public DocumentSentence(String originalSentence, String[] words, int[] separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords) {
+    public DocumentSentence(String originalSentence,
+                            String[] words,
+                            int[] separators,
+                            String[] wordsLowerCase,
+                            String[] posTags,
+                            String[] stemmedWords,
+                            String[] ngrams,
+                            String[] ngramsStemmed
+                            )
+    {
         this.originalSentence = originalSentence;
         this.words = words;
         this.separators = separators;
 
@@ -31,6 +43,9 @@ public class DocumentSentence implements Iterable {
 
         isStopWord = new BitSet(words.length);
 
+        this.ngrams = ngrams;
+        this.ngramStemmed = ngramsStemmed;
+
         for (int i = 0; i < words.length; i++) {
             if (WordPatterns.isStopWord(words[i]))
                 isStopWord.set(i);
diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java
index 13ba2e76..fd15660f 100644
--- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java
+++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java
@@ -4,6 +4,7 @@ import com.github.datquocnguyen.RDRPOSTagger;
 import gnu.trove.map.hash.TObjectIntHashMap;
 import lombok.SneakyThrows;
 import nu.marginalia.LanguageModels;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.language.model.DocumentLanguageData;
 import nu.marginalia.language.model.DocumentSentence;
 import opennlp.tools.sentdetect.SentenceDetectorME;
@@ -32,6 +33,8 @@ public class SentenceExtractor {
 
     private SentenceDetectorME sentenceDetector;
     private static RDRPOSTagger rdrposTagger;
+
+    private static NgramLexicon ngramLexicon = null;
 
     private final PorterStemmer porterStemmer = new PorterStemmer();
     private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
@@ -45,7 +48,8 @@ public class SentenceExtractor {
     private static final int MAX_TEXT_LENGTH = 65536;
 
     @SneakyThrows @Inject
-    public SentenceExtractor(LanguageModels models) {
+    public SentenceExtractor(LanguageModels models)
+    {
         try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
             var sentenceModel = new SentenceModel(modelIn);
             sentenceDetector = new SentenceDetectorME(sentenceModel);
@@ -55,7 +59,9 @@ public class SentenceExtractor {
             logger.error("Could not initialize sentence detector", ex);
         }
 
-        synchronized (RDRPOSTagger.class) {
+        synchronized (this) {
+            ngramLexicon = new NgramLexicon(models);
+
             try {
                 rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules);
             }
@@ -128,8 +134,34 @@ public class SentenceExtractor {
         var seps = wordsAndSeps.separators;
         var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
 
+        List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, words);
+
+        String[] ngramsWords = new String[ngrams.size()];
+        String[] ngramsStemmedWords = new String[ngrams.size()];
+        for (int i = 0; i < ngrams.size(); i++) {
+            String[] ngram = ngrams.get(i);
+
+            StringJoiner ngramJoiner = new StringJoiner("_");
+            StringJoiner stemmedJoiner = new StringJoiner("_");
+            for (String s : ngram) {
+                ngramJoiner.add(s);
+                stemmedJoiner.add(porterStemmer.stem(s));
+            }
+
+            ngramsWords[i] = ngramJoiner.toString();
+            ngramsStemmedWords[i] = stemmedJoiner.toString();
+        }
+
+
         return new DocumentSentence(
-                SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
+                SentenceExtractorStringUtils.sanitizeString(text),
+                words,
+                seps,
+                lc,
+                rdrposTagger.tagsForEnSentence(words),
+                stemSentence(lc),
+                ngramsWords,
+                ngramsStemmedWords
         );
     }
 
@@ -195,7 +227,35 @@ public class SentenceExtractor {
                 fullString = "";
             }
 
-            ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
+            List<String[]> ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]);
+
+            String[] ngramsWords = new String[ngrams.size()];
+            String[] ngramsStemmedWords = new String[ngrams.size()];
+
+            for (int j = 0; j < ngrams.size(); j++) {
+                String[] ngram = ngrams.get(j);
+
+                StringJoiner ngramJoiner = new StringJoiner("_");
+                StringJoiner stemmedJoiner = new StringJoiner("_");
+                for (String s : ngram) {
+                    ngramJoiner.add(s);
+                    stemmedJoiner.add(porterStemmer.stem(s));
+                }
+
+                ngramsWords[j] = ngramJoiner.toString();
+                ngramsStemmedWords[j] = stemmedJoiner.toString();
+            }
+
+
+            ret[i] = new DocumentSentence(fullString,
+                    tokens[i],
+                    separators[i],
+                    tokensLc[i],
+                    posTags[i],
+                    stemmedWords[i],
+                    ngramsWords,
+                    ngramsStemmedWords
+            );
         }
         return ret;
     }
diff --git a/code/libraries/term-frequency-dict/build.gradle b/code/libraries/term-frequency-dict/build.gradle
index 0fe311b6..4d7e42c5 100644
--- a/code/libraries/term-frequency-dict/build.gradle
+++ b/code/libraries/term-frequency-dict/build.gradle
@@ -16,6 +16,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
 dependencies {
     implementation project(':third-party:rdrpostagger')
     implementation project(':third-party:porterstemmer')
+    implementation project(':third-party:commons-codec')
+    implementation project(':third-party:openzim')
     implementation project(':third-party:monkey-patch-opennlp')
     implementation project(':code:common:model')
     implementation project(':code:common:config')
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java
similarity index 88%
rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java
rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java
index e65c243d..cee48910 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java
+++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import org.apache.commons.lang3.StringUtils;
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java
similarity index 95%
rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java
rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java
index 60bbb4dd..2a452f75 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java
+++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java
@@ -1,11 +1,11 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import nu.marginalia.hash.MurmurHash3_128;
 
 /** A group of hash functions that can be used to hash a sequence of strings,
  * that also has an inverse operation that can be used to remove a previously applied
  * string from the sequence.
  */
-sealed interface HasherGroup {
+public sealed interface HasherGroup {
     /** Apply a hash to the accumulator */
     long apply(long acc, long add);
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java
similarity index 72%
rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java
rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java
index 087345f6..ee6d2cd5 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java
+++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java
@@ -1,7 +1,6 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
-import nu.marginalia.WmsaHome;
-import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.LanguageModels;
 
 import java.io.IOException;
 import java.nio.file.Path;
@@ -15,10 +14,11 @@ public class NgramExporterMain {
     }
 
     static void trial() throws IOException {
-        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-
-        NgramLexicon lexicon = new NgramLexicon();
-        lexicon.loadCounts(Path.of("/home/vlofgren/ngram-counts.bin"));
+        NgramLexicon lexicon = new NgramLexicon(
+                LanguageModels.builder()
+                        .segments(Path.of("/home/vlofgren/ngram-counts.bin"))
+                        .build()
+        );
 
         System.out.println("Loaded!");
 
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java
similarity index 98%
rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java
rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java
index 4cd4b296..577aee6e 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java
+++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import it.unimi.dsi.fastutil.longs.*;
 import nu.marginalia.hash.MurmurHash3_128;
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java
similarity index 85%
rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java
rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java
index c4fe69e2..91cee314 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java
+++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java
@@ -1,11 +1,13 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
 import it.unimi.dsi.fastutil.longs.LongHash;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 import nu.marginalia.LanguageModels;
 
+import java.io.BufferedInputStream;
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
@@ -16,11 +18,9 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
+@Singleton
 public class NgramLexicon {
-    private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap(
-            100_000_000,
-            new KeyIsAlreadyHashStrategy()
-    );
+    private final Long2IntOpenCustomHashMap counts;
     private final LongOpenHashSet permutations = new LongOpenHashSet();
 
     private static final HasherGroup orderedHasher = HasherGroup.ordered();
@@ -28,17 +28,35 @@ public class NgramLexicon {
 
     @Inject
     public NgramLexicon(LanguageModels models) {
-        try {
-            loadCounts(models.segments);
+        try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(models.segments)))) {
+            long size = dis.readInt();
+            counts = new Long2IntOpenCustomHashMap(
+                    (int) size,
+                    new KeyIsAlreadyHashStrategy()
+            );
+
+            for (int i = 0; i < size; i++) {
+                counts.put(dis.readLong(), dis.readInt());
+            }
         }
         catch (IOException e) {
             throw new RuntimeException(e);
         }
     }
 
     public NgramLexicon() {
-
+        counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy());
     }
 
+    public List<String[]> findSegmentsStrings(int minLength, int maxLength, String... parts) {
+        List<SentenceSegment> segments = new ArrayList<>();
+
+        for (int i = minLength; i <= maxLength; i++) {
+            segments.addAll(findSegments(i, parts));
+        }
+
+        return segments.stream().map(seg -> seg.project(parts)).toList();
+    }
+
     public List<SentenceSegment> findSegments(int length, String... parts) {
         // Don't look for ngrams longer than the sentence
         if (parts.length < length) return List.of();
@@ -96,15 +114,6 @@ public class NgramLexicon {
         permutations.add(hashUnordered);
     }
 
-    public void loadCounts(Path path) throws IOException {
-        try (var dis = new DataInputStream(Files.newInputStream(path))) {
-            long size = dis.readInt();
-
-            for (int i = 0; i < size; i++) {
-                counts.put(dis.readLong(), dis.readInt());
-            }
-        }
-    }
 
     public void loadPermutations(Path path) throws IOException {
         try (var dis = new DataInputStream(Files.newInputStream(path))) {
diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java
similarity index 89%
rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java
rename to code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java
index 174bd553..110b1b9b 100644
--- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java
+++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java
@@ -1,5 +1,6 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
+import nu.marginalia.segmentation.HasherGroup;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.*;
diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java
similarity index 96%
rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java
rename to code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java
index 28b9ef2f..d5065959 100644
--- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java
+++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java
@@ -1,4 +1,4 @@
-package nu.marginalia.functions.searchquery.segmentation;
+package nu.marginalia.segmentation;
 
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
index 8614d1e6..dde7a106 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
@@ -8,6 +8,7 @@ import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.tools.LegacyExperiment;
 import org.jsoup.Jsoup;
@@ -21,8 +22,10 @@ import java.nio.file.Path;
 
 public class SentenceStatisticsExperiment extends LegacyExperiment {
 
+    NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels());
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(
+            new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon);
 
     Path filename;
     PrintWriter writer;
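Not part of the patch, but useful context for review: the new constructor wiring is exercised by the updated tests above, where an NgramLexicon is built from the same LanguageModels as the TermFrequencyDict and both are handed to DocumentKeywordExtractor. Below is a minimal sketch of that call pattern, assuming the model paths resolve through WmsaHome as in the tests; the class name, HTML snippet, and URL are made-up placeholders.

import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.jsoup.Jsoup;

class NgramWiringSketch {
    public static void main(String[] args) throws Exception {
        var models = WmsaHome.getLanguageModels();

        // DocumentKeywordExtractor now takes an NgramLexicon alongside the term frequency dictionary.
        var extractor = new DocumentKeywordExtractor(
                new TermFrequencyDict(models),
                new NgramLexicon(models));
        var se = new SentenceExtractor(models);

        var doc = Jsoup.parse("<html><title>Mechanical Keyboards</title>" +
                "<body><p>Rapoo mechanical keyboards, gotchas and setup.</p></body></html>");
        var keywords = extractor.extractKeywords(se.extractSentences(doc),
                new EdgeUrl("https://www.example.com/"));

        // Recognized segments come back as single terms joined with '_' (e.g. "mechanical_keyboards"),
        // since SentenceExtractor joins segment words with a StringJoiner("_").
        keywords.getWords().forEach((k, v) -> System.out.println(k + " " + v));
    }
}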
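Also not in the patch: the loading code now inlined into the NgramLexicon(LanguageModels) constructor pins down the on-disk layout of the segments file, namely an int entry count followed by one (long hash, int count) pair per ngram, read through a BufferedInputStream and used to pre-size the hash map. The following is a hypothetical writer for a file in that layout, derived only from the reader shown above; the class and method names are illustrative and do not exist in the codebase.

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

class NgramCountsFileSketch {
    // Writes the layout that NgramLexicon(LanguageModels) reads back:
    // an int entry count, then (long hash, int count) per ngram.
    static void write(Path out, Map<Long, Integer> countsByHash) throws Exception {
        try (var dos = new DataOutputStream(new BufferedOutputStream(Files.newOutputStream(out)))) {
            dos.writeInt(countsByHash.size());
            for (var e : countsByHash.entrySet()) {
                dos.writeLong(e.getKey());   // pre-hashed ngram key (KeyIsAlreadyHashStrategy)
                dos.writeInt(e.getValue());  // occurrence count
            }
        }
    }
}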