diff --git a/code/common/config/java/nu/marginalia/LanguageModels.java b/code/common/config/java/nu/marginalia/LanguageModels.java
index 04ab0aa0..ca7fde45 100644
--- a/code/common/config/java/nu/marginalia/LanguageModels.java
+++ b/code/common/config/java/nu/marginalia/LanguageModels.java
@@ -3,7 +3,6 @@ package nu.marginalia;
 import java.nio.file.Path;
 
 public class LanguageModels {
-    public final Path ngramBloomFilter;
     public final Path termFrequencies;
 
     public final Path openNLPSentenceDetectionData;
@@ -11,20 +10,21 @@ public class LanguageModels {
     public final Path posDict;
     public final Path openNLPTokenData;
     public final Path fasttextLanguageModel;
+    public final Path segments;
 
-    public LanguageModels(Path ngramBloomFilter,
-                          Path termFrequencies,
+    public LanguageModels(Path termFrequencies,
                           Path openNLPSentenceDetectionData,
                           Path posRules,
                           Path posDict,
                           Path openNLPTokenData,
-                          Path fasttextLanguageModel) {
-        this.ngramBloomFilter = ngramBloomFilter;
+                          Path fasttextLanguageModel,
+                          Path segments) {
         this.termFrequencies = termFrequencies;
         this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
         this.posRules = posRules;
         this.posDict = posDict;
         this.openNLPTokenData = openNLPTokenData;
         this.fasttextLanguageModel = fasttextLanguageModel;
+        this.segments = segments;
     }
 }
diff --git a/code/common/config/java/nu/marginalia/WmsaHome.java b/code/common/config/java/nu/marginalia/WmsaHome.java
index b61ee4dd..b5378afc 100644
--- a/code/common/config/java/nu/marginalia/WmsaHome.java
+++ b/code/common/config/java/nu/marginalia/WmsaHome.java
@@ -85,13 +85,14 @@ public class WmsaHome {
         final Path home = getHomePath();
 
         return new LanguageModels(
-                home.resolve("model/ngrams.bin"),
                 home.resolve("model/tfreq-new-algo3.bin"),
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),
                 home.resolve("model/English.DICT"),
                 home.resolve("model/opennlp-tok.bin"),
-                home.resolve("model/lid.176.ftz"));
+                home.resolve("model/lid.176.ftz"),
+                home.resolve("model/segments.bin")
+        );
     }
 
     public static Path getAtagsPath() {
diff --git a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java
index 68ad426a..3c5a8d5b 100644
--- a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java
+++ b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java
@@ -9,8 +9,6 @@ import nu.marginalia.actor.task.*;
 import nu.marginalia.functions.execution.api.*;
 import nu.marginalia.storage.model.FileStorageId;
 
-import java.nio.file.Path;
-
 @Singleton
 public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase {
     private final ExecutorActorControlService actorControlService;
diff --git a/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java b/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java
index 5efd2025..a4cc012b 100644
--- a/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java
+++ b/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java b/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java
index 0675559a..d857c048 100644
--- a/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java
+++ b/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java
index faac81d4..eac2988d 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java
@@ -1,6 +1,7 @@
 package nu.marginalia.functions.searchquery.query_parser.variant;
 
 import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.inject.Inject;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
 import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
@@ -15,13 +16,15 @@ public class QueryExpansion {
     private static final PorterStemmer ps = new PorterStemmer();
     private final TermFrequencyDict dict;
     private final NgramLexicon lexicon;
-    List expansionStrategies = List.of(
+
+    private final List expansionStrategies = List.of(
             this::joinDashes,
             this::splitWordNum,
             this::joinTerms,
             this::createSegments
     );
 
+    @Inject
     public QueryExpansion(TermFrequencyDict dict,
                           NgramLexicon lexicon
     ) {
@@ -97,6 +100,7 @@ public class QueryExpansion {
 
         String[] words = nodes.stream().map(QWord::word).toArray(String[]::new);
 
+        // Look for known segments within the query
         for (int length = 2; length < Math.min(10, words.length); length++) {
             for (var segment : lexicon.findSegments(length, words)) {
                 int start = segment.start();
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java
deleted file mode 100644
index c24defbe..00000000
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java
+++ /dev/null
@@ -1,40 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** Variant strategy that combines word that have dashes, as sometimes lawn-chair
- * gets spelled lawnchair */
-public class CombineDashes implements VariantStrategy {
-    final Pattern dashBoundary = Pattern.compile("-");
-
-    public CombineDashes() {
-    }
-
-    @Override
-    public Collection<List<String>> constructVariants(List<QueryWord> words) {
-        List<String> asTokens2 = new ArrayList<>();
-        boolean dash = false;
-
-        for (var span : words) {
-            var matcher = dashBoundary.matcher(span.word);
-            if (matcher.find()) {
-                String combined = dashBoundary.matcher(span.word).replaceAll("");
-                asTokens2.add(combined);
-            }
-
-            asTokens2.add(span.word);
-        }
-
-        if (dash) {
-            return List.of(asTokens2);
-        }
-        return Collections.emptyList();
-    }
-}
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java
deleted file mode 100644
index d03a64d1..00000000
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java
+++ /dev/null
@@ -1,58 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import ca.rmen.porterstemmer.PorterStemmer;
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-
-/** Variant strategy that merges tokens that are adjacent, where the combined token
- * has a high term frequency. That way we match 'lawnchair' with 'lawn chair' */
-public class JoinTerms implements VariantStrategy {
-    private final TermFrequencyDict dict;
-    private final PorterStemmer ps;
-
-    public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) {
-        this.dict = dict;
-        this.ps = ps;
-    }
-
-    @Override
-    public Collection<List<String>> constructVariants(List<QueryWord> span) {
-        List<List<String>> ret = new ArrayList<>();
-
-        for (int i = 0; i < span.size()-1; i++) {
-            var a = span.get(i);
-            var b = span.get(i+1);
-
-            var stemmed = ps.stemWord(a.word + b.word);
-
-            double scoreCombo = dict.getTermFreqStemmed(stemmed);
-
-            if (scoreCombo > 10000) {
-                List<String> asTokens = new ArrayList<>();
-
-                for (int j = 0; j < i; j++) {
-                    var word = span.get(j).word;
-                    asTokens.add(word);
-                }
-                {
-                    var word = a.word + b.word;
-                    asTokens.add(word);
-                }
-                for (int j = i+2; j < span.size(); j++) {
-                    var word = span.get(j).word;
-                    asTokens.add(word);
-                }
-
-                ret.add(asTokens);
-            }
-
-        }
-
-        return ret;
-    }
-}
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java
deleted file mode 100644
index ac79476b..00000000
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java
+++ /dev/null
@@ -1,65 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-import nu.marginalia.util.ngrams.NGramBloomFilter;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** Variant strategy that splits tokens at the boundary between a number and a word.
- */
-public class SplitWordNum implements VariantStrategy {
-
-
-    final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]");
-    private final NGramBloomFilter nGramBloomFilter;
-
-    public SplitWordNum(NGramBloomFilter nGramBloomFilter) {
-        this.nGramBloomFilter = nGramBloomFilter;
-    }
-
-    @Override
-    public Collection<List<String>> constructVariants(List<QueryWord> ls) {
-        List<String> asTokens2 = new ArrayList<>();
-
-        boolean num = false;
-
-        for (var span : ls) {
-            var wordMatcher = numWordBoundary.matcher(span.word);
-            var stemmedMatcher = numWordBoundary.matcher(span.stemmed);
-
-            int ws = 0;
-            int ss = 0;
-            boolean didSplit = false;
-            while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) {
-                ws = wordMatcher.start()+1;
-                ss = stemmedMatcher.start()+1;
-                if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_"))
-                        || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-")))
-                {
-                    String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_");
-                    asTokens2.add(combined);
-                    didSplit = true;
-                    num = true;
-                }
-            }
-
-            if (!didSplit) {
-                asTokens2.add(span.word);
-            }
-        }
-
-        if (num) {
-            return List.of(asTokens2);
-        }
-        return Collections.emptyList();
-    }
-
-    private String splitAtNumBoundary(String in, int splitPoint, String joiner) {
-        return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1);
-    }
-}
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java
index f8044e12..c4fe69e2 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java
@@ -1,8 +1,10 @@
 package nu.marginalia.functions.searchquery.segmentation;
 
+import com.google.inject.Inject;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
 import it.unimi.dsi.fastutil.longs.LongHash;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import nu.marginalia.LanguageModels;
 
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
@@ -24,6 +26,19 @@ public class NgramLexicon {
     private static final HasherGroup orderedHasher = HasherGroup.ordered();
     private static final HasherGroup unorderedHasher = HasherGroup.unordered();
 
+    @Inject
+    public NgramLexicon(LanguageModels models) {
+        try {
+            loadCounts(models.segments);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public NgramLexicon() {
+
+    }
+
     public List findSegments(int length, String... parts) {
         // Don't look for ngrams longer than the sentence
         if (parts.length < length) return List.of();
diff --git a/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java b/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java
deleted file mode 100644
index 008b17b3..00000000
--- a/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java
+++ /dev/null
@@ -1,69 +0,0 @@
-package nu.marginalia.util.ngrams;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
-import java.util.BitSet;
-
-// It's unclear why this exists, we should probably use a BitSet instead?
-// Chesterton's fence?
-public class DenseBitMap {
-    public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
-
-    public final long cardinality;
-    private final ByteBuffer buffer;
-
-    public DenseBitMap(long cardinality) {
-        this.cardinality = cardinality;
-
-        boolean misaligned = (cardinality & 7) > 0;
-        this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
-    }
-
-    public static DenseBitMap loadFromFile(Path file) throws IOException {
-        long size = Files.size(file);
-        var dbm = new DenseBitMap(size/8);
-
-        try (var bc = Files.newByteChannel(file)) {
-            while (dbm.buffer.position() < dbm.buffer.capacity()) {
-                bc.read(dbm.buffer);
-            }
-        }
-        dbm.buffer.clear();
-
-        return dbm;
-    }
-
-    public void writeToFile(Path file) throws IOException {
-
-        try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
-            while (buffer.position() < buffer.capacity()) {
-                bc.write(buffer);
-            }
-        }
-
-        buffer.clear();
-    }
-
-    public boolean get(long pos) {
-        return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
-    }
-
-    /** Set the bit indexed by pos, returns
-     * its previous value.
-     */
-    public boolean set(long pos) {
-        int offset = (int) (pos >>> 3);
-        int oldVal = buffer.get(offset);
-        int mask = (byte) 1 << (int) (pos & 7);
-        buffer.put(offset, (byte) (oldVal | mask));
-        return (oldVal & mask) != 0;
-    }
-
-    public void clear(long pos) {
-        int offset = (int)(pos >>> 3);
-        buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
-    }
-}
diff --git a/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java b/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java
deleted file mode 100644
index 3326956d..00000000
--- a/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java
+++ /dev/null
@@ -1,64 +0,0 @@
-package nu.marginalia.util.ngrams;
-
-import ca.rmen.porterstemmer.PorterStemmer;
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-import com.google.inject.Inject;
-import nu.marginalia.LanguageModels;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.regex.Pattern;
-
-public class NGramBloomFilter {
-    private final DenseBitMap bitMap;
-    private static final PorterStemmer ps = new PorterStemmer();
-    private static final HashFunction hasher = Hashing.murmur3_128(0);
-
-    private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class);
-
-    @Inject
-    public NGramBloomFilter(LanguageModels lm) throws IOException {
-        this(loadSafely(lm.ngramBloomFilter));
-    }
-
-    private static DenseBitMap loadSafely(Path path) throws IOException {
-        if (Files.isRegularFile(path)) {
-            return DenseBitMap.loadFromFile(path);
-        }
-        else {
-            logger.warn("NGrams file missing " + path);
-            return new DenseBitMap(1);
-        }
-    }
-
-    public NGramBloomFilter(DenseBitMap bitMap) {
-        this.bitMap = bitMap;
-    }
-
-    public boolean isKnownNGram(String word) {
-        long bit = bitForWord(word, bitMap.cardinality);
-
-        return bitMap.get(bit);
-    }
-
-    public static NGramBloomFilter load(Path file) throws IOException {
-        return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
-    }
-
-    private static final Pattern underscore = Pattern.compile("_");
-
-    private static long bitForWord(String s, long n) {
-        String[] parts = underscore.split(s);
-        long hc = 0;
-        for (String part : parts) {
-            hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
-        }
-        return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
-    }
-
-}
diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
index 4020d6e0..24131143 100644
--- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
+++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
@@ -9,7 +9,6 @@ import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.index.query.limit.SpecificationLimitType;
 import nu.marginalia.util.language.EnglishDictionary;
-import nu.marginalia.util.ngrams.NGramBloomFilter;
 import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.junit.jupiter.api.BeforeAll;
diff --git a/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java b/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java
index 2b7bf0e2..cb31942a 100644
--- a/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java
+++ b/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java b/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java
index 4ad1e430..f28e1348 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
        );
    }
 }
diff --git a/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java b/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java
index 5efd2025..a4cc012b 100644
--- a/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java
+++ b/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
diff --git a/run/setup.sh b/run/setup.sh
index 3d9c5f54..3cacca75 100755
--- a/run/setup.sh
+++ b/run/setup.sh
@@ -26,7 +26,7 @@ download_model model/English.DICT https://raw.githubusercontent.com/datquocnguye
 download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
 download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
 download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
-download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
+download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
 download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz
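
Below is a minimal usage sketch, not part of the patch itself: it shows how the reshaped LanguageModels constructor (ngrams.bin dropped, segments.bin appended after the fasttext model) feeds the new @Inject-annotated NgramLexicon(LanguageModels) constructor. The class name and model directory are hypothetical; in production WmsaHome.getLanguageModels() and Guice perform this wiring.

import nu.marginalia.LanguageModels;
import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;

import java.nio.file.Path;

class SegmentsWiringSketch {
    public static void main(String[] args) {
        // Hypothetical model directory; the real one comes from WmsaHome.getHomePath().
        Path home = Path.of("/var/lib/wmsa/model");

        // New argument order: term frequencies first, segments.bin last.
        LanguageModels lm = new LanguageModels(
                home.resolve("tfreq-new-algo3.bin"),
                home.resolve("opennlp-sentence.bin"),
                home.resolve("English.RDR"),
                home.resolve("English.DICT"),
                home.resolve("opennlp-tok.bin"),
                home.resolve("lid.176.ftz"),
                home.resolve("segments.bin"));

        // The injectable constructor eagerly calls loadCounts(models.segments)
        // and rethrows any IOException as a RuntimeException.
        NgramLexicon lexicon = new NgramLexicon(lm);
    }
}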