mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(qs) Retire NGramBloomFilter, integrate new segmentation model instead
This commit is contained in:
parent
afc047cd27
commit
d8f4e7d72b
@ -3,7 +3,6 @@ package nu.marginalia;
|
|||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
public class LanguageModels {
|
public class LanguageModels {
|
||||||
public final Path ngramBloomFilter;
|
|
||||||
public final Path termFrequencies;
|
public final Path termFrequencies;
|
||||||
|
|
||||||
public final Path openNLPSentenceDetectionData;
|
public final Path openNLPSentenceDetectionData;
|
||||||
@ -11,20 +10,21 @@ public class LanguageModels {
|
|||||||
public final Path posDict;
|
public final Path posDict;
|
||||||
public final Path openNLPTokenData;
|
public final Path openNLPTokenData;
|
||||||
public final Path fasttextLanguageModel;
|
public final Path fasttextLanguageModel;
|
||||||
|
public final Path segments;
|
||||||
|
|
||||||
public LanguageModels(Path ngramBloomFilter,
|
public LanguageModels(Path termFrequencies,
|
||||||
Path termFrequencies,
|
|
||||||
Path openNLPSentenceDetectionData,
|
Path openNLPSentenceDetectionData,
|
||||||
Path posRules,
|
Path posRules,
|
||||||
Path posDict,
|
Path posDict,
|
||||||
Path openNLPTokenData,
|
Path openNLPTokenData,
|
||||||
Path fasttextLanguageModel) {
|
Path fasttextLanguageModel,
|
||||||
this.ngramBloomFilter = ngramBloomFilter;
|
Path segments) {
|
||||||
this.termFrequencies = termFrequencies;
|
this.termFrequencies = termFrequencies;
|
||||||
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
||||||
this.posRules = posRules;
|
this.posRules = posRules;
|
||||||
this.posDict = posDict;
|
this.posDict = posDict;
|
||||||
this.openNLPTokenData = openNLPTokenData;
|
this.openNLPTokenData = openNLPTokenData;
|
||||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||||
|
this.segments = segments;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -85,13 +85,14 @@ public class WmsaHome {
|
|||||||
final Path home = getHomePath();
|
final Path home = getHomePath();
|
||||||
|
|
||||||
return new LanguageModels(
|
return new LanguageModels(
|
||||||
home.resolve("model/ngrams.bin"),
|
|
||||||
home.resolve("model/tfreq-new-algo3.bin"),
|
home.resolve("model/tfreq-new-algo3.bin"),
|
||||||
home.resolve("model/opennlp-sentence.bin"),
|
home.resolve("model/opennlp-sentence.bin"),
|
||||||
home.resolve("model/English.RDR"),
|
home.resolve("model/English.RDR"),
|
||||||
home.resolve("model/English.DICT"),
|
home.resolve("model/English.DICT"),
|
||||||
home.resolve("model/opennlp-tok.bin"),
|
home.resolve("model/opennlp-tok.bin"),
|
||||||
home.resolve("model/lid.176.ftz"));
|
home.resolve("model/lid.176.ftz"),
|
||||||
|
home.resolve("model/segments.bin")
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Path getAtagsPath() {
|
public static Path getAtagsPath() {
|
||||||
|
@ -9,8 +9,6 @@ import nu.marginalia.actor.task.*;
|
|||||||
import nu.marginalia.functions.execution.api.*;
|
import nu.marginalia.functions.execution.api.*;
|
||||||
import nu.marginalia.storage.model.FileStorageId;
|
import nu.marginalia.storage.model.FileStorageId;
|
||||||
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase {
|
public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase {
|
||||||
private final ExecutorActorControlService actorControlService;
|
private final ExecutorActorControlService actorControlService;
|
||||||
|
@ -26,13 +26,13 @@ public class TestLanguageModels {
|
|||||||
var languageModelsHome = getLanguageModelsPath();
|
var languageModelsHome = getLanguageModelsPath();
|
||||||
|
|
||||||
return new LanguageModels(
|
return new LanguageModels(
|
||||||
languageModelsHome.resolve("ngrams.bin"),
|
|
||||||
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
||||||
languageModelsHome.resolve("opennlp-sentence.bin"),
|
languageModelsHome.resolve("opennlp-sentence.bin"),
|
||||||
languageModelsHome.resolve("English.RDR"),
|
languageModelsHome.resolve("English.RDR"),
|
||||||
languageModelsHome.resolve("English.DICT"),
|
languageModelsHome.resolve("English.DICT"),
|
||||||
languageModelsHome.resolve("opennlp-tokens.bin"),
|
languageModelsHome.resolve("opennlp-tokens.bin"),
|
||||||
languageModelsHome.resolve("lid.176.ftz")
|
languageModelsHome.resolve("lid.176.ftz"),
|
||||||
|
languageModelsHome.resolve("segments.bin")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -26,13 +26,13 @@ public class TestLanguageModels {
|
|||||||
var languageModelsHome = getLanguageModelsPath();
|
var languageModelsHome = getLanguageModelsPath();
|
||||||
|
|
||||||
return new LanguageModels(
|
return new LanguageModels(
|
||||||
languageModelsHome.resolve("ngrams.bin"),
|
|
||||||
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
||||||
languageModelsHome.resolve("opennlp-sentence.bin"),
|
languageModelsHome.resolve("opennlp-sentence.bin"),
|
||||||
languageModelsHome.resolve("English.RDR"),
|
languageModelsHome.resolve("English.RDR"),
|
||||||
languageModelsHome.resolve("English.DICT"),
|
languageModelsHome.resolve("English.DICT"),
|
||||||
languageModelsHome.resolve("opennlp-tokens.bin"),
|
languageModelsHome.resolve("opennlp-tokens.bin"),
|
||||||
languageModelsHome.resolve("lid.176.ftz")
|
languageModelsHome.resolve("lid.176.ftz"),
|
||||||
|
languageModelsHome.resolve("segments.bin")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant;
|
package nu.marginalia.functions.searchquery.query_parser.variant;
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
|
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
|
import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
|
||||||
import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
|
import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
|
||||||
@ -15,13 +16,15 @@ public class QueryExpansion {
|
|||||||
private static final PorterStemmer ps = new PorterStemmer();
|
private static final PorterStemmer ps = new PorterStemmer();
|
||||||
private final TermFrequencyDict dict;
|
private final TermFrequencyDict dict;
|
||||||
private final NgramLexicon lexicon;
|
private final NgramLexicon lexicon;
|
||||||
List<ExpansionStrategy> expansionStrategies = List.of(
|
|
||||||
|
private final List<ExpansionStrategy> expansionStrategies = List.of(
|
||||||
this::joinDashes,
|
this::joinDashes,
|
||||||
this::splitWordNum,
|
this::splitWordNum,
|
||||||
this::joinTerms,
|
this::joinTerms,
|
||||||
this::createSegments
|
this::createSegments
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@Inject
|
||||||
public QueryExpansion(TermFrequencyDict dict,
|
public QueryExpansion(TermFrequencyDict dict,
|
||||||
NgramLexicon lexicon
|
NgramLexicon lexicon
|
||||||
) {
|
) {
|
||||||
@ -97,6 +100,7 @@ public class QueryExpansion {
|
|||||||
|
|
||||||
String[] words = nodes.stream().map(QWord::word).toArray(String[]::new);
|
String[] words = nodes.stream().map(QWord::word).toArray(String[]::new);
|
||||||
|
|
||||||
|
// Look for known segments within the query
|
||||||
for (int length = 2; length < Math.min(10, words.length); length++) {
|
for (int length = 2; length < Math.min(10, words.length); length++) {
|
||||||
for (var segment : lexicon.findSegments(length, words)) {
|
for (var segment : lexicon.findSegments(length, words)) {
|
||||||
int start = segment.start();
|
int start = segment.start();
|
||||||
|
@ -1,40 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
|
|
||||||
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
/** Variant strategy that combines word that have dashes, as sometimes lawn-chair
|
|
||||||
* gets spelled lawnchair */
|
|
||||||
public class CombineDashes implements VariantStrategy {
|
|
||||||
final Pattern dashBoundary = Pattern.compile("-");
|
|
||||||
|
|
||||||
public CombineDashes() {
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Collection<? extends List<String>> constructVariants(List<QueryWord> words) {
|
|
||||||
List<String> asTokens2 = new ArrayList<>();
|
|
||||||
boolean dash = false;
|
|
||||||
|
|
||||||
for (var span : words) {
|
|
||||||
var matcher = dashBoundary.matcher(span.word);
|
|
||||||
if (matcher.find()) {
|
|
||||||
String combined = dashBoundary.matcher(span.word).replaceAll("");
|
|
||||||
asTokens2.add(combined);
|
|
||||||
}
|
|
||||||
|
|
||||||
asTokens2.add(span.word);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (dash) {
|
|
||||||
return List.of(asTokens2);
|
|
||||||
}
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,58 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
|
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
|
|
||||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/** Variant strategy that merges tokens that are adjacent, where the combined token
|
|
||||||
* has a high term frequency. That way we match 'lawnchair' with 'lawn chair' */
|
|
||||||
public class JoinTerms implements VariantStrategy {
|
|
||||||
private final TermFrequencyDict dict;
|
|
||||||
private final PorterStemmer ps;
|
|
||||||
|
|
||||||
public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) {
|
|
||||||
this.dict = dict;
|
|
||||||
this.ps = ps;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Collection<? extends List<String>> constructVariants(List<QueryWord> span) {
|
|
||||||
List<List<String>> ret = new ArrayList<>();
|
|
||||||
|
|
||||||
for (int i = 0; i < span.size()-1; i++) {
|
|
||||||
var a = span.get(i);
|
|
||||||
var b = span.get(i+1);
|
|
||||||
|
|
||||||
var stemmed = ps.stemWord(a.word + b.word);
|
|
||||||
|
|
||||||
double scoreCombo = dict.getTermFreqStemmed(stemmed);
|
|
||||||
|
|
||||||
if (scoreCombo > 10000) {
|
|
||||||
List<String> asTokens = new ArrayList<>();
|
|
||||||
|
|
||||||
for (int j = 0; j < i; j++) {
|
|
||||||
var word = span.get(j).word;
|
|
||||||
asTokens.add(word);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
var word = a.word + b.word;
|
|
||||||
asTokens.add(word);
|
|
||||||
}
|
|
||||||
for (int j = i+2; j < span.size(); j++) {
|
|
||||||
var word = span.get(j).word;
|
|
||||||
asTokens.add(word);
|
|
||||||
}
|
|
||||||
|
|
||||||
ret.add(asTokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,65 +0,0 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
|
|
||||||
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
|
|
||||||
import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
|
|
||||||
import nu.marginalia.util.ngrams.NGramBloomFilter;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
/** Variant strategy that splits tokens at the boundary between a number and a word.
|
|
||||||
*/
|
|
||||||
public class SplitWordNum implements VariantStrategy {
|
|
||||||
|
|
||||||
|
|
||||||
final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]");
|
|
||||||
private final NGramBloomFilter nGramBloomFilter;
|
|
||||||
|
|
||||||
public SplitWordNum(NGramBloomFilter nGramBloomFilter) {
|
|
||||||
this.nGramBloomFilter = nGramBloomFilter;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Collection<? extends List<String>> constructVariants(List<QueryWord> ls) {
|
|
||||||
List<String> asTokens2 = new ArrayList<>();
|
|
||||||
|
|
||||||
boolean num = false;
|
|
||||||
|
|
||||||
for (var span : ls) {
|
|
||||||
var wordMatcher = numWordBoundary.matcher(span.word);
|
|
||||||
var stemmedMatcher = numWordBoundary.matcher(span.stemmed);
|
|
||||||
|
|
||||||
int ws = 0;
|
|
||||||
int ss = 0;
|
|
||||||
boolean didSplit = false;
|
|
||||||
while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) {
|
|
||||||
ws = wordMatcher.start()+1;
|
|
||||||
ss = stemmedMatcher.start()+1;
|
|
||||||
if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_"))
|
|
||||||
|| nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-")))
|
|
||||||
{
|
|
||||||
String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_");
|
|
||||||
asTokens2.add(combined);
|
|
||||||
didSplit = true;
|
|
||||||
num = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!didSplit) {
|
|
||||||
asTokens2.add(span.word);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (num) {
|
|
||||||
return List.of(asTokens2);
|
|
||||||
}
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private String splitAtNumBoundary(String in, int splitPoint, String joiner) {
|
|
||||||
return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,8 +1,10 @@
|
|||||||
package nu.marginalia.functions.searchquery.segmentation;
|
package nu.marginalia.functions.searchquery.segmentation;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
|
import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
|
||||||
import it.unimi.dsi.fastutil.longs.LongHash;
|
import it.unimi.dsi.fastutil.longs.LongHash;
|
||||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||||
|
import nu.marginalia.LanguageModels;
|
||||||
|
|
||||||
import java.io.DataInputStream;
|
import java.io.DataInputStream;
|
||||||
import java.io.DataOutputStream;
|
import java.io.DataOutputStream;
|
||||||
@ -24,6 +26,19 @@ public class NgramLexicon {
|
|||||||
private static final HasherGroup orderedHasher = HasherGroup.ordered();
|
private static final HasherGroup orderedHasher = HasherGroup.ordered();
|
||||||
private static final HasherGroup unorderedHasher = HasherGroup.unordered();
|
private static final HasherGroup unorderedHasher = HasherGroup.unordered();
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public NgramLexicon(LanguageModels models) {
|
||||||
|
try {
|
||||||
|
loadCounts(models.segments);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public NgramLexicon() {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public List<SentenceSegment> findSegments(int length, String... parts) {
|
public List<SentenceSegment> findSegments(int length, String... parts) {
|
||||||
// Don't look for ngrams longer than the sentence
|
// Don't look for ngrams longer than the sentence
|
||||||
if (parts.length < length) return List.of();
|
if (parts.length < length) return List.of();
|
||||||
|
@ -1,69 +0,0 @@
|
|||||||
package nu.marginalia.util.ngrams;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.nio.file.StandardOpenOption;
|
|
||||||
import java.util.BitSet;
|
|
||||||
|
|
||||||
// It's unclear why this exists, we should probably use a BitSet instead?
|
|
||||||
// Chesterton's fence?
|
|
||||||
public class DenseBitMap {
|
|
||||||
public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
|
|
||||||
|
|
||||||
public final long cardinality;
|
|
||||||
private final ByteBuffer buffer;
|
|
||||||
|
|
||||||
public DenseBitMap(long cardinality) {
|
|
||||||
this.cardinality = cardinality;
|
|
||||||
|
|
||||||
boolean misaligned = (cardinality & 7) > 0;
|
|
||||||
this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
|
|
||||||
}
|
|
||||||
|
|
||||||
public static DenseBitMap loadFromFile(Path file) throws IOException {
|
|
||||||
long size = Files.size(file);
|
|
||||||
var dbm = new DenseBitMap(size/8);
|
|
||||||
|
|
||||||
try (var bc = Files.newByteChannel(file)) {
|
|
||||||
while (dbm.buffer.position() < dbm.buffer.capacity()) {
|
|
||||||
bc.read(dbm.buffer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
dbm.buffer.clear();
|
|
||||||
|
|
||||||
return dbm;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void writeToFile(Path file) throws IOException {
|
|
||||||
|
|
||||||
try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
|
|
||||||
while (buffer.position() < buffer.capacity()) {
|
|
||||||
bc.write(buffer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
buffer.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean get(long pos) {
|
|
||||||
return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Set the bit indexed by pos, returns
|
|
||||||
* its previous value.
|
|
||||||
*/
|
|
||||||
public boolean set(long pos) {
|
|
||||||
int offset = (int) (pos >>> 3);
|
|
||||||
int oldVal = buffer.get(offset);
|
|
||||||
int mask = (byte) 1 << (int) (pos & 7);
|
|
||||||
buffer.put(offset, (byte) (oldVal | mask));
|
|
||||||
return (oldVal & mask) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void clear(long pos) {
|
|
||||||
int offset = (int)(pos >>> 3);
|
|
||||||
buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,64 +0,0 @@
|
|||||||
package nu.marginalia.util.ngrams;
|
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
|
||||||
import com.google.common.hash.HashFunction;
|
|
||||||
import com.google.common.hash.Hashing;
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import nu.marginalia.LanguageModels;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
public class NGramBloomFilter {
|
|
||||||
private final DenseBitMap bitMap;
|
|
||||||
private static final PorterStemmer ps = new PorterStemmer();
|
|
||||||
private static final HashFunction hasher = Hashing.murmur3_128(0);
|
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class);
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public NGramBloomFilter(LanguageModels lm) throws IOException {
|
|
||||||
this(loadSafely(lm.ngramBloomFilter));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static DenseBitMap loadSafely(Path path) throws IOException {
|
|
||||||
if (Files.isRegularFile(path)) {
|
|
||||||
return DenseBitMap.loadFromFile(path);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
logger.warn("NGrams file missing " + path);
|
|
||||||
return new DenseBitMap(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public NGramBloomFilter(DenseBitMap bitMap) {
|
|
||||||
this.bitMap = bitMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isKnownNGram(String word) {
|
|
||||||
long bit = bitForWord(word, bitMap.cardinality);
|
|
||||||
|
|
||||||
return bitMap.get(bit);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static NGramBloomFilter load(Path file) throws IOException {
|
|
||||||
return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final Pattern underscore = Pattern.compile("_");
|
|
||||||
|
|
||||||
private static long bitForWord(String s, long n) {
|
|
||||||
String[] parts = underscore.split(s);
|
|
||||||
long hc = 0;
|
|
||||||
for (String part : parts) {
|
|
||||||
hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
|
|
||||||
}
|
|
||||||
return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -9,7 +9,6 @@ import nu.marginalia.index.query.limit.QueryStrategy;
|
|||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||||
import nu.marginalia.util.language.EnglishDictionary;
|
import nu.marginalia.util.language.EnglishDictionary;
|
||||||
import nu.marginalia.util.ngrams.NGramBloomFilter;
|
|
||||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
@ -26,13 +26,13 @@ public class TestLanguageModels {
|
|||||||
var languageModelsHome = getLanguageModelsPath();
|
var languageModelsHome = getLanguageModelsPath();
|
||||||
|
|
||||||
return new LanguageModels(
|
return new LanguageModels(
|
||||||
languageModelsHome.resolve("ngrams.bin"),
|
|
||||||
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
||||||
languageModelsHome.resolve("opennlp-sentence.bin"),
|
languageModelsHome.resolve("opennlp-sentence.bin"),
|
||||||
languageModelsHome.resolve("English.RDR"),
|
languageModelsHome.resolve("English.RDR"),
|
||||||
languageModelsHome.resolve("English.DICT"),
|
languageModelsHome.resolve("English.DICT"),
|
||||||
languageModelsHome.resolve("opennlp-tokens.bin"),
|
languageModelsHome.resolve("opennlp-tokens.bin"),
|
||||||
languageModelsHome.resolve("lid.176.ftz")
|
languageModelsHome.resolve("lid.176.ftz"),
|
||||||
|
languageModelsHome.resolve("segments.bin")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -26,13 +26,13 @@ public class TestLanguageModels {
|
|||||||
var languageModelsHome = getLanguageModelsPath();
|
var languageModelsHome = getLanguageModelsPath();
|
||||||
|
|
||||||
return new LanguageModels(
|
return new LanguageModels(
|
||||||
languageModelsHome.resolve("ngrams.bin"),
|
|
||||||
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
||||||
languageModelsHome.resolve("opennlp-sentence.bin"),
|
languageModelsHome.resolve("opennlp-sentence.bin"),
|
||||||
languageModelsHome.resolve("English.RDR"),
|
languageModelsHome.resolve("English.RDR"),
|
||||||
languageModelsHome.resolve("English.DICT"),
|
languageModelsHome.resolve("English.DICT"),
|
||||||
languageModelsHome.resolve("opennlp-tokens.bin"),
|
languageModelsHome.resolve("opennlp-tokens.bin"),
|
||||||
languageModelsHome.resolve("lid.176.ftz")
|
languageModelsHome.resolve("lid.176.ftz"),
|
||||||
|
languageModelsHome.resolve("segments.bin")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -26,13 +26,13 @@ public class TestLanguageModels {
|
|||||||
var languageModelsHome = getLanguageModelsPath();
|
var languageModelsHome = getLanguageModelsPath();
|
||||||
|
|
||||||
return new LanguageModels(
|
return new LanguageModels(
|
||||||
languageModelsHome.resolve("ngrams.bin"),
|
|
||||||
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
languageModelsHome.resolve("tfreq-new-algo3.bin"),
|
||||||
languageModelsHome.resolve("opennlp-sentence.bin"),
|
languageModelsHome.resolve("opennlp-sentence.bin"),
|
||||||
languageModelsHome.resolve("English.RDR"),
|
languageModelsHome.resolve("English.RDR"),
|
||||||
languageModelsHome.resolve("English.DICT"),
|
languageModelsHome.resolve("English.DICT"),
|
||||||
languageModelsHome.resolve("opennlp-tokens.bin"),
|
languageModelsHome.resolve("opennlp-tokens.bin"),
|
||||||
languageModelsHome.resolve("lid.176.ftz")
|
languageModelsHome.resolve("lid.176.ftz"),
|
||||||
|
languageModelsHome.resolve("segments.bin")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -26,7 +26,7 @@ download_model model/English.DICT https://raw.githubusercontent.com/datquocnguye
|
|||||||
download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
|
download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
|
||||||
download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
|
download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
|
||||||
download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
|
download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
|
||||||
download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
|
download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin
|
||||||
download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
|
download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
|
||||||
download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz
|
download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user