Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00
(qs) Retire NGramBloomFilter, integrate new segmentation model instead

parent 212d101727
commit 3c75057dcd
@@ -3,7 +3,6 @@ package nu.marginalia;
 import java.nio.file.Path;
 
 public class LanguageModels {
-    public final Path ngramBloomFilter;
     public final Path termFrequencies;
 
     public final Path openNLPSentenceDetectionData;
@@ -11,20 +10,21 @@ public class LanguageModels {
     public final Path posDict;
     public final Path openNLPTokenData;
     public final Path fasttextLanguageModel;
+    public final Path segments;
 
-    public LanguageModels(Path ngramBloomFilter,
-                          Path termFrequencies,
+    public LanguageModels(Path termFrequencies,
                           Path openNLPSentenceDetectionData,
                           Path posRules,
                           Path posDict,
                           Path openNLPTokenData,
-                          Path fasttextLanguageModel) {
-        this.ngramBloomFilter = ngramBloomFilter;
+                          Path fasttextLanguageModel,
+                          Path segments) {
         this.termFrequencies = termFrequencies;
         this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
         this.posRules = posRules;
         this.posDict = posDict;
         this.openNLPTokenData = openNLPTokenData;
         this.fasttextLanguageModel = fasttextLanguageModel;
+        this.segments = segments;
     }
 }
@@ -96,13 +96,14 @@ public class WmsaHome {
         final Path home = getHomePath();
 
         return new LanguageModels(
-                home.resolve("model/ngrams.bin"),
                 home.resolve("model/tfreq-new-algo3.bin"),
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),
                 home.resolve("model/English.DICT"),
                 home.resolve("model/opennlp-tok.bin"),
-                home.resolve("model/lid.176.ftz"));
+                home.resolve("model/lid.176.ftz"),
+                home.resolve("model/segments.bin")
+        );
     }
 
     public static Path getAtagsPath() {
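
Note: after this change every LanguageModels call site drops the bloom-filter path and appends the segments path. A minimal sketch of the new call shape, mirroring the defaults above against an arbitrary base directory (the helper class and method names are illustrative):

    import java.nio.file.Path;
    import nu.marginalia.LanguageModels;

    class LanguageModelsExample {
        // Assembles the model bundle the same way WmsaHome does above
        static LanguageModels forDir(Path base) {
            return new LanguageModels(
                    base.resolve("model/tfreq-new-algo3.bin"),
                    base.resolve("model/opennlp-sentence.bin"),
                    base.resolve("model/English.RDR"),
                    base.resolve("model/English.DICT"),
                    base.resolve("model/opennlp-tok.bin"),
                    base.resolve("model/lid.176.ftz"),
                    base.resolve("model/segments.bin")
            );
        }
    }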
@@ -9,8 +9,6 @@ import nu.marginalia.actor.task.*;
 import nu.marginalia.functions.execution.api.*;
 import nu.marginalia.storage.model.FileStorageId;
 
-import java.nio.file.Path;
-
 @Singleton
 public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase {
     private final ExecutorActorControlService actorControlService;
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
@@ -1,6 +1,7 @@
 package nu.marginalia.functions.searchquery.query_parser.variant;
 
+import ca.rmen.porterstemmer.PorterStemmer;
 import com.google.inject.Inject;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
 import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
@@ -15,13 +16,15 @@ public class QueryExpansion {
     private static final PorterStemmer ps = new PorterStemmer();
     private final TermFrequencyDict dict;
+    private final NgramLexicon lexicon;
 
-    List<ExpansionStrategy> expansionStrategies = List.of(
+    private final List<ExpansionStrategy> expansionStrategies = List.of(
             this::joinDashes,
             this::splitWordNum,
             this::joinTerms,
+            this::createSegments
     );
 
     @Inject
     public QueryExpansion(TermFrequencyDict dict,
+                          NgramLexicon lexicon
     ) {
@@ -97,6 +100,7 @@ public class QueryExpansion {
 
         String[] words = nodes.stream().map(QWord::word).toArray(String[]::new);
 
+        // Look for known segments within the query
         for (int length = 2; length < Math.min(10, words.length); length++) {
             for (var segment : lexicon.findSegments(length, words)) {
                 int start = segment.start();
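
The createSegments strategy added above scans the query for multi-word segments the lexicon knows about, trying every window length from 2 up to (but not including) min(10, words.length). A standalone sketch of that scan, using stand-in types (Segment and Lexicon here are illustrative; the real types are NgramLexicon and SentenceSegment):

    import java.util.List;

    record Segment(int start, int length) {}      // stand-in for SentenceSegment

    interface Lexicon {                           // stand-in for NgramLexicon
        List<Segment> findSegments(int length, String... words);
    }

    class SegmentScan {
        static void scan(Lexicon lexicon, String[] words) {
            // Same loop shape as createSegments above
            for (int length = 2; length < Math.min(10, words.length); length++) {
                for (var segment : lexicon.findSegments(length, words)) {
                    System.out.println("segment at " + segment.start() + ", length " + length);
                }
            }
        }
    }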
@@ -1,40 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** Variant strategy that combines word that have dashes, as sometimes lawn-chair
- * gets spelled lawnchair */
-public class CombineDashes implements VariantStrategy {
-    final Pattern dashBoundary = Pattern.compile("-");
-
-    public CombineDashes() {
-    }
-
-    @Override
-    public Collection<? extends List<String>> constructVariants(List<QueryWord> words) {
-        List<String> asTokens2 = new ArrayList<>();
-        boolean dash = false;
-
-        for (var span : words) {
-            var matcher = dashBoundary.matcher(span.word);
-            if (matcher.find()) {
-                String combined = dashBoundary.matcher(span.word).replaceAll("");
-                asTokens2.add(combined);
-            }
-
-            asTokens2.add(span.word);
-        }
-
-        if (dash) {
-            return List.of(asTokens2);
-        }
-        return Collections.emptyList();
-    }
-}
@@ -1,58 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import ca.rmen.porterstemmer.PorterStemmer;
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-
-/** Variant strategy that merges tokens that are adjacent, where the combined token
- * has a high term frequency. That way we match 'lawnchair' with 'lawn chair' */
-public class JoinTerms implements VariantStrategy {
-    private final TermFrequencyDict dict;
-    private final PorterStemmer ps;
-
-    public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) {
-        this.dict = dict;
-        this.ps = ps;
-    }
-
-    @Override
-    public Collection<? extends List<String>> constructVariants(List<QueryWord> span) {
-        List<List<String>> ret = new ArrayList<>();
-
-        for (int i = 0; i < span.size()-1; i++) {
-            var a = span.get(i);
-            var b = span.get(i+1);
-
-            var stemmed = ps.stemWord(a.word + b.word);
-
-            double scoreCombo = dict.getTermFreqStemmed(stemmed);
-
-            if (scoreCombo > 10000) {
-                List<String> asTokens = new ArrayList<>();
-
-                for (int j = 0; j < i; j++) {
-                    var word = span.get(j).word;
-                    asTokens.add(word);
-                }
-                {
-                    var word = a.word + b.word;
-                    asTokens.add(word);
-                }
-                for (int j = i+2; j < span.size(); j++) {
-                    var word = span.get(j).word;
-                    asTokens.add(word);
-                }
-
-                ret.add(asTokens);
-            }
-
-        }
-
-        return ret;
-    }
-}
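
Though the JoinTerms strategy class is deleted, the same idea survives as QueryExpansion::joinTerms in the strategy list above. A condensed sketch of the core merge rule, with freqOf standing in for TermFrequencyDict.getTermFreqStemmed (the original stems the combined token before the lookup; names here are illustrative):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.function.ToDoubleFunction;

    class JoinTermsSketch {
        // Merge adjacent tokens when the combined token is itself frequent,
        // e.g. "lawn" + "chair" -> "lawnchair" if its frequency exceeds 10000
        static List<List<String>> joinAdjacent(List<String> words, ToDoubleFunction<String> freqOf) {
            List<List<String>> variants = new ArrayList<>();
            for (int i = 0; i + 1 < words.size(); i++) {
                String combined = words.get(i) + words.get(i + 1);
                if (freqOf.applyAsDouble(combined) > 10_000) {
                    List<String> variant = new ArrayList<>(words.subList(0, i));
                    variant.add(combined);
                    variant.addAll(words.subList(i + 2, words.size()));
                    variants.add(variant);
                }
            }
            return variants;
        }
    }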
@@ -1,65 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-import nu.marginalia.util.ngrams.NGramBloomFilter;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** Variant strategy that splits tokens at the boundary between a number and a word.
- */
-public class SplitWordNum implements VariantStrategy {
-
-
-    final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]");
-    private final NGramBloomFilter nGramBloomFilter;
-
-    public SplitWordNum(NGramBloomFilter nGramBloomFilter) {
-        this.nGramBloomFilter = nGramBloomFilter;
-    }
-
-    @Override
-    public Collection<? extends List<String>> constructVariants(List<QueryWord> ls) {
-        List<String> asTokens2 = new ArrayList<>();
-
-        boolean num = false;
-
-        for (var span : ls) {
-            var wordMatcher = numWordBoundary.matcher(span.word);
-            var stemmedMatcher = numWordBoundary.matcher(span.stemmed);
-
-            int ws = 0;
-            int ss = 0;
-            boolean didSplit = false;
-            while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) {
-                ws = wordMatcher.start()+1;
-                ss = stemmedMatcher.start()+1;
-                if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_"))
-                        || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-")))
-                {
-                    String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_");
-                    asTokens2.add(combined);
-                    didSplit = true;
-                    num = true;
-                }
-            }
-
-            if (!didSplit) {
-                asTokens2.add(span.word);
-            }
-        }
-
-        if (num) {
-            return List.of(asTokens2);
-        }
-        return Collections.emptyList();
-    }
-
-    private String splitAtNumBoundary(String in, int splitPoint, String joiner) {
-        return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1);
-    }
-}
@@ -1,8 +1,10 @@
 package nu.marginalia.functions.searchquery.segmentation;
 
+import com.google.inject.Inject;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
 import it.unimi.dsi.fastutil.longs.LongHash;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import nu.marginalia.LanguageModels;
 
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
@@ -24,6 +26,19 @@ public class NgramLexicon {
     private static final HasherGroup orderedHasher = HasherGroup.ordered();
     private static final HasherGroup unorderedHasher = HasherGroup.unordered();
 
+    @Inject
+    public NgramLexicon(LanguageModels models) {
+        try {
+            loadCounts(models.segments);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public NgramLexicon() {
+
+    }
+
     public List<SentenceSegment> findSegments(int length, String... parts) {
         // Don't look for ngrams longer than the sentence
         if (parts.length < length) return List.of();
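
With the new @Inject constructor, the lexicon loads its counts from LanguageModels.segments at construction time, while the no-arg constructor keeps an empty lexicon available for tests. A hedged usage sketch (the word list is illustrative):

    import nu.marginalia.LanguageModels;
    import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;

    class LexiconDemo {
        // models.segments must point at a readable segments.bin, or the
        // constructor above rethrows the IOException as a RuntimeException
        static void demo(LanguageModels models) {
            NgramLexicon lexicon = new NgramLexicon(models);
            var segments = lexicon.findSegments(2, "lawn", "chair", "cushions");
            System.out.println(segments);
        }
    }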
@@ -1,69 +0,0 @@
-package nu.marginalia.util.ngrams;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
-import java.util.BitSet;
-
-// It's unclear why this exists, we should probably use a BitSet instead?
-// Chesterton's fence?
-public class DenseBitMap {
-    public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
-
-    public final long cardinality;
-    private final ByteBuffer buffer;
-
-    public DenseBitMap(long cardinality) {
-        this.cardinality = cardinality;
-
-        boolean misaligned = (cardinality & 7) > 0;
-        this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
-    }
-
-    public static DenseBitMap loadFromFile(Path file) throws IOException {
-        long size = Files.size(file);
-        var dbm = new DenseBitMap(size/8);
-
-        try (var bc = Files.newByteChannel(file)) {
-            while (dbm.buffer.position() < dbm.buffer.capacity()) {
-                bc.read(dbm.buffer);
-            }
-        }
-        dbm.buffer.clear();
-
-        return dbm;
-    }
-
-    public void writeToFile(Path file) throws IOException {
-
-        try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
-            while (buffer.position() < buffer.capacity()) {
-                bc.write(buffer);
-            }
-        }
-
-        buffer.clear();
-    }
-
-    public boolean get(long pos) {
-        return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
-    }
-
-    /** Set the bit indexed by pos, returns
-     * its previous value.
-     */
-    public boolean set(long pos) {
-        int offset = (int) (pos >>> 3);
-        int oldVal = buffer.get(offset);
-        int mask = (byte) 1 << (int) (pos & 7);
-        buffer.put(offset, (byte) (oldVal | mask));
-        return (oldVal & mask) != 0;
-    }
-
-    public void clear(long pos) {
-        int offset = (int)(pos >>> 3);
-        buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
-    }
-}
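
For reference, the retired DenseBitMap addressed bits with byte index pos >>> 3 and bit index pos & 7. A minimal self-contained sketch of the same addressing over a plain byte array (heap-backed here, where the original used a direct ByteBuffer):

    class BitMapSketch {
        private final byte[] bits;

        BitMapSketch(long cardinality) {
            // One byte per 8 bits, rounding up for misaligned sizes
            bits = new byte[(int) ((cardinality + 7) / 8)];
        }

        boolean get(long pos) {
            return (bits[(int) (pos >>> 3)] & (1 << (int) (pos & 7))) != 0;
        }

        // Sets the bit and returns its previous value, like the original set()
        boolean set(long pos) {
            int offset = (int) (pos >>> 3);
            int mask = 1 << (int) (pos & 7);
            boolean old = (bits[offset] & mask) != 0;
            bits[offset] |= (byte) mask;
            return old;
        }
    }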
@@ -1,64 +0,0 @@
-package nu.marginalia.util.ngrams;
-
-import ca.rmen.porterstemmer.PorterStemmer;
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-import com.google.inject.Inject;
-import nu.marginalia.LanguageModels;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.regex.Pattern;
-
-public class NGramBloomFilter {
-    private final DenseBitMap bitMap;
-    private static final PorterStemmer ps = new PorterStemmer();
-    private static final HashFunction hasher = Hashing.murmur3_128(0);
-
-    private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class);
-
-    @Inject
-    public NGramBloomFilter(LanguageModels lm) throws IOException {
-        this(loadSafely(lm.ngramBloomFilter));
-    }
-
-    private static DenseBitMap loadSafely(Path path) throws IOException {
-        if (Files.isRegularFile(path)) {
-            return DenseBitMap.loadFromFile(path);
-        }
-        else {
-            logger.warn("NGrams file missing " + path);
-            return new DenseBitMap(1);
-        }
-    }
-
-    public NGramBloomFilter(DenseBitMap bitMap) {
-        this.bitMap = bitMap;
-    }
-
-    public boolean isKnownNGram(String word) {
-        long bit = bitForWord(word, bitMap.cardinality);
-
-        return bitMap.get(bit);
-    }
-
-    public static NGramBloomFilter load(Path file) throws IOException {
-        return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
-    }
-
-    private static final Pattern underscore = Pattern.compile("_");
-
-    private static long bitForWord(String s, long n) {
-        String[] parts = underscore.split(s);
-        long hc = 0;
-        for (String part : parts) {
-            hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
-        }
-        return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
-    }
-
-}
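
The deleted filter was in effect a single-hash Bloom filter: each ngram maps to one bit, so membership tests could yield false positives but never false negatives. A compact sketch of the same hashing scheme, reusing the Guava murmur3 and Porter-stemmer calls from the deleted code above:

    import ca.rmen.porterstemmer.PorterStemmer;
    import com.google.common.hash.HashFunction;
    import com.google.common.hash.Hashing;
    import java.nio.charset.StandardCharsets;

    class BitForWordSketch {
        private static final PorterStemmer ps = new PorterStemmer();
        private static final HashFunction hasher = Hashing.murmur3_128(0);

        // Stem each underscore-separated part, hash it, fold the hashes
        // positionally, and reduce modulo the n bits of the bitmap
        static long bitForWord(String s, long n) {
            long hc = 0;
            for (String part : s.split("_")) {
                hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
            }
            return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
        }
    }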
@@ -9,7 +9,6 @@ import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.index.query.limit.SpecificationLimitType;
 import nu.marginalia.util.language.EnglishDictionary;
-import nu.marginalia.util.ngrams.NGramBloomFilter;
 import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.junit.jupiter.api.BeforeAll;
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();
 
         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }
@@ -26,7 +26,7 @@ download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen
 download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
 download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
 download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
-download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
+download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
 download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz