(qs) Retire NGramBloomFilter, integrate new segmentation model instead

Viktor Lofgren 2024-03-19 10:33:29 +01:00
parent afc047cd27
commit d8f4e7d72b
17 changed files with 39 additions and 318 deletions

View File

@@ -3,7 +3,6 @@ package nu.marginalia;

 import java.nio.file.Path;

 public class LanguageModels {
-    public final Path ngramBloomFilter;
     public final Path termFrequencies;

     public final Path openNLPSentenceDetectionData;
@@ -11,20 +10,21 @@ public class LanguageModels {
     public final Path posDict;
     public final Path openNLPTokenData;
     public final Path fasttextLanguageModel;
+    public final Path segments;

-    public LanguageModels(Path ngramBloomFilter,
-                          Path termFrequencies,
+    public LanguageModels(Path termFrequencies,
                           Path openNLPSentenceDetectionData,
                           Path posRules,
                           Path posDict,
                           Path openNLPTokenData,
-                          Path fasttextLanguageModel) {
-        this.ngramBloomFilter = ngramBloomFilter;
+                          Path fasttextLanguageModel,
+                          Path segments) {
         this.termFrequencies = termFrequencies;
         this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
         this.posRules = posRules;
         this.posDict = posDict;
         this.openNLPTokenData = openNLPTokenData;
         this.fasttextLanguageModel = fasttextLanguageModel;
+        this.segments = segments;
     }
 }

View File

@@ -85,13 +85,14 @@ public class WmsaHome {
         final Path home = getHomePath();

         return new LanguageModels(
-                home.resolve("model/ngrams.bin"),
                 home.resolve("model/tfreq-new-algo3.bin"),
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),
                 home.resolve("model/English.DICT"),
                 home.resolve("model/opennlp-tok.bin"),
-                home.resolve("model/lid.176.ftz"));
+                home.resolve("model/lid.176.ftz"),
+                home.resolve("model/segments.bin")
+        );
     }

     public static Path getAtagsPath() {

View File

@@ -9,8 +9,6 @@ import nu.marginalia.actor.task.*;
 import nu.marginalia.functions.execution.api.*;
 import nu.marginalia.storage.model.FileStorageId;

-import java.nio.file.Path;

 @Singleton
 public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase {
     private final ExecutorActorControlService actorControlService;

View File

@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();

         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }

View File

@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();

         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }

View File

@@ -1,6 +1,7 @@
 package nu.marginalia.functions.searchquery.query_parser.variant;

 import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.inject.Inject;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord;
 import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph;
 import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;
@@ -15,13 +16,15 @@ public class QueryExpansion {
     private static final PorterStemmer ps = new PorterStemmer();
     private final TermFrequencyDict dict;
     private final NgramLexicon lexicon;
-    List<ExpansionStrategy> expansionStrategies = List.of(
+    private final List<ExpansionStrategy> expansionStrategies = List.of(
             this::joinDashes,
             this::splitWordNum,
             this::joinTerms,
             this::createSegments
     );

+    @Inject
     public QueryExpansion(TermFrequencyDict dict,
                           NgramLexicon lexicon
     ) {
@@ -97,6 +100,7 @@ public class QueryExpansion {
         String[] words = nodes.stream().map(QWord::word).toArray(String[]::new);

+        // Look for known segments within the query
         for (int length = 2; length < Math.min(10, words.length); length++) {
             for (var segment : lexicon.findSegments(length, words)) {
                 int start = segment.start();

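For context on the hunk above: QueryExpansion::createSegments slides a window of two to nine words over the query and asks NgramLexicon whether that window is a known segment. A minimal standalone sketch of the same scan, using a plain HashSet in place of the lexicon (the class below and its contents are illustrative only, not part of this commit):

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

class SegmentScanSketch {
    // Stand-in for NgramLexicon: known multi-word segments, joined with '_'.
    private static final Set<String> KNOWN = Set.of("new_york_city", "lawn_chair");

    /** Return the start index of every window of `length` words that forms a known segment. */
    static List<Integer> findSegments(int length, String... words) {
        List<Integer> hits = new ArrayList<>();
        for (int start = 0; start + length <= words.length; start++) {
            String joined = String.join("_", List.of(words).subList(start, start + length));
            if (KNOWN.contains(joined))
                hits.add(start);
        }
        return hits;
    }
}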
View File

@@ -1,40 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** Variant strategy that combines word that have dashes, as sometimes lawn-chair
- * gets spelled lawnchair */
-public class CombineDashes implements VariantStrategy {
-    final Pattern dashBoundary = Pattern.compile("-");
-
-    public CombineDashes() {
-    }
-
-    @Override
-    public Collection<? extends List<String>> constructVariants(List<QueryWord> words) {
-        List<String> asTokens2 = new ArrayList<>();
-        boolean dash = false;
-
-        for (var span : words) {
-            var matcher = dashBoundary.matcher(span.word);
-            if (matcher.find()) {
-                String combined = dashBoundary.matcher(span.word).replaceAll("");
-                asTokens2.add(combined);
-            }
-
-            asTokens2.add(span.word);
-        }
-
-        if (dash) {
-            return List.of(asTokens2);
-        }
-
-        return Collections.emptyList();
-    }
-}

View File

@@ -1,58 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import ca.rmen.porterstemmer.PorterStemmer;
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-
-/** Variant strategy that merges tokens that are adjacent, where the combined token
- * has a high term frequency. That way we match 'lawnchair' with 'lawn chair' */
-public class JoinTerms implements VariantStrategy {
-    private final TermFrequencyDict dict;
-    private final PorterStemmer ps;
-
-    public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) {
-        this.dict = dict;
-        this.ps = ps;
-    }
-
-    @Override
-    public Collection<? extends List<String>> constructVariants(List<QueryWord> span) {
-        List<List<String>> ret = new ArrayList<>();
-
-        for (int i = 0; i < span.size()-1; i++) {
-            var a = span.get(i);
-            var b = span.get(i+1);
-
-            var stemmed = ps.stemWord(a.word + b.word);
-
-            double scoreCombo = dict.getTermFreqStemmed(stemmed);
-
-            if (scoreCombo > 10000) {
-                List<String> asTokens = new ArrayList<>();
-
-                for (int j = 0; j < i; j++) {
-                    var word = span.get(j).word;
-                    asTokens.add(word);
-                }
-                {
-                    var word = a.word + b.word;
-                    asTokens.add(word);
-                }
-                for (int j = i+2; j < span.size(); j++) {
-                    var word = span.get(j).word;
-                    asTokens.add(word);
-                }
-
-                ret.add(asTokens);
-            }
-        }
-
-        return ret;
-    }
-}

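The heuristic this removed strategy implemented, merging two adjacent words when their concatenation is itself a frequent term, lives on in QueryExpansion as the joinTerms strategy listed earlier in this diff. A simplified sketch of the idea, with stemming omitted and a made-up frequency table standing in for TermFrequencyDict (nothing below is code from the commit):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

class JoinAdjacentSketch {
    // Stand-in for TermFrequencyDict.getTermFreqStemmed(); the value is made up.
    private static final Map<String, Integer> TERM_FREQ = Map.of("lawnchair", 250_000);

    /** For each adjacent pair whose concatenation is a common term, emit a variant
     *  of the query with that pair merged into one word. */
    static List<List<String>> joinAdjacent(List<String> words) {
        List<List<String>> variants = new ArrayList<>();
        for (int i = 0; i + 1 < words.size(); i++) {
            String joined = words.get(i) + words.get(i + 1);
            if (TERM_FREQ.getOrDefault(joined, 0) > 10_000) {   // same threshold as the removed code
                List<String> variant = new ArrayList<>(words.subList(0, i));
                variant.add(joined);
                variant.addAll(words.subList(i + 2, words.size()));
                variants.add(variant);
            }
        }
        return variants;
    }
}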
View File

@@ -1,65 +0,0 @@
-package nu.marginalia.functions.searchquery.query_parser.variant.strategy;
-
-import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord;
-import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy;
-import nu.marginalia.util.ngrams.NGramBloomFilter;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.regex.Pattern;
-
-/** Variant strategy that splits tokens at the boundary between a number and a word.
- */
-public class SplitWordNum implements VariantStrategy {
-    final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]");
-    private final NGramBloomFilter nGramBloomFilter;
-
-    public SplitWordNum(NGramBloomFilter nGramBloomFilter) {
-        this.nGramBloomFilter = nGramBloomFilter;
-    }
-
-    @Override
-    public Collection<? extends List<String>> constructVariants(List<QueryWord> ls) {
-        List<String> asTokens2 = new ArrayList<>();
-
-        boolean num = false;
-
-        for (var span : ls) {
-            var wordMatcher = numWordBoundary.matcher(span.word);
-            var stemmedMatcher = numWordBoundary.matcher(span.stemmed);
-
-            int ws = 0;
-            int ss = 0;
-
-            boolean didSplit = false;
-            while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) {
-                ws = wordMatcher.start()+1;
-                ss = stemmedMatcher.start()+1;
-                if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_"))
-                    || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-")))
-                {
-                    String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_");
-                    asTokens2.add(combined);
-                    didSplit = true;
-                    num = true;
-                }
-            }
-
-            if (!didSplit) {
-                asTokens2.add(span.word);
-            }
-        }
-
-        if (num) {
-            return List.of(asTokens2);
-        }
-        return Collections.emptyList();
-    }
-
-    private String splitAtNumBoundary(String in, int splitPoint, String joiner) {
-        return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1);
-    }
-}

View File

@@ -1,8 +1,10 @@
 package nu.marginalia.functions.searchquery.segmentation;

+import com.google.inject.Inject;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap;
 import it.unimi.dsi.fastutil.longs.LongHash;
 import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
+import nu.marginalia.LanguageModels;

 import java.io.DataInputStream;
 import java.io.DataOutputStream;
@@ -24,6 +26,19 @@ public class NgramLexicon {
     private static final HasherGroup orderedHasher = HasherGroup.ordered();
     private static final HasherGroup unorderedHasher = HasherGroup.unordered();

+    @Inject
+    public NgramLexicon(LanguageModels models) {
+        try {
+            loadCounts(models.segments);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public NgramLexicon() {
+    }
+
     public List<SentenceSegment> findSegments(int length, String... parts) {
         // Don't look for ngrams longer than the sentence
         if (parts.length < length) return List.of();

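With the @Inject constructor added above, Guice can provide NgramLexicon wherever LanguageModels is bound, and the segments model is loaded eagerly at construction time. A rough usage sketch; direct construction is shown only for clarity, and the getLanguageModels() accessor on WmsaHome is an assumption, it does not appear in this diff:

import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;   // package assumed; not shown in this diff
import nu.marginalia.functions.searchquery.segmentation.NgramLexicon;

class NgramLexiconUsageSketch {
    public static void main(String[] args) {
        LanguageModels models = WmsaHome.getLanguageModels();   // assumed accessor; includes model/segments.bin
        NgramLexicon lexicon = new NgramLexicon(models);        // loads the segments model via loadCounts()

        String[] words = { "new", "york", "city", "subway" };
        for (var segment : lexicon.findSegments(3, words)) {
            System.out.println("known 3-gram starting at index " + segment.start());
        }
    }
}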
View File

@@ -1,69 +0,0 @@
-package nu.marginalia.util.ngrams;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
-import java.util.BitSet;
-
-// It's unclear why this exists, we should probably use a BitSet instead?
-// Chesterton's fence?
-public class DenseBitMap {
-    public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
-
-    public final long cardinality;
-    private final ByteBuffer buffer;
-
-    public DenseBitMap(long cardinality) {
-        this.cardinality = cardinality;
-
-        boolean misaligned = (cardinality & 7) > 0;
-        this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
-    }
-
-    public static DenseBitMap loadFromFile(Path file) throws IOException {
-        long size = Files.size(file);
-        var dbm = new DenseBitMap(size/8);
-
-        try (var bc = Files.newByteChannel(file)) {
-            while (dbm.buffer.position() < dbm.buffer.capacity()) {
-                bc.read(dbm.buffer);
-            }
-        }
-        dbm.buffer.clear();
-
-        return dbm;
-    }
-
-    public void writeToFile(Path file) throws IOException {
-        try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
-            while (buffer.position() < buffer.capacity()) {
-                bc.write(buffer);
-            }
-        }
-
-        buffer.clear();
-    }
-
-    public boolean get(long pos) {
-        return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
-    }
-
-    /** Set the bit indexed by pos, returns
-     * its previous value.
-     */
-    public boolean set(long pos) {
-        int offset = (int) (pos >>> 3);
-        int oldVal = buffer.get(offset);
-        int mask = (byte) 1 << (int) (pos & 7);
-        buffer.put(offset, (byte) (oldVal | mask));
-        return (oldVal & mask) != 0;
-    }
-
-    public void clear(long pos) {
-        int offset = (int)(pos >>> 3);
-
-        buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7))));
-    }
-}

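The comment at the top of the removed class asks why it is not simply a java.util.BitSet. The trade-off it hints at: BitSet covers the same get/set/clear operations but is heap-allocated and int-indexed (roughly 2^31 bits at most), whereas DenseBitMap addressed up to about 16 billion positions in an off-heap direct ByteBuffer and could be streamed to and from disk. A comparison-only sketch, not code from this commit:

import java.util.BitSet;

class BitSetComparisonSketch {
    public static void main(String[] args) {
        BitSet bits = new BitSet();

        bits.set(1234);                  // DenseBitMap.set(pos), minus the "previous value" return
        boolean known = bits.get(1234);  // DenseBitMap.get(pos)
        bits.clear(1234);                // DenseBitMap.clear(pos)

        System.out.println(known);       // prints: true
    }
}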
View File

@@ -1,64 +0,0 @@
-package nu.marginalia.util.ngrams;
-
-import ca.rmen.porterstemmer.PorterStemmer;
-import com.google.common.hash.HashFunction;
-import com.google.common.hash.Hashing;
-import com.google.inject.Inject;
-import nu.marginalia.LanguageModels;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.regex.Pattern;
-
-public class NGramBloomFilter {
-    private final DenseBitMap bitMap;
-    private static final PorterStemmer ps = new PorterStemmer();
-    private static final HashFunction hasher = Hashing.murmur3_128(0);
-
-    private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class);
-
-    @Inject
-    public NGramBloomFilter(LanguageModels lm) throws IOException {
-        this(loadSafely(lm.ngramBloomFilter));
-    }
-
-    private static DenseBitMap loadSafely(Path path) throws IOException {
-        if (Files.isRegularFile(path)) {
-            return DenseBitMap.loadFromFile(path);
-        }
-        else {
-            logger.warn("NGrams file missing " + path);
-            return new DenseBitMap(1);
-        }
-    }
-
-    public NGramBloomFilter(DenseBitMap bitMap) {
-        this.bitMap = bitMap;
-    }
-
-    public boolean isKnownNGram(String word) {
-        long bit = bitForWord(word, bitMap.cardinality);
-
-        return bitMap.get(bit);
-    }
-
-    public static NGramBloomFilter load(Path file) throws IOException {
-        return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
-    }
-
-    private static final Pattern underscore = Pattern.compile("_");
-
-    private static long bitForWord(String s, long n) {
-        String[] parts = underscore.split(s);
-        long hc = 0;
-        for (String part : parts) {
-            hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
-        }
-        return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
-    }
-}

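For the record, membership in the retired filter worked like this: each underscore-separated part of the n-gram was Porter-stemmed and murmur3-hashed, the hashes were folded together, and the result selected a single bit in the DenseBitMap, so any hash collision read as a false positive. A toy illustration of that one-bit-per-entry scheme (String.hashCode() stands in for the stem-and-murmur3 pipeline; none of this is code from the commit):

import java.util.BitSet;

class OneBitMembershipSketch {
    private static final int CAPACITY = 1 << 20;
    private final BitSet bits = new BitSet(CAPACITY);

    private int bitFor(String ngram) {
        // Stand-in for NGramBloomFilter.bitForWord(): hash the (stemmed) parts, fold, take the modulo.
        return Math.floorMod(ngram.hashCode(), CAPACITY);
    }

    void add(String ngram)             { bits.set(bitFor(ngram)); }
    boolean mightContain(String ngram) { return bits.get(bitFor(ngram)); }  // may be a false positive
}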
View File

@@ -9,7 +9,6 @@ import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.index.query.limit.SpecificationLimitType;
 import nu.marginalia.util.language.EnglishDictionary;
-import nu.marginalia.util.ngrams.NGramBloomFilter;
 import nu.marginalia.api.searchquery.model.query.QueryParams;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.junit.jupiter.api.BeforeAll;

View File

@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();

         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }

View File

@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();

         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }

View File

@@ -26,13 +26,13 @@ public class TestLanguageModels {
         var languageModelsHome = getLanguageModelsPath();

         return new LanguageModels(
-                languageModelsHome.resolve("ngrams.bin"),
                 languageModelsHome.resolve("tfreq-new-algo3.bin"),
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
                 languageModelsHome.resolve("opennlp-tokens.bin"),
-                languageModelsHome.resolve("lid.176.ftz")
+                languageModelsHome.resolve("lid.176.ftz"),
+                languageModelsHome.resolve("segments.bin")
         );
     }
 }

View File

@@ -26,7 +26,7 @@ download_model model/English.DICT https://raw.githubusercontent.com/datquocnguye
 download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR
 download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin
 download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
-download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
+download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
 download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz