(language) fasttext based language filter

This commit is contained in:
Viktor Lofgren 2023-08-16 15:48:12 +02:00
parent 4598c7f40f
commit 46d761f34f
20 changed files with 189 additions and 66 deletions

View File

@@ -10,13 +10,21 @@ public class LanguageModels {
public final Path posRules; public final Path posRules;
public final Path posDict; public final Path posDict;
public final Path openNLPTokenData; public final Path openNLPTokenData;
public final Path fasttextLanguageModel;
public LanguageModels(Path ngramBloomFilter, Path termFrequencies, Path openNLPSentenceDetectionData, Path posRules, Path posDict, Path openNLPTokenData) { public LanguageModels(Path ngramBloomFilter,
Path termFrequencies,
Path openNLPSentenceDetectionData,
Path posRules,
Path posDict,
Path openNLPTokenData,
Path fasttextLanguageModel) {
this.ngramBloomFilter = ngramBloomFilter; this.ngramBloomFilter = ngramBloomFilter;
this.termFrequencies = termFrequencies; this.termFrequencies = termFrequencies;
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData; this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
this.posRules = posRules; this.posRules = posRules;
this.posDict = posDict; this.posDict = posDict;
this.openNLPTokenData = openNLPTokenData; this.openNLPTokenData = openNLPTokenData;
this.fasttextLanguageModel = fasttextLanguageModel;
} }
} }

View File

@@ -87,7 +87,8 @@ public class WmsaHome {
home.resolve("model/opennlp-sentence.bin"), home.resolve("model/opennlp-sentence.bin"),
home.resolve("model/English.RDR"), home.resolve("model/English.RDR"),
home.resolve("model/English.DICT"), home.resolve("model/English.DICT"),
home.resolve("model/opennlp-tok.bin")); home.resolve("model/opennlp-tok.bin"),
home.resolve("model/lid.176.ftz"));
} }
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug"); private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");

View File

@@ -31,7 +31,8 @@ public class TestLanguageModels {
languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.RDR"),
languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("English.DICT"),
languageModelsHome.resolve("opennlp-tokens.bin") languageModelsHome.resolve("opennlp-tokens.bin"),
languageModelsHome.resolve("lid.176.ftz")
); );
} }
} }

View File

@@ -31,7 +31,8 @@ public class TestLanguageModels {
languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.RDR"),
languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("English.DICT"),
languageModelsHome.resolve("opennlp-tokens.bin") languageModelsHome.resolve("opennlp-tokens.bin"),
languageModelsHome.resolve("lid.176.ftz")
); );
} }
} }

View File

@@ -16,13 +16,15 @@ public class DocumentLanguageData {
public final DocumentSentence[] sentences; public final DocumentSentence[] sentences;
public final DocumentSentence[] titleSentences; public final DocumentSentence[] titleSentences;
public final TObjectIntHashMap<String> wordCount; public final TObjectIntHashMap<String> wordCount;
public final String text;
/** for test convenience */ /** for test convenience */
public static DocumentLanguageData empty() { public static DocumentLanguageData empty() {
return new DocumentLanguageData( return new DocumentLanguageData(
new DocumentSentence[0], new DocumentSentence[0],
new DocumentSentence[0], new DocumentSentence[0],
new TObjectIntHashMap<>() new TObjectIntHashMap<>(),
""
); );
} }

View File

@@ -71,7 +71,7 @@ public class SentenceExtractor {
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences); TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase()); var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts); return new DocumentLanguageData(textSentences, titleSentences, counts, text);
} }
public DocumentLanguageData extractSentences(String text, String title) { public DocumentLanguageData extractSentences(String text, String title) {
@@ -79,7 +79,7 @@ public class SentenceExtractor {
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences); TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase()); var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts); return new DocumentLanguageData(textSentences, titleSentences, counts, text);
} }
private String getTitle(Document doc, DocumentSentence[] textSentences) { private String getTitle(Document doc, DocumentSentence[] textSentences) {

View File

@@ -72,6 +72,7 @@ dependencies {
implementation libs.zstd implementation libs.zstd
implementation libs.bundles.mariadb implementation libs.bundles.mariadb
implementation libs.bundles.nlp
implementation libs.trove implementation libs.trove
implementation libs.fastutil implementation libs.fastutil

View File

@@ -0,0 +1,26 @@
package nu.marginalia.converting.language;
import com.github.jfasttext.JFastText;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.model.DocumentLanguageData;
public class FasttextLanguagePredictionModel implements LanguagePredictionModel {
private final JFastText jft = new JFastText();
public FasttextLanguagePredictionModel(LanguageModels lm) throws Exception {
jft.loadModel(lm.fasttextLanguageModel.toString());
}
@Override
public double predictEnglish(DocumentLanguageData dld) {
if ("__label__en".equals(jft.predict(dld.text))) {
return 1.0;
}
return 0.;
}
@Override
public boolean hasPoorAccuracy() {
return false;
}
}

View File

@@ -1,5 +1,7 @@
package nu.marginalia.converting.language; package nu.marginalia.converting.language;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.encoding.UnicodeRanges; import nu.marginalia.language.encoding.UnicodeRanges;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@@ -8,10 +10,6 @@ import org.slf4j.LoggerFactory;
import javax.inject.Inject; import javax.inject.Inject;
import javax.inject.Singleton; import javax.inject.Singleton;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
@@ -20,48 +18,29 @@ public class LanguageFilter {
private static final Set<String> interestingLanguages = Set.of("en", "en-us", "en-gb", "eng", "english"); private static final Set<String> interestingLanguages = Set.of("en", "en-us", "en-gb", "eng", "english");
private static final Set<String> englishWords = new HashSet<>();
private static final Logger logger = LoggerFactory.getLogger(LanguageFilter.class); private static final Logger logger = LoggerFactory.getLogger(LanguageFilter.class);
static {
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
"Could not load word frequency table");
var br = new BufferedReader(new InputStreamReader(resource))
) {
for (;;) {
String s = br.readLine();
if (s == null) {
break;
}
englishWords.add(s.toLowerCase());
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
} private final LanguagePredictionModel languagePredictionModel;
/** Returns the probability the language is in English */
public double dictionaryAgreement(DocumentLanguageData dld) { public double dictionaryAgreement(DocumentLanguageData dld) {
Set<String> seenWords = new HashSet<>(); return languagePredictionModel.predictEnglish(dld);
int englishCount = 0;
for (var sent : dld.sentences) {
for (var word : sent.wordsLowerCase) {
if (seenWords.add(word) && englishWords.contains(word)) {
englishCount++;
}
}
}
double englishAgreement = englishCount / (double) Math.min(seenWords.size(), englishWords.size());
logger.debug("Agreement: {}", englishAgreement);
return englishAgreement;
} }
@Inject @Inject
public LanguageFilter() { @SneakyThrows
public LanguageFilter(LanguageModels lm) {
try {
if (Boolean.getBoolean("disable-fasttext")) {
languagePredictionModel = new UngaBungaLanguagePredictionModel();
}
else {
languagePredictionModel = new FasttextLanguagePredictionModel(lm);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
} }
public Optional<Boolean> isPageInterestingByHtmlTag(Document parsed) { public Optional<Boolean> isPageInterestingByHtmlTag(Document parsed) {
@@ -72,20 +51,17 @@ public class LanguageFilter {
.map(interestingLanguages::contains); .map(interestingLanguages::contains);
} }
public Optional<Boolean> isPageInterestingByMetaLanguage(Document parsed) {
return parsed.getElementsByTag("meta").stream().filter(elem -> "content-language".equalsIgnoreCase(elem.attr("http-equiv")))
.map(elem -> elem.attr("content"))
.filter(s -> !s.isBlank())
.map(String::toLowerCase)
.map(interestingLanguages::contains)
.findAny();
}
public boolean isBlockedUnicodeRange(String data) { public boolean isBlockedUnicodeRange(String data) {
if (!languagePredictionModel.hasPoorAccuracy()) {
return false;
}
for (var range: UnicodeRanges.values()) { for (var range: UnicodeRanges.values()) {
if (range.test(data)) if (range.test(data))
return true; return true;
} }
return false; return false;
} }
} }

View File

@@ -0,0 +1,11 @@
package nu.marginalia.converting.language;
import nu.marginalia.language.model.DocumentLanguageData;
public interface LanguagePredictionModel {
/** Returns the probability the language is in English */
double predictEnglish(DocumentLanguageData dld);
boolean hasPoorAccuracy();
}

View File

@@ -0,0 +1,51 @@
package nu.marginalia.converting.language;
import nu.marginalia.language.model.DocumentLanguageData;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel {
private static final Set<String> englishWords = new HashSet<>();
public UngaBungaLanguagePredictionModel() throws Exception {
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
"Could not load word frequency table");
var br = new BufferedReader(new InputStreamReader(resource))
) {
for (;;) {
String s = br.readLine();
if (s == null) {
break;
}
englishWords.add(s.toLowerCase());
}
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
}
@Override
public double predictEnglish(DocumentLanguageData dld) {
Set<String> seenWords = new HashSet<>();
int englishCount = 0;
for (var sent : dld.sentences) {
for (var word : sent.wordsLowerCase) {
if (seenWords.add(word) && englishWords.contains(word)) {
englishCount++;
}
}
}
return englishCount / (double) Math.min(seenWords.size(), englishWords.size());
}
@Override
public boolean hasPoorAccuracy() {
return true;
}
}

View File

@@ -17,7 +17,10 @@ import java.net.URISyntaxException;
import java.util.*; import java.util.*;
public abstract class AbstractDocumentProcessorPlugin { public abstract class AbstractDocumentProcessorPlugin {
protected LanguageFilter languageFilter = new LanguageFilter(); protected LanguageFilter languageFilter;
public AbstractDocumentProcessorPlugin(LanguageFilter languageFilter) {
this.languageFilter = languageFilter;
}
public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException; public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException;
public abstract boolean isApplicable(CrawledDocument doc); public abstract boolean isApplicable(CrawledDocument doc);

View File

@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.processor.MetaRobotsTag; import nu.marginalia.converting.processor.MetaRobotsTag;
import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor; import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
@@ -64,6 +65,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
@Inject @Inject
public HtmlDocumentProcessorPlugin( public HtmlDocumentProcessorPlugin(
@Named("min-document-quality") Double minDocumentQuality, @Named("min-document-quality") Double minDocumentQuality,
LanguageFilter languageFilter,
SentenceExtractor sentenceExtractor, SentenceExtractor sentenceExtractor,
FeatureExtractor featureExtractor, FeatureExtractor featureExtractor,
TitleExtractor titleExtractor, TitleExtractor titleExtractor,
@@ -74,6 +76,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
DocumentGeneratorExtractor documentGeneratorExtractor, DocumentGeneratorExtractor documentGeneratorExtractor,
HtmlProcessorSpecializations specializations) HtmlProcessorSpecializations specializations)
{ {
super(languageFilter);
this.documentLengthLogic = documentLengthLogic; this.documentLengthLogic = documentLengthLogic;
this.minDocumentQuality = minDocumentQuality; this.minDocumentQuality = minDocumentQuality;
this.sentenceExtractor = sentenceExtractor; this.sentenceExtractor = sentenceExtractor;

View File

@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.name.Named; import com.google.inject.name.Named;
import nu.marginalia.converting.language.LanguageFilter;
import nu.marginalia.converting.processor.logic.DocumentLengthLogic; import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.CrawledDomain;
@@ -38,11 +39,13 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
@Inject @Inject
public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength, public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
LanguageFilter languageFilter,
SentenceExtractor sentenceExtractor, SentenceExtractor sentenceExtractor,
DocumentKeywordExtractor keywordExtractor, DocumentKeywordExtractor keywordExtractor,
DocumentLengthLogic documentLengthLogic DocumentLengthLogic documentLengthLogic
) )
{ {
super(languageFilter);
this.documentLengthLogic = documentLengthLogic; this.documentLengthLogic = documentLengthLogic;
this.maxTitleLength = maxTitleLength; this.maxTitleLength = maxTitleLength;
this.sentenceExtractor = sentenceExtractor; this.sentenceExtractor = sentenceExtractor;

View File

@@ -1,5 +1,6 @@
package nu.marginalia.converting.language; package nu.marginalia.converting.language;
import nu.marginalia.converting.util.TestLanguageModels;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@@ -10,16 +11,10 @@ class LanguageFilterTest {
@Test @Test
void isPageInteresting() { void isPageInteresting() {
var languageFilter = new LanguageFilter(); var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels());
assertTrue(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html></html>")).orElse(true)); assertTrue(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html></html>")).orElse(true));
assertTrue(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html lang=\"en\"></html>")).orElse(false)); assertTrue(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html lang=\"en\"></html>")).orElse(false));
assertFalse(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html lang=\"no\"></html>")).orElse(false)); assertFalse(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html lang=\"no\"></html>")).orElse(false));
} }
@Test
public void isStringChinese() {
var languageFilter = new LanguageFilter();
assertTrue(languageFilter.isBlockedUnicodeRange("溶岩ドームの手前に広がる斜面木が生えているところは普賢岳の山体です今回の噴火にともないこのあたりの山体がマグマに押されて変形し写真では左にむかって100mほどせりだしました\n"));
}
} }

View File

@@ -0,0 +1,38 @@
package nu.marginalia.converting.util;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;
public class TestLanguageModels {
private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
public static Path getLanguageModelsPath() {
final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
.map(Path::of)
.orElse(LANGUAGE_MODELS_DEFAULT);
if (!Files.isDirectory(languageModelsHome)) {
throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
}
return languageModelsHome;
}
public static LanguageModels getLanguageModels() {
var languageModelsHome = getLanguageModelsPath();
return new LanguageModels(
languageModelsHome.resolve("ngrams.bin"),
languageModelsHome.resolve("tfreq-new-algo3.bin"),
languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"),
languageModelsHome.resolve("English.DICT"),
languageModelsHome.resolve("opennlp-tokens.bin"),
languageModelsHome.resolve("lid.176.ftz")
);
}
}

View File

@@ -31,7 +31,8 @@ public class TestLanguageModels {
languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.RDR"),
languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("English.DICT"),
languageModelsHome.resolve("opennlp-tokens.bin") languageModelsHome.resolve("opennlp-tokens.bin"),
languageModelsHome.resolve("lid.176.ftz")
); );
} }
} }

View File

@@ -36,7 +36,7 @@ public class TermFrequencyExtractor {
var plan = new CrawlPlanLoader().load(Path.of(args[0])); var plan = new CrawlPlanLoader().load(Path.of(args[0]));
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels())); ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
LanguageFilter lf = new LanguageFilter(); LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels());
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);

View File

@@ -26,6 +26,7 @@ download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache
download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
download_model model/lid.176.ftz https://s3-us-west-1.amazonaws.com/fasttext-vectors/supervised_models/lid.176.ftz
download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP

View File

@@ -153,7 +153,7 @@ dependencyResolutionManagement {
library('stanford.corenlp','edu.stanford.nlp','stanford-corenlp').version('4.4.0') library('stanford.corenlp','edu.stanford.nlp','stanford-corenlp').version('4.4.0')
library('opennlp','org.apache.opennlp','opennlp-tools').version('1.9.4') library('opennlp','org.apache.opennlp','opennlp-tools').version('1.9.4')
library('fasttext','com.github.vinhkhuc','jfasttext').version('0.5')
library('roaringbitmap','org.roaringbitmap','RoaringBitmap').version('0.9.32') library('roaringbitmap','org.roaringbitmap','RoaringBitmap').version('0.9.32')
library('opencsv','com.opencsv','opencsv').version('5.6') library('opencsv','com.opencsv','opencsv').version('5.6')
library('bucket4j','com.github.vladimir-bukhtoyarov','bucket4j-core').version('7.5.0') library('bucket4j','com.github.vladimir-bukhtoyarov','bucket4j-core').version('7.5.0')
@@ -184,7 +184,7 @@ dependencyResolutionManagement {
bundle('slf4j.test', ['slf4j.jdk14']) bundle('slf4j.test', ['slf4j.jdk14'])
bundle('prometheus', ['prometheus', 'prometheus-servlet', 'prometheus-server', 'prometheus-hotspot']) bundle('prometheus', ['prometheus', 'prometheus-servlet', 'prometheus-server', 'prometheus-hotspot'])
bundle('mariadb', ['mariadb-client', 'hikaricp']) bundle('mariadb', ['mariadb-client', 'hikaricp'])
bundle('nlp', ['stanford.corenlp', 'opennlp']) bundle('nlp', ['stanford.corenlp', 'opennlp', 'fasttext'])
bundle('selenium', ['selenium.chrome', 'selenium.java']) bundle('selenium', ['selenium.chrome', 'selenium.java'])
bundle('handlebars', ['handlebars', 'handlebars.markdown']) bundle('handlebars', ['handlebars', 'handlebars.markdown'])