mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00

(language) fasttext based language filter

parent 4598c7f40f
commit 46d761f34f
@@ -10,13 +10,21 @@ public class LanguageModels {
     public final Path posRules;
     public final Path posDict;
     public final Path openNLPTokenData;
+    public final Path fasttextLanguageModel;

-    public LanguageModels(Path ngramBloomFilter, Path termFrequencies, Path openNLPSentenceDetectionData, Path posRules, Path posDict, Path openNLPTokenData) {
+    public LanguageModels(Path ngramBloomFilter,
+                          Path termFrequencies,
+                          Path openNLPSentenceDetectionData,
+                          Path posRules,
+                          Path posDict,
+                          Path openNLPTokenData,
+                          Path fasttextLanguageModel) {
         this.ngramBloomFilter = ngramBloomFilter;
         this.termFrequencies = termFrequencies;
         this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
         this.posRules = posRules;
         this.posDict = posDict;
         this.openNLPTokenData = openNLPTokenData;
+        this.fasttextLanguageModel = fasttextLanguageModel;
     }
 }
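Every construction site of LanguageModels must now supply a seventh path for the fasttext model. A minimal sketch of the widened constructor in use (the concrete paths here are illustrative; the real wiring is in the WmsaHome and TestLanguageModels hunks below):

    LanguageModels lm = new LanguageModels(
            Path.of("model/ngrams.bin"),           // ngram bloom filter
            Path.of("model/tfreq-new-algo3.bin"),  // term frequencies
            Path.of("model/opennlp-sentence.bin"), // sentence detection
            Path.of("model/English.RDR"),          // POS rules
            Path.of("model/English.DICT"),         // POS dictionary
            Path.of("model/opennlp-tok.bin"),      // tokenizer
            Path.of("model/lid.176.ftz"));         // new: fasttext language id model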
@@ -87,7 +87,8 @@ public class WmsaHome {
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),
                 home.resolve("model/English.DICT"),
-                home.resolve("model/opennlp-tok.bin"));
+                home.resolve("model/opennlp-tok.bin"),
+                home.resolve("model/lid.176.ftz"));
     }

     private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
@@ -31,7 +31,8 @@ public class TestLanguageModels {
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
-                languageModelsHome.resolve("opennlp-tokens.bin")
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
         );
     }
 }

@@ -31,7 +31,8 @@ public class TestLanguageModels {
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
-                languageModelsHome.resolve("opennlp-tokens.bin")
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
         );
     }
 }
@@ -16,13 +16,15 @@ public class DocumentLanguageData {
     public final DocumentSentence[] sentences;
     public final DocumentSentence[] titleSentences;
     public final TObjectIntHashMap<String> wordCount;
+    public final String text;

     /** for test convenience */
     public static DocumentLanguageData empty() {
         return new DocumentLanguageData(
                 new DocumentSentence[0],
                 new DocumentSentence[0],
-                new TObjectIntHashMap<>()
+                new TObjectIntHashMap<>(),
+                ""
         );
     }
@@ -71,7 +71,7 @@ public class SentenceExtractor {

         TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
         var titleSentences = extractSentencesFromString(title.toLowerCase());
-        return new DocumentLanguageData(textSentences, titleSentences, counts);
+        return new DocumentLanguageData(textSentences, titleSentences, counts, text);
     }

     public DocumentLanguageData extractSentences(String text, String title) {
@@ -79,7 +79,7 @@ public class SentenceExtractor {

         TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
         var titleSentences = extractSentencesFromString(title.toLowerCase());
-        return new DocumentLanguageData(textSentences, titleSentences, counts);
+        return new DocumentLanguageData(textSentences, titleSentences, counts, text);
     }

     private String getTitle(Document doc, DocumentSentence[] textSentences) {
@@ -72,6 +72,7 @@ dependencies {
     implementation libs.zstd

     implementation libs.bundles.mariadb
+    implementation libs.bundles.nlp

     implementation libs.trove
     implementation libs.fastutil
@@ -0,0 +1,26 @@
+package nu.marginalia.converting.language;
+
+import com.github.jfasttext.JFastText;
+import nu.marginalia.LanguageModels;
+import nu.marginalia.language.model.DocumentLanguageData;
+
+public class FasttextLanguagePredictionModel implements LanguagePredictionModel {
+    private final JFastText jft = new JFastText();
+
+    public FasttextLanguagePredictionModel(LanguageModels lm) throws Exception {
+        jft.loadModel(lm.fasttextLanguageModel.toString());
+    }
+
+    @Override
+    public double predictEnglish(DocumentLanguageData dld) {
+        if ("__label__en".equals(jft.predict(dld.text))) {
+            return 1.0;
+        }
+        return 0.;
+    }
+
+    @Override
+    public boolean hasPoorAccuracy() {
+        return false;
+    }
+}
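jft.predict() returns fasttext's single best label as a string, so predictEnglish() is effectively binary: 1.0 when lid.176.ftz answers "__label__en", otherwise 0.0. A standalone sketch of the same JFastText calls, with an assumed model path:

    JFastText jft = new JFastText();
    jft.loadModel("model/lid.176.ftz");   // fetched by the download script below

    // lid.176.ftz is fasttext's 176-language identification model;
    // labels come back in the form "__label__xx"
    String label = jft.predict("The quick brown fox jumps over the lazy dog");
    System.out.println(label);            // expected: __label__en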
@@ -1,5 +1,7 @@
 package nu.marginalia.converting.language;

+import lombok.SneakyThrows;
+import nu.marginalia.LanguageModels;
 import nu.marginalia.language.encoding.UnicodeRanges;
 import nu.marginalia.language.model.DocumentLanguageData;
 import org.jsoup.nodes.Document;
@@ -8,10 +10,6 @@ import org.slf4j.LoggerFactory;

 import javax.inject.Inject;
 import javax.inject.Singleton;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.util.HashSet;
-import java.util.Objects;
 import java.util.Optional;
 import java.util.Set;

@@ -20,48 +18,29 @@ public class LanguageFilter {

     private static final Set<String> interestingLanguages = Set.of("en", "en-us", "en-gb", "eng", "english");

-    private static final Set<String> englishWords = new HashSet<>();
     private static final Logger logger = LoggerFactory.getLogger(LanguageFilter.class);
-    static {
-        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
-                "Could not load word frequency table");
-             var br = new BufferedReader(new InputStreamReader(resource))
-        ) {
-            for (;;) {
-                String s = br.readLine();
-                if (s == null) {
-                    break;
-                }
-                englishWords.add(s.toLowerCase());
-            }
-        }
-        catch (Exception ex) {
-            throw new RuntimeException(ex);
-        }
-
-    }
+    private final LanguagePredictionModel languagePredictionModel;


     /** Returns the probability the language is in English */
     public double dictionaryAgreement(DocumentLanguageData dld) {
-        Set<String> seenWords = new HashSet<>();
-        int englishCount = 0;
-
-        for (var sent : dld.sentences) {
-            for (var word : sent.wordsLowerCase) {
-                if (seenWords.add(word) && englishWords.contains(word)) {
-                    englishCount++;
-                }
-            }
-        }
-
-        double englishAgreement = englishCount / (double) Math.min(seenWords.size(), englishWords.size());
-
-        logger.debug("Agreement: {}", englishAgreement);
-
-        return englishAgreement;
+        return languagePredictionModel.predictEnglish(dld);
     }

     @Inject
-    public LanguageFilter() {
+    @SneakyThrows
+    public LanguageFilter(LanguageModels lm) {
+        try {
+            if (Boolean.getBoolean("disable-fasttext")) {
+                languagePredictionModel = new UngaBungaLanguagePredictionModel();
+            }
+            else {
+                languagePredictionModel = new FasttextLanguagePredictionModel(lm);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
     }

     public Optional<Boolean> isPageInterestingByHtmlTag(Document parsed) {
@@ -72,20 +51,17 @@ public class LanguageFilter {
                 .map(interestingLanguages::contains);
     }

-    public Optional<Boolean> isPageInterestingByMetaLanguage(Document parsed) {
-        return parsed.getElementsByTag("meta").stream().filter(elem -> "content-language".equalsIgnoreCase(elem.attr("http-equiv")))
-                .map(elem -> elem.attr("content"))
-                .filter(s -> !s.isBlank())
-                .map(String::toLowerCase)
-                .map(interestingLanguages::contains)
-                .findAny();
-    }
-
     public boolean isBlockedUnicodeRange(String data) {
+        if (!languagePredictionModel.hasPoorAccuracy()) {
+            return false;
+        }
+
         for (var range: UnicodeRanges.values()) {
             if (range.test(data))
                 return true;
         }
         return false;
     }

 }
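dictionaryAgreement() keeps its old name but now just delegates to whichever LanguagePredictionModel was picked in the constructor; the dictionary heuristic lives on in UngaBungaLanguagePredictionModel below. A usage sketch (the property name is from the diff; the cut-off value is illustrative):

    // run with -Ddisable-fasttext=true to fall back to the dictionary model
    LanguageFilter filter = new LanguageFilter(WmsaHome.getLanguageModels());

    double english = filter.dictionaryAgreement(dld);  // dld: a DocumentLanguageData
    if (english < 0.1) {                               // illustrative threshold
        // treat the document as non-English
    }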
@@ -0,0 +1,11 @@
+package nu.marginalia.converting.language;
+
+import nu.marginalia.language.model.DocumentLanguageData;
+
+public interface LanguagePredictionModel {
+    /** Returns the probability the language is in English */
+    double predictEnglish(DocumentLanguageData dld);
+
+    boolean hasPoorAccuracy();
+
+}
@@ -0,0 +1,51 @@
+package nu.marginalia.converting.language;
+
+import nu.marginalia.language.model.DocumentLanguageData;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Set;
+
+public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel {
+    private static final Set<String> englishWords = new HashSet<>();
+
+    public UngaBungaLanguagePredictionModel() throws Exception {
+        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
+                "Could not load word frequency table");
+             var br = new BufferedReader(new InputStreamReader(resource))
+        ) {
+            for (;;) {
+                String s = br.readLine();
+                if (s == null) {
+                    break;
+                }
+                englishWords.add(s.toLowerCase());
+            }
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+    @Override
+    public double predictEnglish(DocumentLanguageData dld) {
+        Set<String> seenWords = new HashSet<>();
+        int englishCount = 0;
+
+        for (var sent : dld.sentences) {
+            for (var word : sent.wordsLowerCase) {
+                if (seenWords.add(word) && englishWords.contains(word)) {
+                    englishCount++;
+                }
+            }
+        }
+
+        return englishCount / (double) Math.min(seenWords.size(), englishWords.size());
+    }
+
+    @Override
+    public boolean hasPoorAccuracy() {
+        return true;
+    }
+}
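The fallback score is englishCount divided by min(distinct words seen, dictionary size); a worked example with hypothetical numbers:

    // a document with 400 distinct lowercase words, 120 of them among the
    // 1000 dictionary words:
    //     120 / Math.min(400, 1000) = 120 / 400.0 = 0.3
    // taking the min keeps short documents from being penalized for not
    // covering the whole dictionary; hasPoorAccuracy() returning true means
    // LanguageFilter keeps the unicode range check active for this model.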
@@ -17,7 +17,10 @@ import java.net.URISyntaxException;
 import java.util.*;

 public abstract class AbstractDocumentProcessorPlugin {
-    protected LanguageFilter languageFilter = new LanguageFilter();
+    protected LanguageFilter languageFilter;
+    public AbstractDocumentProcessorPlugin(LanguageFilter languageFilter) {
+        this.languageFilter = languageFilter;
+    }

     public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException;
     public abstract boolean isApplicable(CrawledDocument doc);
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;

 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import nu.marginalia.converting.language.LanguageFilter;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.processor.MetaRobotsTag;
 import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;
@@ -64,6 +65,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
     @Inject
     public HtmlDocumentProcessorPlugin(
             @Named("min-document-quality") Double minDocumentQuality,
+            LanguageFilter languageFilter,
             SentenceExtractor sentenceExtractor,
             FeatureExtractor featureExtractor,
             TitleExtractor titleExtractor,
@@ -74,6 +76,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {
             DocumentGeneratorExtractor documentGeneratorExtractor,
             HtmlProcessorSpecializations specializations)
     {
+        super(languageFilter);
+
         this.documentLengthLogic = documentLengthLogic;
         this.minDocumentQuality = minDocumentQuality;
         this.sentenceExtractor = sentenceExtractor;
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;

 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import nu.marginalia.converting.language.LanguageFilter;
 import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
@@ -38,11 +39,13 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin {

     @Inject
     public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
+                                            LanguageFilter languageFilter,
                                             SentenceExtractor sentenceExtractor,
                                             DocumentKeywordExtractor keywordExtractor,
                                             DocumentLengthLogic documentLengthLogic
                                             )
     {
+        super(languageFilter);
         this.documentLengthLogic = documentLengthLogic;
         this.maxTitleLength = maxTitleLength;
         this.sentenceExtractor = sentenceExtractor;
@@ -1,5 +1,6 @@
 package nu.marginalia.converting.language;

+import nu.marginalia.converting.util.TestLanguageModels;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;

@@ -10,16 +11,10 @@ class LanguageFilterTest {

     @Test
     void isPageInteresting() {
-        var languageFilter = new LanguageFilter();
+        var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels());
         assertTrue(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html></html>")).orElse(true));
         assertTrue(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html lang=\"en\"></html>")).orElse(false));
         assertFalse(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html lang=\"no\"></html>")).orElse(false));
     }

-    @Test
-    public void isStringChinese() {
-        var languageFilter = new LanguageFilter();
-        assertTrue(languageFilter.isBlockedUnicodeRange("溶岩ドームの手前に広がる斜面(木が生えているところ)は普賢岳の山体です.今回の噴火にともない,このあたりの山体がマグマに押されて変形し,北(写真では左)にむかって100mほどせりだしました\n"));
-    }
-
 }
@@ -0,0 +1,38 @@
+package nu.marginalia.converting.util;
+
+import nu.marginalia.LanguageModels;
+import nu.marginalia.WmsaHome;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Optional;
+
+public class TestLanguageModels {
+    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
+
+    public static Path getLanguageModelsPath() {
+        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
+                .map(Path::of)
+                .orElse(LANGUAGE_MODELS_DEFAULT);
+
+        if (!Files.isDirectory(languageModelsHome)) {
+            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
+        }
+        return languageModelsHome;
+    }
+
+    public static LanguageModels getLanguageModels() {
+
+        var languageModelsHome = getLanguageModelsPath();
+
+        return new LanguageModels(
+                languageModelsHome.resolve("ngrams.bin"),
+                languageModelsHome.resolve("tfreq-new-algo3.bin"),
+                languageModelsHome.resolve("opennlp-sentence.bin"),
+                languageModelsHome.resolve("English.RDR"),
+                languageModelsHome.resolve("English.DICT"),
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
+        );
+    }
+}
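Tests resolve models through this helper instead of hardcoding paths. A sketch of typical test wiring, assuming LANGUAGE_MODELS_HOME points at a directory populated by the download script below:

    var lm = TestLanguageModels.getLanguageModels();
    var filter = new LanguageFilter(lm);  // as in LanguageFilterTest above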
@@ -31,7 +31,8 @@ public class TestLanguageModels {
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
-                languageModelsHome.resolve("opennlp-tokens.bin")
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
         );
     }
 }
@@ -36,7 +36,7 @@ public class TermFrequencyExtractor {
         var plan = new CrawlPlanLoader().load(Path.of(args[0]));

         ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
-        LanguageFilter lf = new LanguageFilter();
+        LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels());

         TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);

@@ -26,6 +26,7 @@ download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache
 download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
 download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
+download_model model/lid.176.ftz https://s3-us-west-1.amazonaws.com/fasttext-vectors/supervised_models/lid.176.ftz

 download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
 unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP
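lid.176.ftz is the compressed variant of fasttext's language-identification model, well under a megabyte versus roughly 126 MB for the uncompressed lid.176.bin, at a small cost in accuracy.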
@@ -153,7 +153,7 @@ dependencyResolutionManagement {

         library('stanford.corenlp','edu.stanford.nlp','stanford-corenlp').version('4.4.0')
         library('opennlp','org.apache.opennlp','opennlp-tools').version('1.9.4')
-
+        library('fasttext','com.github.vinhkhuc','jfasttext').version('0.5')
         library('roaringbitmap','org.roaringbitmap','RoaringBitmap').version('0.9.32')
         library('opencsv','com.opencsv','opencsv').version('5.6')
         library('bucket4j','com.github.vladimir-bukhtoyarov','bucket4j-core').version('7.5.0')
@@ -184,7 +184,7 @@ dependencyResolutionManagement {
         bundle('slf4j.test', ['slf4j.jdk14'])
         bundle('prometheus', ['prometheus', 'prometheus-servlet', 'prometheus-server', 'prometheus-hotspot'])
         bundle('mariadb', ['mariadb-client', 'hikaricp'])
-        bundle('nlp', ['stanford.corenlp', 'opennlp'])
+        bundle('nlp', ['stanford.corenlp', 'opennlp', 'fasttext'])
         bundle('selenium', ['selenium.chrome', 'selenium.java'])
         bundle('handlebars', ['handlebars', 'handlebars.markdown'])