(language) fasttext based language filter
parent 4598c7f40f
commit 46d761f34f
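This commit replaces the hand-rolled English-dictionary heuristic with fastText's pretrained language-identification model (lid.176.ftz), loaded through the JFastText bindings, and threads a new LanguageModels path for it through the converter. A minimal sketch of the detection call the new model class relies on follows; the model path below is illustrative — in the commit it comes from LanguageModels.fasttextLanguageModel.

import com.github.jfasttext.JFastText;

class LanguageIdSketch {
    public static void main(String[] args) {
        JFastText jft = new JFastText();
        jft.loadModel("model/lid.176.ftz");   // illustrative path

        // predict() returns the best label, e.g. "__label__en" for English text
        String label = jft.predict("The quick brown fox jumps over the lazy dog.");
        System.out.println("__label__en".equals(label) ? "english" : "not english: " + label);
    }
}
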
@@ -10,13 +10,21 @@ public class LanguageModels {
     public final Path posRules;
     public final Path posDict;
     public final Path openNLPTokenData;
+    public final Path fasttextLanguageModel;
 
-    public LanguageModels(Path ngramBloomFilter, Path termFrequencies, Path openNLPSentenceDetectionData, Path posRules, Path posDict, Path openNLPTokenData) {
+    public LanguageModels(Path ngramBloomFilter,
+                          Path termFrequencies,
+                          Path openNLPSentenceDetectionData,
+                          Path posRules,
+                          Path posDict,
+                          Path openNLPTokenData,
+                          Path fasttextLanguageModel) {
         this.ngramBloomFilter = ngramBloomFilter;
         this.termFrequencies = termFrequencies;
         this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
         this.posRules = posRules;
         this.posDict = posDict;
         this.openNLPTokenData = openNLPTokenData;
+        this.fasttextLanguageModel = fasttextLanguageModel;
     }
 }

@@ -87,7 +87,8 @@ public class WmsaHome {
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),
                 home.resolve("model/English.DICT"),
-                home.resolve("model/opennlp-tok.bin"));
+                home.resolve("model/opennlp-tok.bin"),
+                home.resolve("model/lid.176.ftz"));
     }
 
     private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");

@@ -31,7 +31,8 @@ public class TestLanguageModels {
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
-                languageModelsHome.resolve("opennlp-tokens.bin")
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
         );
     }
 }

@@ -31,7 +31,8 @@ public class TestLanguageModels {
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
-                languageModelsHome.resolve("opennlp-tokens.bin")
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
         );
     }
 }

@@ -16,13 +16,15 @@ public class DocumentLanguageData {
     public final DocumentSentence[] sentences;
     public final DocumentSentence[] titleSentences;
     public final TObjectIntHashMap<String> wordCount;
+    public final String text;
 
     /** for test convenience */
     public static DocumentLanguageData empty() {
         return new DocumentLanguageData(
                 new DocumentSentence[0],
                 new DocumentSentence[0],
-                new TObjectIntHashMap<>()
+                new TObjectIntHashMap<>(),
+                ""
         );
     }
 

@@ -71,7 +71,7 @@ public class SentenceExtractor {
 
         TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
         var titleSentences = extractSentencesFromString(title.toLowerCase());
-        return new DocumentLanguageData(textSentences, titleSentences, counts);
+        return new DocumentLanguageData(textSentences, titleSentences, counts, text);
     }
 
     public DocumentLanguageData extractSentences(String text, String title) {

@@ -79,7 +79,7 @@ public class SentenceExtractor {
 
         TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
         var titleSentences = extractSentencesFromString(title.toLowerCase());
-        return new DocumentLanguageData(textSentences, titleSentences, counts);
+        return new DocumentLanguageData(textSentences, titleSentences, counts, text);
     }
 
     private String getTitle(Document doc, DocumentSentence[] textSentences) {

@@ -72,6 +72,7 @@ dependencies {
     implementation libs.zstd
 
     implementation libs.bundles.mariadb
+    implementation libs.bundles.nlp
 
     implementation libs.trove
     implementation libs.fastutil

@@ -0,0 +1,26 @@
+package nu.marginalia.converting.language;
+
+import com.github.jfasttext.JFastText;
+import nu.marginalia.LanguageModels;
+import nu.marginalia.language.model.DocumentLanguageData;
+
+public class FasttextLanguagePredictionModel implements LanguagePredictionModel {
+    private final JFastText jft = new JFastText();
+
+    public FasttextLanguagePredictionModel(LanguageModels lm) throws Exception {
+        jft.loadModel(lm.fasttextLanguageModel.toString());
+    }
+
+    @Override
+    public double predictEnglish(DocumentLanguageData dld) {
+        if ("__label__en".equals(jft.predict(dld.text))) {
+            return 1.0;
+        }
+        return 0.;
+    }
+
+    @Override
+    public boolean hasPoorAccuracy() {
+        return false;
+    }
+}

@@ -1,5 +1,7 @@
 package nu.marginalia.converting.language;
 
+import lombok.SneakyThrows;
+import nu.marginalia.LanguageModels;
 import nu.marginalia.language.encoding.UnicodeRanges;
 import nu.marginalia.language.model.DocumentLanguageData;
 import org.jsoup.nodes.Document;

@@ -8,10 +10,6 @@ import org.slf4j.LoggerFactory;
 
 import javax.inject.Inject;
 import javax.inject.Singleton;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.util.HashSet;
-import java.util.Objects;
 import java.util.Optional;
 import java.util.Set;
 

@@ -20,48 +18,29 @@ public class LanguageFilter {
 
     private static final Set<String> interestingLanguages = Set.of("en", "en-us", "en-gb", "eng", "english");
 
-    private static final Set<String> englishWords = new HashSet<>();
     private static final Logger logger = LoggerFactory.getLogger(LanguageFilter.class);
-    static {
-        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
-                "Could not load word frequency table");
-             var br = new BufferedReader(new InputStreamReader(resource))
-        ) {
-            for (;;) {
-                String s = br.readLine();
-                if (s == null) {
-                    break;
-                }
-                englishWords.add(s.toLowerCase());
-            }
-        }
-        catch (Exception ex) {
-            throw new RuntimeException(ex);
-        }
-
-    }
+    private final LanguagePredictionModel languagePredictionModel;
 
+    /** Returns the probability the language is in English */
     public double dictionaryAgreement(DocumentLanguageData dld) {
-        Set<String> seenWords = new HashSet<>();
-        int englishCount = 0;
-
-        for (var sent : dld.sentences) {
-            for (var word : sent.wordsLowerCase) {
-                if (seenWords.add(word) && englishWords.contains(word)) {
-                    englishCount++;
-                }
-            }
-        }
-
-        double englishAgreement = englishCount / (double) Math.min(seenWords.size(), englishWords.size());
-
-        logger.debug("Agreement: {}", englishAgreement);
-
-        return englishAgreement;
+        return languagePredictionModel.predictEnglish(dld);
     }
 
     @Inject
-    public LanguageFilter() {
+    @SneakyThrows
+    public LanguageFilter(LanguageModels lm) {
+        try {
+            if (Boolean.getBoolean("disable-fasttext")) {
+                languagePredictionModel = new UngaBungaLanguagePredictionModel();
+            }
+            else {
+                languagePredictionModel = new FasttextLanguagePredictionModel(lm);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
     }
 
     public Optional<Boolean> isPageInterestingByHtmlTag(Document parsed) {

@@ -72,20 +51,17 @@ public class LanguageFilter {
                 .map(interestingLanguages::contains);
     }
 
-    public Optional<Boolean> isPageInterestingByMetaLanguage(Document parsed) {
-        return parsed.getElementsByTag("meta").stream().filter(elem -> "content-language".equalsIgnoreCase(elem.attr("http-equiv")))
-                .map(elem -> elem.attr("content"))
-                .filter(s -> !s.isBlank())
-                .map(String::toLowerCase)
-                .map(interestingLanguages::contains)
-                .findAny();
-    }
-
     public boolean isBlockedUnicodeRange(String data) {
+        if (!languagePredictionModel.hasPoorAccuracy()) {
+            return false;
+        }
+
         for (var range: UnicodeRanges.values()) {
             if (range.test(data))
                 return true;
         }
         return false;
     }
 
 }

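The filter now delegates to whichever LanguagePredictionModel was chosen at construction time; setting the disable-fasttext system property selects the dictionary fallback instead of fastText. A minimal usage sketch under that assumption — someDocumentLanguageData is a placeholder, not a name from this commit:

// Sketch: forcing the dictionary fallback via the system property that
// LanguageFilter's constructor reads with Boolean.getBoolean("disable-fasttext").
System.setProperty("disable-fasttext", "true");

LanguageFilter filter = new LanguageFilter(WmsaHome.getLanguageModels());

// dictionaryAgreement() now just forwards to predictEnglish(); values near 1.0
// indicate the document is probably English.
double pEnglish = filter.dictionaryAgreement(someDocumentLanguageData);
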
@@ -0,0 +1,11 @@
+package nu.marginalia.converting.language;
+
+import nu.marginalia.language.model.DocumentLanguageData;
+
+public interface LanguagePredictionModel {
+    /** Returns the probability the language is in English */
+    double predictEnglish(DocumentLanguageData dld);
+
+    boolean hasPoorAccuracy();
+
+}

@@ -0,0 +1,51 @@
+package nu.marginalia.converting.language;
+
+import nu.marginalia.language.model.DocumentLanguageData;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Set;
+
+public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel {
+    private static final Set<String> englishWords = new HashSet<>();
+
+    public UngaBungaLanguagePredictionModel() throws Exception {
+        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
+                "Could not load word frequency table");
+             var br = new BufferedReader(new InputStreamReader(resource))
+        ) {
+            for (;;) {
+                String s = br.readLine();
+                if (s == null) {
+                    break;
+                }
+                englishWords.add(s.toLowerCase());
+            }
+        }
+        catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+
+    @Override
+    public double predictEnglish(DocumentLanguageData dld) {
+        Set<String> seenWords = new HashSet<>();
+        int englishCount = 0;
+
+        for (var sent : dld.sentences) {
+            for (var word : sent.wordsLowerCase) {
+                if (seenWords.add(word) && englishWords.contains(word)) {
+                    englishCount++;
+                }
+            }
+        }
+
+        return englishCount / (double) Math.min(seenWords.size(), englishWords.size());
+    }
+
+    @Override
+    public boolean hasPoorAccuracy() {
+        return true;
+    }
+}

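The fallback model scores a document by the share of its distinct words that appear in the bundled dictionary/en-1000 frequency list. A worked illustration of that ratio, with invented counts:

// Illustration only; the counts are made up.
int englishCount = 20;       // distinct document words found in dictionary/en-1000
int distinctWords = 50;      // distinct words seen in the document
int dictionarySize = 1000;   // size of the English frequency list
double agreement = englishCount / (double) Math.min(distinctWords, dictionarySize);
// agreement == 0.4, i.e. 40% dictionary agreement
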
@@ -17,7 +17,10 @@ import java.net.URISyntaxException;
 import java.util.*;
 
 public abstract class AbstractDocumentProcessorPlugin {
-    protected LanguageFilter languageFilter = new LanguageFilter();
+    protected LanguageFilter languageFilter;
+    public AbstractDocumentProcessorPlugin(LanguageFilter languageFilter) {
+        this.languageFilter = languageFilter;
+    }
 
     public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException;
     public abstract boolean isApplicable(CrawledDocument doc);

@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import nu.marginalia.converting.language.LanguageFilter;
 import nu.marginalia.converting.model.GeneratorType;
 import nu.marginalia.converting.processor.MetaRobotsTag;
 import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor;

@@ -64,6 +65,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     @Inject
     public HtmlDocumentProcessorPlugin(
             @Named("min-document-quality") Double minDocumentQuality,
+            LanguageFilter languageFilter,
             SentenceExtractor sentenceExtractor,
             FeatureExtractor featureExtractor,
             TitleExtractor titleExtractor,

|
|||||||
DocumentGeneratorExtractor documentGeneratorExtractor,
|
DocumentGeneratorExtractor documentGeneratorExtractor,
|
||||||
HtmlProcessorSpecializations specializations)
|
HtmlProcessorSpecializations specializations)
|
||||||
{
|
{
|
||||||
|
super(languageFilter);
|
||||||
|
|
||||||
this.documentLengthLogic = documentLengthLogic;
|
this.documentLengthLogic = documentLengthLogic;
|
||||||
this.minDocumentQuality = minDocumentQuality;
|
this.minDocumentQuality = minDocumentQuality;
|
||||||
this.sentenceExtractor = sentenceExtractor;
|
this.sentenceExtractor = sentenceExtractor;
|
||||||
|
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor.plugin;
 
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import nu.marginalia.converting.language.LanguageFilter;
 import nu.marginalia.converting.processor.logic.DocumentLengthLogic;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;

@@ -38,11 +39,13 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
 
     @Inject
     public PlainTextDocumentProcessorPlugin(@Named("max-title-length") Integer maxTitleLength,
+                                            LanguageFilter languageFilter,
                                             SentenceExtractor sentenceExtractor,
                                             DocumentKeywordExtractor keywordExtractor,
                                             DocumentLengthLogic documentLengthLogic
     )
     {
+        super(languageFilter);
         this.documentLengthLogic = documentLengthLogic;
         this.maxTitleLength = maxTitleLength;
         this.sentenceExtractor = sentenceExtractor;

@@ -1,5 +1,6 @@
 package nu.marginalia.converting.language;
 
+import nu.marginalia.converting.util.TestLanguageModels;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;
 

@@ -10,16 +11,10 @@ class LanguageFilterTest {
 
     @Test
    void isPageInteresting() {
-        var languageFilter = new LanguageFilter();
+        var languageFilter = new LanguageFilter(TestLanguageModels.getLanguageModels());
         assertTrue(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html></html>")).orElse(true));
         assertTrue(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html lang=\"en\"></html>")).orElse(false));
         assertFalse(languageFilter.isPageInterestingByHtmlTag(Jsoup.parse("<html lang=\"no\"></html>")).orElse(false));
     }
 
-    @Test
-    public void isStringChinese() {
-        var languageFilter = new LanguageFilter();
-        assertTrue(languageFilter.isBlockedUnicodeRange("溶岩ドームの手前に広がる斜面(木が生えているところ)は普賢岳の山体です.今回の噴火にともない,このあたりの山体がマグマに押されて変形し,北(写真では左)にむかって100mほどせりだしました\n"));
-    }
-
 }

@@ -0,0 +1,38 @@
+package nu.marginalia.converting.util;
+
+import nu.marginalia.LanguageModels;
+import nu.marginalia.WmsaHome;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Optional;
+
+public class TestLanguageModels {
+    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
+
+    public static Path getLanguageModelsPath() {
+        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
+                .map(Path::of)
+                .orElse(LANGUAGE_MODELS_DEFAULT);
+
+        if (!Files.isDirectory(languageModelsHome)) {
+            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
+        }
+        return languageModelsHome;
+    }
+
+    public static LanguageModels getLanguageModels() {
+
+        var languageModelsHome = getLanguageModelsPath();
+
+        return new LanguageModels(
+                languageModelsHome.resolve("ngrams.bin"),
+                languageModelsHome.resolve("tfreq-new-algo3.bin"),
+                languageModelsHome.resolve("opennlp-sentence.bin"),
+                languageModelsHome.resolve("English.RDR"),
+                languageModelsHome.resolve("English.DICT"),
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
+        );
+    }
+}

@@ -31,7 +31,8 @@ public class TestLanguageModels {
                 languageModelsHome.resolve("opennlp-sentence.bin"),
                 languageModelsHome.resolve("English.RDR"),
                 languageModelsHome.resolve("English.DICT"),
-                languageModelsHome.resolve("opennlp-tokens.bin")
+                languageModelsHome.resolve("opennlp-tokens.bin"),
+                languageModelsHome.resolve("lid.176.ftz")
         );
     }
 }

@@ -36,7 +36,7 @@ public class TermFrequencyExtractor {
         var plan = new CrawlPlanLoader().load(Path.of(args[0]));
 
         ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
-        LanguageFilter lf = new LanguageFilter();
+        LanguageFilter lf = new LanguageFilter(WmsaHome.getLanguageModels());
 
         TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
 

@@ -26,6 +26,7 @@ download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache
 download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin
 download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin
+download_model model/lid.176.ftz https://s3-us-west-1.amazonaws.com/fasttext-vectors/supervised_models/lid.176.ftz
 
 download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
 unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP

@@ -153,7 +153,8 @@ dependencyResolutionManagement {
 
         library('stanford.corenlp','edu.stanford.nlp','stanford-corenlp').version('4.4.0')
         library('opennlp','org.apache.opennlp','opennlp-tools').version('1.9.4')
+        library('fasttext','com.github.vinhkhuc','jfasttext').version('0.5')
         library('roaringbitmap','org.roaringbitmap','RoaringBitmap').version('0.9.32')
         library('opencsv','com.opencsv','opencsv').version('5.6')
         library('bucket4j','com.github.vladimir-bukhtoyarov','bucket4j-core').version('7.5.0')

@@ -184,7 +184,7 @@ dependencyResolutionManagement {
         bundle('slf4j.test', ['slf4j.jdk14'])
         bundle('prometheus', ['prometheus', 'prometheus-servlet', 'prometheus-server', 'prometheus-hotspot'])
         bundle('mariadb', ['mariadb-client', 'hikaricp'])
-        bundle('nlp', ['stanford.corenlp', 'opennlp'])
+        bundle('nlp', ['stanford.corenlp', 'opennlp', 'fasttext'])
         bundle('selenium', ['selenium.chrome', 'selenium.java'])
         bundle('handlebars', ['handlebars', 'handlebars.markdown'])