diff --git a/code/libraries/language-processing/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java b/code/libraries/language-processing/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java index 5eca3c76..60a4ac87 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java @@ -4,6 +4,7 @@ import com.github.jfasttext.JFastText; import nu.marginalia.LanguageModels; import nu.marginalia.language.model.DocumentLanguageData; +/** A language prediction model that uses a FastText model to predict the language of a document */ public class FasttextLanguagePredictionModel implements LanguagePredictionModel { private final JFastText jft = new JFastText(); diff --git a/code/libraries/language-processing/java/nu/marginalia/language/filter/LanguageFilter.java b/code/libraries/language-processing/java/nu/marginalia/language/filter/LanguageFilter.java index bf390e45..12dd45f9 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/filter/LanguageFilter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/filter/LanguageFilter.java @@ -1,5 +1,7 @@ package nu.marginalia.language.filter; +import com.google.inject.Inject; +import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; import nu.marginalia.language.encoding.UnicodeRanges; @@ -8,8 +10,6 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.inject.Inject; -import com.google.inject.Singleton; import java.util.Optional; import java.util.Set; @@ -31,10 +31,10 @@ public class LanguageFilter { if(LANGUAGE_DETECTION_MODEL_VERSION < 0) return 1.0; if (LANGUAGE_DETECTION_MODEL_VERSION == 1) { - return languagePredictionModel2.predictEnglish(dld); + return 
languagePredictionModel1.predictEnglish(dld); } else if (LANGUAGE_DETECTION_MODEL_VERSION == 2) { - return languagePredictionModel1.predictEnglish(dld); + return languagePredictionModel2.predictEnglish(dld); } else { // default is to run both models if (languagePredictionModel1.predictEnglish(dld) < 0.1) diff --git a/code/libraries/language-processing/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java b/code/libraries/language-processing/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java index 8b3c4567..b27c1aaf 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java @@ -8,10 +8,14 @@ import java.util.HashSet; import java.util.Objects; import java.util.Set; +/** A simple language prediction model that uses a dictionary of English words + * and requires that a certain fraction of the words in the document be present in that + * dictionary for the document to be considered English. + * */ public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel { private static final Set englishWords = new HashSet<>(); - public UngaBungaLanguagePredictionModel() throws Exception { + public UngaBungaLanguagePredictionModel() { try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"), "Could not load word frequency table"); var br = new BufferedReader(new InputStreamReader(resource))