(language-processing) Select the appropriate language filter

The incorrect filter was selected based on the provided parameter; this has been corrected.
This commit is contained in:
Viktor Lofgren 2024-07-19 12:22:32 +02:00
parent 22b35d5d91
commit b812e96c6d
3 changed files with 10 additions and 5 deletions

View File

@ -4,6 +4,7 @@ import com.github.jfasttext.JFastText;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.model.DocumentLanguageData;
/** A language prediction model that uses a FastText model to predict the language of a document */
public class FasttextLanguagePredictionModel implements LanguagePredictionModel {
private final JFastText jft = new JFastText();

View File

@ -1,5 +1,7 @@
package nu.marginalia.language.filter;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels;
import nu.marginalia.language.encoding.UnicodeRanges;
@ -8,8 +10,6 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.util.Optional;
import java.util.Set;
@ -31,10 +31,10 @@ public class LanguageFilter {
if(LANGUAGE_DETECTION_MODEL_VERSION < 0) return 1.0;
if (LANGUAGE_DETECTION_MODEL_VERSION == 1) {
return languagePredictionModel2.predictEnglish(dld);
return languagePredictionModel1.predictEnglish(dld);
}
else if (LANGUAGE_DETECTION_MODEL_VERSION == 2) {
return languagePredictionModel1.predictEnglish(dld);
return languagePredictionModel2.predictEnglish(dld);
}
else { // default is to run both models
if (languagePredictionModel1.predictEnglish(dld) < 0.1)

View File

@ -8,10 +8,14 @@ import java.util.HashSet;
import java.util.Objects;
import java.util.Set;
/** A simple language prediction model that uses a dictionary of English words
* and requires that a certain fraction of the words in the document be present in that
* dictionary for the document to be considered English.
* */
public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel {
private static final Set<String> englishWords = new HashSet<>();
public UngaBungaLanguagePredictionModel() throws Exception {
public UngaBungaLanguagePredictionModel() {
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
"Could not load word frequency table");
var br = new BufferedReader(new InputStreamReader(resource))