mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(language-processing) Select the appropriate language filter
The incorrect filter was selected based on the provided parameter; this has been corrected.
This commit is contained in:
parent
22b35d5d91
commit
b812e96c6d
@ -4,6 +4,7 @@ import com.github.jfasttext.JFastText;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
|
||||
/** A language prediction model that uses a FastText model to predict the language of a document */
|
||||
public class FasttextLanguagePredictionModel implements LanguagePredictionModel {
|
||||
private final JFastText jft = new JFastText();
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.language.filter;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.encoding.UnicodeRanges;
|
||||
@ -8,8 +10,6 @@ import org.jsoup.nodes.Document;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
@ -31,10 +31,10 @@ public class LanguageFilter {
|
||||
if(LANGUAGE_DETECTION_MODEL_VERSION < 0) return 1.0;
|
||||
|
||||
if (LANGUAGE_DETECTION_MODEL_VERSION == 1) {
|
||||
return languagePredictionModel2.predictEnglish(dld);
|
||||
return languagePredictionModel1.predictEnglish(dld);
|
||||
}
|
||||
else if (LANGUAGE_DETECTION_MODEL_VERSION == 2) {
|
||||
return languagePredictionModel1.predictEnglish(dld);
|
||||
return languagePredictionModel2.predictEnglish(dld);
|
||||
}
|
||||
else { // default is to run both models
|
||||
if (languagePredictionModel1.predictEnglish(dld) < 0.1)
|
||||
|
@ -8,10 +8,14 @@ import java.util.HashSet;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
/** A simple language prediction model that uses a dictionary of English words
|
||||
* and requires that a certain fraction of the words in the document be present in that
|
||||
* dictionary for the document to be considered English.
|
||||
* */
|
||||
public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel {
|
||||
private static final Set<String> englishWords = new HashSet<>();
|
||||
|
||||
public UngaBungaLanguagePredictionModel() throws Exception {
|
||||
public UngaBungaLanguagePredictionModel() {
|
||||
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
|
||||
"Could not load word frequency table");
|
||||
var br = new BufferedReader(new InputStreamReader(resource))
|
||||
|
Loading…
Reference in New Issue
Block a user