mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(language-processing) Select the appropriate language filter
The incorrect filter was selected based on the provided parameter; this has been corrected.
This commit is contained in:
parent
22b35d5d91
commit
b812e96c6d
@ -4,6 +4,7 @@ import com.github.jfasttext.JFastText;
|
|||||||
import nu.marginalia.LanguageModels;
|
import nu.marginalia.LanguageModels;
|
||||||
import nu.marginalia.language.model.DocumentLanguageData;
|
import nu.marginalia.language.model.DocumentLanguageData;
|
||||||
|
|
||||||
|
/** A language prediction model that uses a FastText model to predict the language of a document */
|
||||||
public class FasttextLanguagePredictionModel implements LanguagePredictionModel {
|
public class FasttextLanguagePredictionModel implements LanguagePredictionModel {
|
||||||
private final JFastText jft = new JFastText();
|
private final JFastText jft = new JFastText();
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package nu.marginalia.language.filter;
|
package nu.marginalia.language.filter;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.LanguageModels;
|
import nu.marginalia.LanguageModels;
|
||||||
import nu.marginalia.language.encoding.UnicodeRanges;
|
import nu.marginalia.language.encoding.UnicodeRanges;
|
||||||
@ -8,8 +10,6 @@ import org.jsoup.nodes.Document;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
@ -31,10 +31,10 @@ public class LanguageFilter {
|
|||||||
if(LANGUAGE_DETECTION_MODEL_VERSION < 0) return 1.0;
|
if(LANGUAGE_DETECTION_MODEL_VERSION < 0) return 1.0;
|
||||||
|
|
||||||
if (LANGUAGE_DETECTION_MODEL_VERSION == 1) {
|
if (LANGUAGE_DETECTION_MODEL_VERSION == 1) {
|
||||||
return languagePredictionModel2.predictEnglish(dld);
|
return languagePredictionModel1.predictEnglish(dld);
|
||||||
}
|
}
|
||||||
else if (LANGUAGE_DETECTION_MODEL_VERSION == 2) {
|
else if (LANGUAGE_DETECTION_MODEL_VERSION == 2) {
|
||||||
return languagePredictionModel1.predictEnglish(dld);
|
return languagePredictionModel2.predictEnglish(dld);
|
||||||
}
|
}
|
||||||
else { // default is to run both models
|
else { // default is to run both models
|
||||||
if (languagePredictionModel1.predictEnglish(dld) < 0.1)
|
if (languagePredictionModel1.predictEnglish(dld) < 0.1)
|
||||||
|
@ -8,10 +8,14 @@ import java.util.HashSet;
|
|||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
/** A simple language prediction model that uses a dictionary of English words
|
||||||
|
* and requires that a certain fraction of the words in the document be present in that
|
||||||
|
* dictionary for the document to be considered English.
|
||||||
|
* */
|
||||||
public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel {
|
public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel {
|
||||||
private static final Set<String> englishWords = new HashSet<>();
|
private static final Set<String> englishWords = new HashSet<>();
|
||||||
|
|
||||||
public UngaBungaLanguagePredictionModel() throws Exception {
|
public UngaBungaLanguagePredictionModel() {
|
||||||
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
|
try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
|
||||||
"Could not load word frequency table");
|
"Could not load word frequency table");
|
||||||
var br = new BufferedReader(new InputStreamReader(resource))
|
var br = new BufferedReader(new InputStreamReader(resource))
|
||||||
|
Loading…
Reference in New Issue
Block a user