Added test util for the tests to remove hard coding of LanguageModels.

This commit is contained in:
vlofgren 2022-05-19 18:05:10 +02:00
parent c24b978c51
commit 74ae97f8f4
16 changed files with 74 additions and 113 deletions

15
doc/language-models.md Normal file
View File

@ -0,0 +1,15 @@
# Language Models
## For Tests
Many tests require language models to work.
Download them from [https://downloads.marginalia.nu/](https://downloads.marginalia.nu/)
and put them in a directory of your choice. Then set the environment
variable `LANGUAGE_MODELS_HOME` to point to this directory.
Alternatively, patch `nu.marginalia.util.TestLanguageModels` to
default to where you've put them.
## For Production
To be written.

View File

@ -0,0 +1,31 @@
package nu.marginalia.util;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;
/**
 * Test helper that locates the language model files on disk so that individual
 * tests do not have to hard-code paths to them.
 *
 * <p>The model directory is taken from the {@code LANGUAGE_MODELS_HOME} environment
 * variable. When that variable is unset or blank, {@code LANGUAGE_MODELS_DEFAULT}
 * is used instead. See doc/language-models.md.
 */
public class TestLanguageModels {
    /** Fallback model directory used when {@code LANGUAGE_MODELS_HOME} is not usable. */
    private static final Path LANGUAGE_MODELS_DEFAULT = Path.of("/home/vlofgren/Work/ngrams/");

    /**
     * Builds a {@link LanguageModels} from six model file paths resolved against
     * the model directory.
     *
     * @return a {@code LanguageModels} whose paths all point inside the model directory
     * @throws IllegalStateException if the model directory does not exist or is not a directory
     */
    public static LanguageModels getLanguageModels() {
        final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
                // An empty value would become Path.of(""), which denotes the working
                // directory and would slip past the directory check below; treat a
                // blank variable the same as an unset one.
                .filter(value -> !value.isBlank())
                .map(Path::of)
                .orElse(LANGUAGE_MODELS_DEFAULT);

        if (!Files.isDirectory(languageModelsHome)) {
            // Report the directory that was actually tried: it may be the default
            // rather than anything the caller configured.
            throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME (looked in '"
                    + languageModelsHome + "'), see doc/language-models.md");
        }

        return new LanguageModels(
                languageModelsHome.resolve("ngrams-generous-emstr.bin"),
                languageModelsHome.resolve("tfreq-generous-emstr.bin"),
                languageModelsHome.resolve("opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
                languageModelsHome.resolve("English.RDR"),
                languageModelsHome.resolve("English.DICT"),
                languageModelsHome.resolve("opennlp-tok.bin")
        );
    }
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.assistant.suggest;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
@ -14,14 +15,7 @@ class SuggestionsTest {
@BeforeAll
public static void setUp() {
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo3.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
suggestions = new Suggestions(Path.of("/home/vlofgren/Work/sql-titles-clean"),
new SpellChecker(), new NGramDict(lm));
}

View File

@ -4,6 +4,7 @@ import com.zaxxer.hikari.HikariDataSource;
import io.reactivex.rxjava3.exceptions.UndeliverableException;
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
@ -150,14 +151,7 @@ class DomainCrawlerTest {
languageFilter = new LanguageFilter();
var lm = new LanguageModels(
Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
);
var lm = TestLanguageModels.getLanguageModels();
var ke = new DocumentKeywordExtractor(new NGramDict(lm));
var se = new SentenceExtractor(lm);

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.crawler.domain;
import com.opencsv.exceptions.CsvValidationException;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.archive.client.ArchiveClient;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.LanguageFilter;
@ -42,14 +43,7 @@ class DomainCrawlerTest2 {
var ingress = new EdgeIndexTask(new EdgeDomain("memex.marginalia.nu"), 0, 10, 1.);
ingress.urls.add(new EdgeUrl("https://memex.marginalia.nu/"));
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-tok.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
var dict = new NGramDict(lm);
HtmlProcessor processor = new HtmlProcessor(new DocumentKeywordExtractor(dict),new SentenceExtractor(lm));

View File

@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.crawler.domain.language.processing;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.model.WordRep;
@ -28,14 +29,7 @@ import java.util.regex.Pattern;
class SentenceExtractorTest {
SentenceExtractor newSe;
SentenceExtractor legacySe;
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo4.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
@BeforeEach
public void setUp() {

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.crawler.domain.processor;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.DocumentKeywordExtractor;
@ -24,14 +25,7 @@ import java.util.List;
class HtmlProcessorTest {
Logger logger = LoggerFactory.getLogger(getClass());
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-sentence.bin"),
Path.of("/var/lib/wmsa/model/English.RDR"),
Path.of("/var/lib/wmsa/model/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-tok.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
HtmlProcessor processor = new HtmlProcessor(new DocumentKeywordExtractor(new NGramDict(lm)),new SentenceExtractor(lm));
@Test

View File

@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.index.service;
import com.opencsv.exceptions.CsvValidationException;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.configuration.server.Initialization;
@ -105,14 +106,7 @@ public class EdgeSearchTest {
static Initialization init = new Initialization();
private QueryParser parser;
private static NGramDict dict;
private static LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo3.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
private static LanguageModels lm = TestLanguageModels.getLanguageModels();
@SneakyThrows

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index.service;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.util.TestUtil;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
@ -69,15 +70,7 @@ public class EdgeSearchTestLocal {
static Initialization init = new Initialization();
private QueryParser parser;
private static NGramDict dict;
private static LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo3.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
private static LanguageModels lm = TestLanguageModels.getLanguageModels();
@SneakyThrows
@BeforeAll

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.integration.arxiv;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.DocumentKeywordExtractor;
@ -17,14 +18,7 @@ import static org.junit.jupiter.api.Assertions.*;
@Disabled // this isn't used and the test is hella slow
class ArxivParserTest {
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
@Test
void parse() throws IOException {

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.integration.stackoverflow;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.DocumentKeywordExtractor;
@ -16,14 +17,7 @@ import java.io.IOException;
import java.nio.file.Path;
public class StackOverflowPostsTest {
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo4.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
@Test
public void test() throws IOException, ParserConfigurationException, SAXException, InterruptedException {

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.integration.wikipedia;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.DocumentDebugger;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
@ -17,14 +18,7 @@ import java.io.IOException;
import java.nio.file.Path;
public class WikipediaTest {
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo4.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
@Test @SneakyThrows
public void test() {

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import org.junit.BeforeClass;
@ -15,14 +16,7 @@ class BodyQueryParserTest {
private QueryParser parser;
private static NGramDict dict;
private static EnglishDictionary englishDictionary;
private static LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo4.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
private static LanguageModels lm = TestLanguageModels.getLanguageModels();
@BeforeClass
public static void init() {

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import org.junit.jupiter.api.Test;
@ -12,14 +13,7 @@ class EnglishDictionaryTest {
@Test
void getWordVariants() {
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo4.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
var dict = new NGramDict(lm);
new EnglishDictionary(dict).getWordVariants("dos").forEach(System.out::println);

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import org.junit.BeforeClass;
@ -13,14 +14,7 @@ class QueryParserTest {
private QueryParser parser;
private static NGramDict dict;
private static EnglishDictionary englishDictionary;
private static LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo4.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
private static LanguageModels lm = TestLanguageModels.getLanguageModels();
@BeforeEach
public void setUp() {

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.crawler.domain.language.processing.SentenceExtractor;
@ -15,14 +16,7 @@ class QueryVariantsTest {
SentenceExtractor se;
@BeforeEach
public void setUp() {
LanguageModels lm = new LanguageModels(
Path.of("/home/vlofgren/Work/ngrams/ngrams-generous-emstr.bin"),
Path.of("/home/vlofgren/Work/ngrams/tfreq-new-algo4.bin"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
Path.of("/home/vlofgren/Work/ngrams/English.RDR"),
Path.of("/home/vlofgren/Work/ngrams/English.DICT"),
Path.of("/home/vlofgren/Work/ngrams/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin")
);
LanguageModels lm = TestLanguageModels.getLanguageModels();
se = new SentenceExtractor(lm);