diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
index 5ff1249a..20103f15 100644
--- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
+++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge;
 
 import nu.marginalia.util.test.TestUtil;
-import nu.marginalia.wmsa.configuration.ServiceDescriptor;
 import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Tag;
@@ -19,7 +18,6 @@ import org.testcontainers.junit.jupiter.Container;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.MountableFile;
 
-import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -28,7 +26,6 @@ import java.util.ArrayList;
 import java.util.List;
 
 import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
-import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL;
 
 @Tag("e2e")
 @Testcontainers
diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java
index ea18b2a8..03bc774e 100644
--- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java
+++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EncyclopediaE2ETest.java
@@ -3,11 +3,21 @@ package nu.marginalia.wmsa.edge;
 
 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
+import org.openqa.selenium.By;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.slf4j.LoggerFactory;
+import org.testcontainers.containers.BindMode;
+import org.testcontainers.containers.BrowserWebDriverContainer;
 import org.testcontainers.containers.GenericContainer;
-import org.testcontainers.containers.MariaDBContainer;
-import org.testcontainers.containers.Network;
+import org.testcontainers.containers.NginxContainer;
+import org.testcontainers.containers.output.Slf4jLogConsumer;
+import org.testcontainers.containers.wait.strategy.Wait;
 import org.testcontainers.junit.jupiter.Container;
 import org.testcontainers.junit.jupiter.Testcontainers;
+import org.testcontainers.utility.MountableFile;
+
+import java.nio.file.Path;
+import java.time.Duration;
 
 import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA;
 
@@ -19,9 +29,42 @@ public class EncyclopediaE2ETest extends E2ETestBase {
 
     @Container
     public GenericContainer<?> encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB);
 
+    @Container
+    public GenericContainer<?> encyclopediaLoader = new GenericContainer<>("openjdk:17-alpine")
+            .dependsOn(encyclopediaContainer)
+            .dependsOn(mariaDB)
+            .withNetwork(network)
+            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("encyclopedia-loader")))
+            .withCopyFileToContainer(jarFile(), "/WMSA.jar")
+            .withCopyFileToContainer(MountableFile.forClasspathResource("load-encyclopedia.sh"), "/load-encyclopedia.sh")
+            .withFileSystemBind(getModelData().toString(), "/data", BindMode.READ_ONLY)
+            .withCommand("sh", "load-encyclopedia.sh")
+            .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));
+
+    @Container
+    public NginxContainer<?> proxyNginx = new NginxContainer<>("nginx:stable")
+            .dependsOn(encyclopediaLoader)
+            .dependsOn(encyclopediaContainer)
+            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx")))
+            .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/encyclopedia.conf"), "/etc/nginx/conf.d/default.conf")
+            .withNetwork(network)
+            .withNetworkAliases("proxyNginx");
+
+    @Container
+    public BrowserWebDriverContainer<?> chrome = new BrowserWebDriverContainer<>()
+            .withNetwork(network)
+            .withCapabilities(new ChromeOptions());
+
+    private Path getModelData() {
+        return Path.of(System.getProperty("user.dir")).resolve("data/test");
+    }
 
     @Test
     public void run() {
+        var driver = chrome.getWebDriver();
+
+        driver.get("http://proxyNginx/wiki/Frog");
+        System.out.println(driver.getTitle());
+        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
     }
 }
diff --git a/marginalia_nu/src/e2e/resources/load-encyclopedia.sh b/marginalia_nu/src/e2e/resources/load-encyclopedia.sh
new file mode 100644
index 00000000..9700f0de
--- /dev/null
+++ b/marginalia_nu/src/e2e/resources/load-encyclopedia.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+mkdir -p /var/lib/wmsa/conf/
+mkdir -p /var/lib/wmsa/data/
+mkdir -p /data
+
+cat > /var/lib/wmsa/conf/db.properties <<EOF
+...
+EOF
+
+cat > /var/lib/wmsa/conf/hosts <<EOF
+...
+EOF
+
+java -cp /WMSA.jar nu.marginalia.wmsa.edge.tools.EncyclopediaLoaderTool ...
+
+echo "ALL DONE"
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java
new file mode 100644
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/EncyclopediaLoaderTool.java
@@ -0,0 +1,62 @@
+package nu.marginalia.wmsa.edge.tools;
+
+import nu.marginalia.wmsa.configuration.server.Context;
+import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
+import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
+import org.openzim.ZIMTypes.ZIMFile;
+import org.openzim.ZIMTypes.ZIMReader;
+
+import java.io.IOException;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Semaphore;
+
+public class EncyclopediaLoaderTool {
+
+    static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        convertAll(args);
+        System.exit(0);
+    }
+
+    private static void convertAll(String[] args) throws IOException, InterruptedException {
+        encyclopediaClient.setServiceRoute(args[0], Integer.parseInt(args[1]));
+
+        var zr = new ZIMReader(new ZIMFile(args[2]));
+
+        var pool = Executors.newFixedThreadPool(8);
+        var sem = new Semaphore(12);
+
+        zr.forEachArticles((url, art) -> {
+            if (art != null) {
+                try {
+                    sem.acquire();
+
+                    pool.execute(() -> {
+                        try {
+                            convert(url, art);
+                        } finally {
+                            sem.release();
+                        }
+                    });
+                } catch (InterruptedException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }, p -> true);
+
+        sem.acquire(12);
+
+        encyclopediaClient.close();
+    }
+
+    private static void convert(String url, String art) {
+        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
+
+        if (null != newData) {
+            encyclopediaClient.submitWiki(Context.internal(), url, newData)
+                    .retry(5)
+                    .blockingSubscribe();
+        }
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java
deleted file mode 100644
index 1b35dc12..00000000
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/ZimConverterMain.java
+++ /dev/null
@@ -1,211 +0,0 @@
-package nu.marginalia.wmsa.edge.tools;
-
-import lombok.AllArgsConstructor;
-import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
-import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
-import nu.marginalia.util.language.conf.LanguageModels;
-import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
-import org.jsoup.Jsoup;
-import org.openzim.ZIMTypes.ZIMFile;
-import org.openzim.ZIMTypes.ZIMReader;
-
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Map;
-import java.util.Objects;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.LinkedBlockingQueue;
-
-public class ZimConverterMain {
-
-    static final LinkedBlockingQueue<ConversionJob> jobQueue = new LinkedBlockingQueue<>(100);
-    static final LinkedBlockingQueue<String> analysisQueue = new LinkedBlockingQueue<>(100);
-    static boolean hasData = true;
-    static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
-    static NGramDict dict = new NGramDict(new LanguageModels(
-            Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
-            Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
-            Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
-            Path.of("/var/lib/wmsa/model/English.RDR"),
-            Path.of("/var/lib/wmsa/model/English.DICT"),
-            Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
-            )
-    );
-    public void extractUrlList() throws IOException {
-        var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
-
-        var urlList = zr.getURLListByURL();
-
-        try (PrintWriter pw = new PrintWriter(new FileOutputStream("/home/vlofgren/Work/wikiTitlesAndRedirects.sql"))) {
-            zr.forEachTitles(
-                    ae -> {
-                        pw.printf("INSERT INTO REF_WIKI_TITLE(NAME) VALUES (\"%s\");\n", ae.getUrl().replace("\\", "\\\\").replace("\"", "\\\""));
-                    },
-                    re -> {
-                        pw.printf("INSERT INTO REF_WIKI_TITLE(NAME, REF_NAME) VALUES (\"%s\",\"%s\");\n", re.getUrl().replace("\\", "\\\\").replace("\"", "\\\""), urlList.get(re.getRedirectIndex()).replace("\\", "\\\\").replace("\"", "\\\""));
-                    }
-            );
-        }
-    }
-
-    public static void main(String[] args) throws IOException {
-//        convertJust("Aleph_number");
-//        convertJust("Floyd–Steinberg_dithering");
-//        convertJust("Laplace's_equation");
-//        convertJust("John_Fahey");
-//        convertJust("Plotinus");
-//        convertJust("C++");
-        convertAll(args);
-        encyclopediaClient.close();
-    }
-
-    @SneakyThrows
-    private static void convertJust(String url) {
-        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url,
-                Files.readString(Path.of("/home/vlofgren/Work/wiki-convert/", "in-" + url + ".html")));
-        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
-    }
-
-    private static void extractOne(String which, int clusterId) throws IOException {
-//        var zr = new ZIMReader(new ZIMFile(args[1]));
-        var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
-
-        int[] cluster = new int[] { clusterId };
-        if (clusterId == -1) {
-            zr.forEachTitles(ae -> {
-                if (ae.getUrl().equals(which)) {
-                    System.err.print(ae.getUrl() + " " + ae.getClusterNumber());
-                    cluster[0] = ae.getClusterNumber();
-                }
-            }, re -> {
-            });
-        }
-
-        System.err.println("Extracting cluster " + cluster[0] );
-        if (cluster[0] == -1) {
-            return;
-        }
-        zr.forEachArticles((url, art) -> {
-            if (art != null) {
-                if (which.equals(url)) {
-                    try {
-                        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/","in-" + url + ".html"), art);
-                        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
-                        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
-                    } catch (IOException e) {
-                        e.printStackTrace();
-                    }
-
-                }
-                scheduleJob(url, art);
-            }
-        }, p -> p == cluster[0]);
-
-    }
-
-    private static void convertAll(String[] args) throws IOException {
-        encyclopediaClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
-        var zr = new ZIMReader(new ZIMFile(args[1]));
-//        var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
-
-        for (int i = 0; i < 8; i++) {
-            Thread t = new Thread(ZimConverterMain::jobExecutor);
-            t.setName("Converter");
-            t.start();
-
-            Thread t2 = new Thread(() -> {
-                for (; ; ) {
-                    String pt;
-                    try {
-                        pt = analysisQueue.take();
-                    } catch (InterruptedException e) {
-                        e.printStackTrace();
-                        return;
-                    }
-//                    var topic = new TopicWordExtractor().extractWords(pt);
-//                    var words = new NGramTextRankExtractor(dict, topic).extractWords(Collections.emptyList(), pt);
-//                    System.out.println(Strings.join(words, ','));
-                }
-            });
-            t2.setName("Analysis");
-            t2.start();
-        }
-
-        zr.forEachArticles((url, art) -> {
-            if (art != null) {
-                scheduleJob(url, art);
-            }
-        }, p -> true);
-
-        hasData = false;
-        encyclopediaClient.close();
-    }
-
-    @SneakyThrows
-    private static void jobExecutor() {
-        while (hasData || !jobQueue.isEmpty()) {
-            var job = jobQueue.take();
-            try {
-                job.convert();
-            }
-            catch (Exception ex) {
-                System.err.println("Error in " + job.url);
-                ex.printStackTrace();
-            }
-        }
-    }
-
-    @SneakyThrows
-    private static void scheduleJob(String url, String art) {
-        jobQueue.put(new ConversionJob(art, url));
-    }
-
-    static final Map<Long, Integer> wordCount = new ConcurrentHashMap<>();
-    static boolean isKeyword(String word) {
-
-        int limit = 100_000;
-        long n = word.chars().filter(c -> c=='_').count();
-        if (n == 0) limit = 2;
-        if (n == 1) limit = 1;
-        if (n == 2) limit = 1;
-        if (n >= 3) limit = 1;
-
-        long c = word.chars().filter(ch -> ch >= 'a' && ch <= 'z').count();
-        if (c-2 <= n) {
-            return false;
-        }
-        int hashA = word.hashCode();
-        int hashB = Objects.hash(n, c, word.length(), word.charAt(0));
-        long hash = (long) hashA + ((long) hashB << 32);
-
-        return wordCount.compute(hash, (k, v) -> v == null ? 1 : v+1) == limit;
-    }
-    @AllArgsConstructor
-    private static class ConversionJob {
-        private final String data;
-        private final String url;
-
-
-        public void convert() throws InterruptedException {
-            var page = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
-            String pt = Jsoup.parse(page).text();
-            analysisQueue.put(pt);
-
-            /*
-
-            String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
-
-
-            if (null != newData) {
-                archiveClient.submitWiki(Context.internal(), url, newData)
-                        .retry(5)
-                        .blockingSubscribe();
-
-            }*/
-        }
-    }
-}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java
index 7f84e6c1..ee364dcc 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaMain.java
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
 import nu.marginalia.wmsa.configuration.MainClass;
 import nu.marginalia.wmsa.configuration.ServiceDescriptor;
 import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 
 public class EncyclopediaMain extends MainClass {
     private final EncyclopediaService service;
@@ -15,6 +16,7 @@ public class EncyclopediaMain extends MainClass {
 
         Injector injector = Guice.createInjector(
                 new EncyclopediaModule(),
+                new DatabaseModule(),
                 new ConfigurationModule());
         injector.getInstance(EncyclopediaMain.class);
     }
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java
index 5f3b8519..15219021 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/encyclopedia/EncyclopediaService.java
@@ -62,6 +62,8 @@ public class EncyclopediaService extends Service {
 
         Spark.get("/wiki/has", this::pathWikiHas);
         Spark.post("/wiki/submit", this::pathWikiSubmit);
+
+        Spark.awaitInitialization();
     }
 
@@ -190,7 +192,6 @@ public class EncyclopediaService extends Service {
 
         Files.createDirectories(filename.getParent());
 
-        System.out.println(new String(data));
         logger.debug("Writing {} to {}", wikiUrl, filename);
 
         try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
index 1460375c..25df8be4 100644
--- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
+++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
@@ -254,4 +254,29 @@ CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, IND
 
 CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
 CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
-CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
\ No newline at end of file
+CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
+
+---;
+
+DROP TABLE IF EXISTS REF_DICTIONARY;
+
+CREATE TABLE IF NOT EXISTS REF_DICTIONARY(
+    TYPE VARCHAR(16),
+    WORD VARCHAR(255),
+    DEFINITION VARCHAR(255)
+)
+CHARACTER SET utf8mb4
+COLLATE utf8mb4_unicode_ci;
+
+CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD);
+
+CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE(
+    NAME VARCHAR(255),
+    NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)),
+    REF_NAME VARCHAR(255)
+)
+CHARACTER SET utf8mb4
+COLLATE utf8mb4_unicode_ci;
+
+CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
+CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);
\ No newline at end of file
diff --git a/marginalia_nu/src/main/resources/sql/reference-data.sql b/marginalia_nu/src/main/resources/sql/reference-data.sql
index 52d9abbb..733504ac 100644
--- a/marginalia_nu/src/main/resources/sql/reference-data.sql
+++ b/marginalia_nu/src/main/resources/sql/reference-data.sql
@@ -18,6 +18,5 @@ CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE(
 CHARACTER SET utf8mb4
 COLLATE utf8mb4_unicode_ci;
 
-
 CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
 CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);
\ No newline at end of file