mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00

Rewrote Encyclopedia loader, added functioning E2E test for new encyclopedia service

parent ad4521da9e
commit ac9064096d
@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge;

 import nu.marginalia.util.test.TestUtil;
-import nu.marginalia.wmsa.configuration.ServiceDescriptor;
 import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Tag;
@@ -19,7 +18,6 @@ import org.testcontainers.junit.jupiter.Container;
 import org.testcontainers.junit.jupiter.Testcontainers;
 import org.testcontainers.utility.MountableFile;

-import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -28,7 +26,6 @@ import java.util.ArrayList;
 import java.util.List;

 import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
-import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL;

 @Tag("e2e")
 @Testcontainers
EncyclopediaE2ETest.java
@@ -3,11 +3,21 @@ package nu.marginalia.wmsa.edge;

 import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
+import org.openqa.selenium.By;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.slf4j.LoggerFactory;
+import org.testcontainers.containers.BindMode;
+import org.testcontainers.containers.BrowserWebDriverContainer;
 import org.testcontainers.containers.GenericContainer;
-import org.testcontainers.containers.MariaDBContainer;
-import org.testcontainers.containers.Network;
+import org.testcontainers.containers.NginxContainer;
+import org.testcontainers.containers.output.Slf4jLogConsumer;
+import org.testcontainers.containers.wait.strategy.Wait;
 import org.testcontainers.junit.jupiter.Container;
 import org.testcontainers.junit.jupiter.Testcontainers;
+import org.testcontainers.utility.MountableFile;

+import java.nio.file.Path;
+import java.time.Duration;

 import static nu.marginalia.wmsa.configuration.ServiceDescriptor.ENCYCLOPEDIA;
@@ -19,9 +29,42 @@ public class EncyclopediaE2ETest extends E2ETestBase {

     @Container
     public GenericContainer<?> encyclopediaContainer = forService(ENCYCLOPEDIA, mariaDB);
+
+    @Container
+    public GenericContainer<?> encyclopediaLoader = new GenericContainer<>("openjdk:17-alpine")
+            .dependsOn(encyclopediaContainer)
+            .dependsOn(mariaDB)
+            .withNetwork(network)
+            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("encyclopedia-loader")))
+            .withCopyFileToContainer(jarFile(), "/WMSA.jar")
+            .withCopyFileToContainer(MountableFile.forClasspathResource("load-encyclopedia.sh"), "/load-encyclopedia.sh")
+            .withFileSystemBind(getModelData().toString(), "/data", BindMode.READ_ONLY)
+            .withCommand("sh", "load-encyclopedia.sh")
+            .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));
+
+    @Container
+    public NginxContainer<?> proxyNginx = new NginxContainer<>("nginx:stable")
+            .dependsOn(encyclopediaLoader)
+            .dependsOn(encyclopediaContainer)
+            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx")))
+            .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/encyclopedia.conf"), "/etc/nginx/conf.d/default.conf")
+            .withNetwork(network)
+            .withNetworkAliases("proxyNginx");
+
+    @Container
+    public BrowserWebDriverContainer<?> chrome = new BrowserWebDriverContainer<>()
+            .withNetwork(network)
+            .withCapabilities(new ChromeOptions());
+
+    private Path getModelData() {
+        return Path.of(System.getProperty("user.dir")).resolve("data/test");
+    }

     @Test
     public void run() {
+        var driver = chrome.getWebDriver();
+
+        driver.get("http://proxyNginx/wiki/Frog");
+        System.out.println(driver.getTitle());
+        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
     }
 }
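Container ordering in the test above hinges on the loader's final log line: encyclopediaLoader only counts as started once Testcontainers sees "ALL DONE" (which the script below prints last), and proxyNginx declares dependsOn(encyclopediaLoader). A minimal, self-contained sketch of that wait-strategy pattern, using a placeholder image and command rather than anything from this commit:

import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.wait.strategy.Wait;

import java.time.Duration;

public class LogWaitSketch {
    public static void main(String[] args) {
        // Placeholder image/command: a container that does some work, then
        // announces completion on stdout, just like load-encyclopedia.sh.
        try (GenericContainer<?> worker = new GenericContainer<>("alpine:3.19")
                .withCommand("sh", "-c", "echo 'ALL DONE'; sleep 60")
                // The container only counts as "started" once the log line
                // appears, so containers that dependOn() it start afterwards.
                .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1)
                        .withStartupTimeout(Duration.ofMinutes(10)))) {
            worker.start();
            System.out.println("loader finished, still running: " + worker.isRunning());
        }
    }
}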
marginalia_nu/src/e2e/resources/load-encyclopedia.sh (new file)
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+mkdir -p /var/lib/wmsa/conf/
+mkdir -p /var/lib/wmsa/data/
+mkdir -p /data
+
+cat > /var/lib/wmsa/conf/db.properties <<EOF
+db.user=wmsa
+db.pass=wmsa
+db.conn=jdbc:mariadb://mariadb:3306/WMSA_prod?rewriteBatchedStatements=true
+EOF
+
+cat > /var/lib/wmsa/conf/hosts <<EOF
+# service-name host-name
+resource-store resource-store
+renderer renderer
+auth auth
+api api
+smhi-scraper smhi-scraper
+podcast-scraper podcast-scraper
+edge-index edge-index
+edge-search edge-search
+encyclopedia encyclopedia
+edge-assistant edge-assistant
+memex memex
+dating dating
+EOF
+
+java -cp WMSA.jar nu.marginalia.wmsa.edge.tools.EncyclopediaLoaderTool data/wikipedia_en_100_nopic.zim
+
+echo "ALL DONE"
marginalia_nu/src/e2e/resources/nginx/encyclopedia.conf (new file)
@@ -0,0 +1,40 @@
+server {
+    listen 80;
+    listen [::]:80;
+    server_name nginx;
+
+    location /wiki/ {
+        rewrite ^ $request_uri;
+        rewrite ^/(.*) /public/$1 break;
+        return 400;
+        proxy_pass http://encyclopedia:5040$uri;
+
+        proxy_set_header X-Context $remote_addr-$connection;
+        proxy_set_header X-Public "1";
+        proxy_set_header X-Extern-Url $scheme://$host$request_uri;
+        proxy_set_header X-Extern-Domain $scheme://$host;
+        proxy_set_header X-User-Agent $http_user_agent;
+
+        tcp_nodelay on;
+    }
+
+    location /wiki-search {
+        rewrite ^ $request_uri;
+        rewrite ^/(.*) /public/$1 break;
+        return 400;
+        proxy_pass http://encyclopedia:5040$uri;
+
+        proxy_set_header X-Context $remote_addr-$connection;
+        proxy_set_header X-Public "1";
+        proxy_set_header X-Extern-Url $scheme://$host$request_uri;
+        proxy_set_header X-Extern-Domain $scheme://$host;
+        proxy_set_header X-User-Agent $http_user_agent;
+
+        tcp_nodelay on;
+    }
+
+    location / {
+        proxy_pass http://encyclopedia:5040/;
+        tcp_nodelay on;
+    }
+}
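The repeated rewrite pair is a standard nginx idiom for proxying without URL normalization: "rewrite ^ $request_uri;" swaps in the raw request URI, "rewrite ^/(.*) /public/$1 break;" prefixes /public/ and stops rewriting, and "proxy_pass http://encyclopedia:5040$uri" forwards the rewritten path as-is; the "return 400" is only reached if the prefix rewrite somehow fails to match. A hypothetical probe of that mapping from inside the test's Docker network (hostname and article taken from the E2E test; the probe itself is not part of this commit):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class ProxyProbe {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        // nginx rewrites /wiki/Frog to /public/wiki/Frog and proxies it
        // to encyclopedia:5040 with the raw request URI preserved.
        HttpRequest request = HttpRequest.newBuilder(
                URI.create("http://proxyNginx/wiki/Frog")).GET().build();
        HttpResponse<String> response =
                client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode());
    }
}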
EncyclopediaLoaderTool.java (new file)
@@ -0,0 +1,59 @@
+package nu.marginalia.wmsa.edge.tools;
+
+import nu.marginalia.wmsa.configuration.server.Context;
+import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
+import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
+import org.openzim.ZIMTypes.ZIMFile;
+import org.openzim.ZIMTypes.ZIMReader;
+
+import java.io.IOException;
+import java.util.concurrent.*;
+
+public class EncyclopediaLoaderTool {
+
+    static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
+
+    public static void main(String[] args) throws IOException, InterruptedException {
+        convertAll(args);
+        encyclopediaClient.close();
+        System.exit(0);
+    }
+
+    private static void convertAll(String[] args) throws IOException, InterruptedException {
+        var zr = new ZIMReader(new ZIMFile(args[0]));
+
+        var pool = Executors.newFixedThreadPool(8);
+        var sem = new Semaphore(12);
+
+        zr.forEachArticles((url, art) -> {
+            if (art != null) {
+                try {
+                    sem.acquire();
+
+                    pool.execute(() -> {
+                        try {
+                            convert(url, art);
+                        } finally {
+                            sem.release();
+                        }
+                    });
+                } catch (InterruptedException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }, p -> true);
+
+        sem.acquire(12);
+
+        encyclopediaClient.close();
+    }
+
+    private static void convert(String url, String art) {
+        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
+
+        if (null != newData) {
+            encyclopediaClient.submitWiki(Context.internal(), url, newData)
+                    .retry(5)
+                    .blockingSubscribe();
+        }
+    }
+}
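The new loader bounds its work queue with a Semaphore layered over a fixed thread pool: a permit is taken before each submission, released when the conversion finishes, and the closing sem.acquire(12) drains all twelve permits, acting as a barrier for in-flight work (the pool itself is never shut down; the tool relies on System.exit(0)). A minimal sketch of the same idiom with an illustrative task body:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

public class BoundedPoolSketch {
    private static final int PERMITS = 12;

    public static void main(String[] args) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(8);
        Semaphore sem = new Semaphore(PERMITS);

        for (int i = 0; i < 100; i++) {
            final int job = i;
            sem.acquire();               // blocks once 12 tasks are in flight
            pool.execute(() -> {
                try {
                    System.out.println("converting " + job);  // illustrative work
                } finally {
                    sem.release();       // always return the permit
                }
            });
        }

        sem.acquire(PERMITS);            // barrier: waits for every task to finish
        pool.shutdown();
    }
}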
ZimConverterMain.java (deleted)
@@ -1,211 +0,0 @@
-package nu.marginalia.wmsa.edge.tools;
-
-import lombok.AllArgsConstructor;
-import lombok.SneakyThrows;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
-import nu.marginalia.wmsa.edge.assistant.dict.WikiCleaner;
-import nu.marginalia.util.language.conf.LanguageModels;
-import nu.marginalia.wmsa.encyclopedia.EncyclopediaClient;
-import org.jsoup.Jsoup;
-import org.openzim.ZIMTypes.ZIMFile;
-import org.openzim.ZIMTypes.ZIMReader;
-
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Map;
-import java.util.Objects;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.LinkedBlockingQueue;
-
-public class ZimConverterMain {
-
-    static final LinkedBlockingQueue<ConversionJob> jobQueue = new LinkedBlockingQueue<>(100);
-    static final LinkedBlockingQueue<String> analysisQueue = new LinkedBlockingQueue<>(100);
-    static boolean hasData = true;
-    static final EncyclopediaClient encyclopediaClient = new EncyclopediaClient();
-    static NGramDict dict = new NGramDict(new LanguageModels(
-            Path.of("/var/lib/wmsa/model/ngrams-generous-emstr.bin"),
-            Path.of("/var/lib/wmsa/model/tfreq-generous-emstr.bin"),
-            Path.of("/var/lib/wmsa/model/opennlp-sentence.bin"),
-            Path.of("/var/lib/wmsa/model/English.RDR"),
-            Path.of("/var/lib/wmsa/model/English.DICT"),
-            Path.of("/var/lib/wmsa/model/opennlp-tok.bin")
-        )
-    );
-    public void extractUrlList() throws IOException {
-        var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
-
-        var urlList = zr.getURLListByURL();
-
-        try (PrintWriter pw = new PrintWriter(new FileOutputStream("/home/vlofgren/Work/wikiTitlesAndRedirects.sql"))) {
-            zr.forEachTitles(
-                    ae -> {
-                        pw.printf("INSERT INTO REF_WIKI_TITLE(NAME) VALUES (\"%s\");\n", ae.getUrl().replace("\\", "\\\\").replace("\"", "\\\""));
-                    },
-                    re -> {
-                        pw.printf("INSERT INTO REF_WIKI_TITLE(NAME, REF_NAME) VALUES (\"%s\",\"%s\");\n", re.getUrl().replace("\\", "\\\\").replace("\"", "\\\""), urlList.get(re.getRedirectIndex()).replace("\\", "\\\\").replace("\"", "\\\""));
-                    }
-            );
-        }
-    }
-
-    public static void main(String[] args) throws IOException {
-        // convertJust("Aleph_number");
-        // convertJust("Floyd–Steinberg_dithering");
-        // convertJust("Laplace's_equation");
-        // convertJust("John_Fahey");
-        // convertJust("Plotinus");
-        // convertJust("C++");
-        convertAll(args);
-        encyclopediaClient.close();
-    }
-
-    @SneakyThrows
-    private static void convertJust(String url) {
-        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url,
-                Files.readString(Path.of("/home/vlofgren/Work/wiki-convert/", "in-" + url + ".html")));
-        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
-    }
-
-    private static void extractOne(String which, int clusterId) throws IOException {
-        // var zr = new ZIMReader(new ZIMFile(args[1]));
-        var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
-
-        int[] cluster = new int[] { clusterId };
-        if (clusterId == -1) {
-            zr.forEachTitles(ae -> {
-                if (ae.getUrl().equals(which)) {
-                    System.err.print(ae.getUrl() + " " + ae.getClusterNumber());
-                    cluster[0] = ae.getClusterNumber();
-                }
-            }, re -> {
-            });
-        }
-
-        System.err.println("Extracting cluster " + cluster[0]);
-        if (cluster[0] == -1) {
-            return;
-        }
-        zr.forEachArticles((url, art) -> {
-            if (art != null) {
-                if (which.equals(url)) {
-                    try {
-                        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/","in-" + url + ".html"), art);
-                        String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, art);
-                        Files.writeString(Path.of("/home/vlofgren/Work/wiki-convert/", "out-" + url + ".html"), newData);
-                    } catch (IOException e) {
-                        e.printStackTrace();
-                    }
-
-                }
-                scheduleJob(url, art);
-            }
-        }, p -> p == cluster[0]);
-
-    }
-
-    private static void convertAll(String[] args) throws IOException {
-        encyclopediaClient.setServiceRoute("127.0.0.1", Integer.parseInt(args[0]));
-        var zr = new ZIMReader(new ZIMFile(args[1]));
-        // var zr = new ZIMReader(new ZIMFile("/home/vlofgren/Work/wikipedia_en_all_nopic_2021-01.zim"));
-
-        for (int i = 0; i < 8; i++) {
-            Thread t = new Thread(ZimConverterMain::jobExecutor);
-            t.setName("Converter");
-            t.start();
-
-            Thread t2 = new Thread(() -> {
-                for (; ; ) {
-                    String pt;
-                    try {
-                        pt = analysisQueue.take();
-                    } catch (InterruptedException e) {
-                        e.printStackTrace();
-                        return;
-                    }
-                    // var topic = new TopicWordExtractor().extractWords(pt);
-                    // var words = new NGramTextRankExtractor(dict, topic).extractWords(Collections.emptyList(), pt);
-                    // System.out.println(Strings.join(words, ','));
-                }
-            });
-            t2.setName("Analysis");
-            t2.start();
-        }
-
-        zr.forEachArticles((url, art) -> {
-            if (art != null) {
-                scheduleJob(url, art);
-            }
-        }, p -> true);
-
-        hasData = false;
-        encyclopediaClient.close();
-    }
-
-    @SneakyThrows
-    private static void jobExecutor() {
-        while (hasData || !jobQueue.isEmpty()) {
-            var job = jobQueue.take();
-            try {
-                job.convert();
-            }
-            catch (Exception ex) {
-                System.err.println("Error in " + job.url);
-                ex.printStackTrace();
-            }
-        }
-    }
-
-    @SneakyThrows
-    private static void scheduleJob(String url, String art) {
-        jobQueue.put(new ConversionJob(art, url));
-    }
-
-    static final Map<Long, Integer> wordCount = new ConcurrentHashMap<>();
-    static boolean isKeyword(String word) {
-
-        int limit = 100_000;
-        long n = word.chars().filter(c -> c=='_').count();
-        if (n == 0) limit = 2;
-        if (n == 1) limit = 1;
-        if (n == 2) limit = 1;
-        if (n >= 3) limit = 1;
-
-        long c = word.chars().filter(ch -> ch >= 'a' && ch <= 'z').count();
-        if (c-2 <= n) {
-            return false;
-        }
-        int hashA = word.hashCode();
-        int hashB = Objects.hash(n, c, word.length(), word.charAt(0));
-        long hash = (long) hashA + ((long) hashB << 32);
-
-        return wordCount.compute(hash, (k, v) -> v == null ? 1 : v+1) == limit;
-    }
-    @AllArgsConstructor
-    private static class ConversionJob {
-        private final String data;
-        private final String url;
-
-        public void convert() throws InterruptedException {
-            var page = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
-            String pt = Jsoup.parse(page).text();
-            analysisQueue.put(pt);
-
-            /*
-            String newData = new WikiCleaner().cleanWikiJunk("https://en.wikipedia.org/wiki/" + url, data);
-
-            if (null != newData) {
-                archiveClient.submitWiki(Context.internal(), url, newData)
-                        .retry(5)
-                        .blockingSubscribe();
-            }*/
-        }
-    }
-}
EncyclopediaMain.java
@@ -6,6 +6,7 @@ import com.google.inject.Injector;
 import nu.marginalia.wmsa.configuration.MainClass;
 import nu.marginalia.wmsa.configuration.ServiceDescriptor;
 import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;

 public class EncyclopediaMain extends MainClass {
     private final EncyclopediaService service;
@@ -15,6 +16,7 @@ public class EncyclopediaMain extends MainClass {

         Injector injector = Guice.createInjector(
                 new EncyclopediaModule(),
+                new DatabaseModule(),
                 new ConfigurationModule());
         injector.getInstance(EncyclopediaMain.class);
     }
EncyclopediaService.java
@@ -62,6 +62,8 @@ public class EncyclopediaService extends Service {
         Spark.get("/wiki/has", this::pathWikiHas);
         Spark.post("/wiki/submit", this::pathWikiSubmit);
+
+        Spark.awaitInitialization();
     }

@@ -190,7 +192,6 @@ public class EncyclopediaService extends Service {
         Files.createDirectories(filename.getParent());

-        System.out.println(new String(data));
         logger.debug("Writing {} to {}", wikiUrl, filename);

         try (var gos = new GZIPOutputStream(new FileOutputStream(filename.toFile()))) {
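Spark.awaitInitialization() blocks until the embedded Jetty server is actually accepting connections; without it, anything the service does after mapping routes, such as signaling readiness, can race the socket bind. A minimal sketch with an illustrative port, route, and readiness message (none of these are from this commit):

import spark.Spark;

public class ReadinessSketch {
    public static void main(String[] args) {
        Spark.port(5040);                       // illustrative port
        Spark.get("/wiki/has", (req, res) -> "ok");

        Spark.awaitInitialization();            // blocks until Jetty is listening
        System.out.println("service ready");    // now safe to signal readiness
    }
}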
@@ -254,4 +254,29 @@ CREATE INDEX IF NOT EXISTS EC_DOMAIN_TRIO ON EC_DOMAIN (STATE, DOMAIN_ALIAS, IND
 CREATE INDEX IF NOT EXISTS EC_URL_VISITED ON EC_URL (VISITED);
 CREATE INDEX IF NOT EXISTS EC_URL_VISITED_STATE ON EC_URL (VISITED, STATE);
 CREATE INDEX IF NOT EXISTS EC_URL_IP ON EC_URL (IP);
+
+---;
+
+DROP TABLE IF EXISTS REF_DICTIONARY;
+
+CREATE TABLE IF NOT EXISTS REF_DICTIONARY(
+    TYPE VARCHAR(16),
+    WORD VARCHAR(255),
+    DEFINITION VARCHAR(255)
+)
+CHARACTER SET utf8mb4
+COLLATE utf8mb4_unicode_ci;
+
+CREATE INDEX IF NOT EXISTS REF_DICTIONARY_WORD ON REF_DICTIONARY (WORD);
+
+CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE(
+    NAME VARCHAR(255),
+    NAME_LOWER VARCHAR(255) GENERATED ALWAYS AS (LOWER(NAME)),
+    REF_NAME VARCHAR(255)
+)
+CHARACTER SET utf8mb4
+COLLATE utf8mb4_unicode_ci;
+
+CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
+CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);
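NAME_LOWER is a generated column kept in sync by the database and indexed separately, which gives case-insensitive title lookups without a function-based index. A hypothetical JDBC lookup against it, reusing the connection settings from the E2E db.properties (the query shape is illustrative, not from this commit):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;

public class TitleLookupSketch {
    public static void main(String[] args) throws Exception {
        // Credentials and URL mirror the test's db.properties; a MariaDB
        // JDBC driver is assumed to be on the classpath.
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mariadb://mariadb:3306/WMSA_prod", "wmsa", "wmsa");
             PreparedStatement ps = conn.prepareStatement(
                     // matches via the REF_WIKI_LOWER index rather than a table scan
                     "SELECT NAME, REF_NAME FROM REF_WIKI_TITLE WHERE NAME_LOWER = LOWER(?)")) {
            ps.setString(1, "Frog");
            try (ResultSet rs = ps.executeQuery()) {
                while (rs.next()) {
                    System.out.println(rs.getString("NAME") + " -> " + rs.getString("REF_NAME"));
                }
            }
        }
    }
}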
@@ -18,6 +18,5 @@ CREATE TABLE IF NOT EXISTS REF_WIKI_TITLE(
 CHARACTER SET utf8mb4
 COLLATE utf8mb4_unicode_ci;

-
 CREATE INDEX IF NOT EXISTS REF_WIKI_LOWER ON REF_WIKI_TITLE (NAME_LOWER);
 CREATE INDEX IF NOT EXISTS REF_WIKI_NAME ON REF_WIKI_TITLE (NAME);