diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle
index 522d81b6..2fecceae 100644
--- a/marginalia_nu/build.gradle
+++ b/marginalia_nu/build.gradle
@@ -3,6 +3,7 @@ plugins {
id "io.freefair.lombok" version "5.3.3.3"
id "me.champeau.jmh" version "0.6.6"
+ id "de.undercouch.download" version "5.1.0"
}
repositories {
@@ -24,6 +25,19 @@ repositories {
}
}
+// Declares the e2eTest source set: Java sources from src/e2e/java, resources from
+// src/e2e/resources, with the main and test outputs added to both classpaths.
+sourceSets {
+ e2eTest {
+ java {
+ // NOTE(review): the nested java { java { ... } } looks redundant — confirm it is intentional.
+ java {
+ compileClasspath += main.output + test.output
+ runtimeClasspath += main.output + test.output
+ srcDir file('src/e2e/java')
+ }
+ resources.srcDir file('src/e2e/resources')
+ }
+ }
+}
+
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(17))
@@ -33,16 +47,9 @@ java {
dependencies {
implementation project(':third_party')
- implementation 'junit:junit:4.13.2'
- testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
- testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
- implementation 'org.projectlombok:lombok:1.18.22'
- annotationProcessor 'org.projectlombok:lombok:1.18.22'
-
- testCompileOnly 'org.projectlombok:lombok:1.18.22'
- testImplementation 'org.projectlombok:lombok:1.18.22'
- testAnnotationProcessor 'org.projectlombok:lombok:1.18.22'
+ implementation 'org.projectlombok:lombok:1.18.24'
+ annotationProcessor 'org.projectlombok:lombok:1.18.24'
implementation 'com.github.jknack:handlebars:4.3.0'
implementation 'com.github.jknack:handlebars-markdown:4.2.1'
@@ -63,7 +70,7 @@ dependencies {
implementation 'com.google.guava:guava:31.1-jre'
implementation 'com.google.inject:guice:5.1.0'
- implementation 'com.github.jnr:jnr-ffi:2.1.1'
+ implementation 'com.github.jnr:jnr-ffi:2.2.12'
implementation 'org.apache.httpcomponents:httpcore:4.4.15'
implementation 'org.apache.httpcomponents:httpclient:4.5.13'
implementation 'com.github.ThatJavaNerd:JRAW:1.1.0'
@@ -74,29 +81,23 @@ dependencies {
implementation 'org.jsoup:jsoup:1.14.3'
implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2'
- implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.3'
+ implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.4'
implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3'
implementation 'com.zaxxer:HikariCP:5.0.1'
- implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
+ implementation 'org.apache.opennlp:opennlp-tools:1.9.3'
implementation 'io.prometheus:simpleclient:0.15.0'
implementation 'io.prometheus:simpleclient_servlet:0.15.0'
implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
- implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1'
- implementation 'org.apache.opennlp:opennlp-tools:1.9.4'
- implementation 'io.prometheus:simpleclient:0.15.0'
- implementation 'io.prometheus:simpleclient_servlet:0.15.0'
- implementation 'io.prometheus:simpleclient_httpserver:0.15.0'
- implementation 'io.prometheus:simpleclient_hotspot:0.15.0'
- implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1'
+ implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3'
implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'
implementation 'com.syncthemall:boilerpipe:1.2.2'
implementation 'com.github.luben:zstd-jni:1.5.2-2'
- implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.3.0'
+ implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0'
implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14'
implementation 'org.imgscalr:imgscalr-lib:4.2'
@@ -111,10 +112,33 @@ dependencies {
implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0'
implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8'
- implementation 'org.roaringbitmap:RoaringBitmap:[0.6,)'
+ implementation 'org.roaringbitmap:RoaringBitmap:0.9.27'
implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'
implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'
+
+ testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
+ testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
+ testCompileOnly 'org.projectlombok:lombok:1.18.24'
+ testImplementation 'org.projectlombok:lombok:1.18.24'
+ testAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
+
+ // Dependencies that are only needed by the e2eTest source set.
+ e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
+ e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
+ // Keep the annotation processor on the same Lombok version as every other configuration.
+ e2eTestImplementation 'org.projectlombok:lombok:1.18.24'
+ e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.24'
+ e2eTestImplementation 'org.testcontainers:mariadb:1.17.1'
+ e2eTestImplementation 'org.testcontainers:nginx:1.17.1'
+ e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1'
+ e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1"
+ e2eTestImplementation "org.testcontainers:selenium:1.17.1"
+ e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4'
+ e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4'
+}
+
+configurations {
+ // The e2e tests inherit every dependency declared for testImplementation.
+ e2eTestImplementation.extendsFrom(testImplementation)
+
}
test {
@@ -136,4 +160,51 @@ task dbTest(type: Test) {
}
}
+// Runs the JUnit Platform tests tagged "e2e" from the e2eTest source set.
+task e2eTest(type: Test) {
+ // A single forked JVM at a time, replaced after every test class.
+ maxParallelForks = 1
+ forkEvery = 1
+ maxHeapSize = "8G"
+ // The shadow jar and the downloaded test/model data must exist before the tests start.
+ dependsOn ':shadowJar'
+ dependsOn 'downloadTestData'
+ dependsOn 'downloadRDRModelData'
+ dependsOn 'downloadSentenceModelData'
+ dependsOn 'downloadTokenModelData'
+ dependsOn 'downloadTermFreqData'
+
+ classpath = sourceSets.e2eTest.runtimeClasspath
+ testClassesDirs = sourceSets.e2eTest.output.classesDirs
+ useJUnitPlatform {
+ includeTags "e2e"
+ }
+}
+
+// Fetches the ZIM archive used as crawl fodder by the e2e test; an existing file is kept.
+// NOTE(review): this is a plain-http download with no checksum check — consider https or verification.
+task downloadTestData(type: Download) {
+ src 'http://hammurabi.acc.umu.se/mirror/kiwix.org/zim/wikipedia/wikipedia_en_100_nopic_2022-05.zim'
+ dest file('data/test/wikipedia_en_100_nopic.zim')
+ overwrite false
+}
+
+// Fetches the English RDRPOSTagger dictionary and rule files into data/models/.
+task downloadRDRModelData(type: Download) {
+ src (['https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT',
+ 'https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR'])
+ dest file('data/models/')
+ overwrite false
+}
+
+// Fetches the OpenNLP sentence model and stores it as data/models/opennlp-sentence.bin.
+task downloadSentenceModelData(type: Download) {
+ src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin'
+ dest file('data/models/opennlp-sentence.bin')
+ overwrite false
+}
+// Fetches the OpenNLP token model and stores it as data/models/opennlp-tokens.bin.
+task downloadTokenModelData(type: Download) {
+ src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin'
+ dest file('data/models/opennlp-tokens.bin')
+ overwrite false
+}
+
+// Copies the term frequency data from a fixed absolute path on the build host into data/models.
+// NOTE(review): presumably this source file is absent on machines without a local WMSA install — confirm how the task behaves there.
+task downloadTermFreqData(type: Copy) {
+ // TODO: Need hosting for this file
+ from '/var/lib/wmsa/model/tfreq-new-algo3.bin'
+ into 'data/models'
+}
diff --git a/marginalia_nu/data/.gitignore b/marginalia_nu/data/.gitignore
new file mode 100644
index 00000000..f59ec20a
--- /dev/null
+++ b/marginalia_nu/data/.gitignore
@@ -0,0 +1 @@
+*
\ No newline at end of file
diff --git a/marginalia_nu/data/models/.gitignore b/marginalia_nu/data/models/.gitignore
new file mode 100644
index 00000000..f59ec20a
--- /dev/null
+++ b/marginalia_nu/data/models/.gitignore
@@ -0,0 +1 @@
+*
\ No newline at end of file
diff --git a/marginalia_nu/data/test/.gitignore b/marginalia_nu/data/test/.gitignore
new file mode 100644
index 00000000..f59ec20a
--- /dev/null
+++ b/marginalia_nu/data/test/.gitignore
@@ -0,0 +1 @@
+*
\ No newline at end of file
diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
new file mode 100644
index 00000000..915b1b76
--- /dev/null
+++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java
@@ -0,0 +1,197 @@
+package nu.marginalia.wmsa.edge;
+
+
+import nu.marginalia.wmsa.configuration.ServiceDescriptor;
+import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain;
+import org.jsoup.Jsoup;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+import org.openqa.selenium.By;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.openzim.ZIMTypes.ZIMFile;
+import org.openzim.ZIMTypes.ZIMReader;
+import org.slf4j.LoggerFactory;
+import org.testcontainers.containers.*;
+import org.testcontainers.containers.output.Slf4jLogConsumer;
+import org.testcontainers.containers.wait.strategy.Wait;
+import org.testcontainers.junit.jupiter.Container;
+import org.testcontainers.junit.jupiter.Testcontainers;
+import org.testcontainers.utility.MountableFile;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.time.Duration;
+import java.util.ArrayList;
+import java.util.List;
+
+import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*;
+import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL;
+
+@Tag("e2e")
+@Testcontainers
+/**
+ * End-to-end smoke test: starts MariaDB, the search/assistant/index services, a mock
+ * wikipedia web server, a crawler job, an nginx proxy and a Chrome browser as containers
+ * on one shared network, then loads the front page and a search result page.
+ */
+public class EdgeSearchE2ETest {
+    /** Network shared by every container so they can address each other by alias. */
+    Network network = Network.newNetwork();
+
+    @Container
+    public GenericContainer<?> mariaDB = new MariaDBContainer<>("mariadb")
+            .withDatabaseName("WMSA_prod")
+            .withUsername("wmsa")
+            .withPassword("wmsa")
+            .withInitScript("sql/edge-crawler-cache.sql")
+            .withNetwork(network)
+            .withNetworkAliases("mariadb");
+
+    @Container
+    public GenericContainer<?> searchContainer = forService(EDGE_SEARCH);
+    @Container
+    public GenericContainer<?> assistantContainer = forService(EDGE_ASSISTANT);
+    @Container
+    public GenericContainer<?> indexContainer = forService(EDGE_INDEX);
+
+    /** Serves the articles written by {@link #getWikipediaFiles()} under the alias "wikipedia". */
+    @Container
+    public NginxContainer<?> mockWikipedia = new NginxContainer<>("nginx:stable")
+            .dependsOn(searchContainer)
+            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("wikipedia")))
+            .withFileSystemBind(getWikipediaFiles(), "/usr/share/nginx/html/", BindMode.READ_ONLY)
+            .withNetwork(network)
+            .withNetworkAliases("wikipedia");
+
+
+    @Container
+    public BrowserWebDriverContainer<?> chrome = new BrowserWebDriverContainer<>()
+            .withNetwork(network)
+            .withCapabilities(new ChromeOptions());
+
+    /** Runs crawl.sh from the jar; considered started once the log contains "ALL DONE". */
+    @Container
+    public GenericContainer<?> crawlerContainer = new GenericContainer<>("openjdk:17-alpine")
+            .dependsOn(mockWikipedia)
+            .dependsOn(indexContainer)
+            .withNetwork(network)
+            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler")))
+            .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
+            .withCopyFileToContainer(jarFile(), "/WMSA.jar")
+            .withCopyFileToContainer(MountableFile.forClasspathResource("crawl.sh"), "/crawl.sh")
+            .withFileSystemBind(getCrawlPath().toString(), "/crawl/", BindMode.READ_WRITE)
+            .withCommand("sh", "crawl.sh")
+            .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10)));
+
+    @Container
+    public NginxContainer<?> proxyNginx = new NginxContainer<>("nginx:stable")
+            .dependsOn(searchContainer)
+            .dependsOn(crawlerContainer)
+            .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx")))
+            .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/search.conf"), "/etc/nginx/conf.d/default.conf")
+            .withNetwork(network)
+            .withNetworkAliases("proxyNginx");
+
+    /**
+     * Builds a container that runs one WMSA service from the shadow jar via init.sh.
+     *
+     * @param service the service to start; its name is used as network alias, logger name
+     *                and init.sh argument, its port is exposed and polled on /internal/ping
+     * @return the configured, not yet started container
+     */
+    public GenericContainer<?> forService(ServiceDescriptor service) {
+        return new GenericContainer<>("openjdk:17-alpine")
+                .dependsOn(mariaDB)
+                .withCopyFileToContainer(jarFile(), "/WMSA.jar")
+                .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh")
+                .withExposedPorts(service.port)
+                .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
+                .withNetwork(network)
+                .withNetworkAliases(service.name)
+                .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name)))
+                .withCommand("sh", "init.sh", service.name)
+                .waitingFor(Wait.forHttp("/internal/ping")
+                        .forPort(service.port)
+                        .withReadTimeout(Duration.ofSeconds(15)));
+    }
+
+    /**
+     * Locates the shadow jar in build/libs of the parent of the working directory.
+     *
+     * @return the jar as a mountable host file
+     * @throws IllegalStateException if the jar has not been built
+     */
+    public static MountableFile jarFile() {
+        Path cwd = Path.of(System.getProperty("user.dir"));
+
+        cwd = cwd.resolve("..");
+        var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar");
+        if (!Files.exists(jarFile)) {
+            // Carry the path in the exception so the failure is visible in the test report.
+            throw new IllegalStateException("Could not find jarFile " + jarFile);
+        }
+        System.out.println("jar file = " + jarFile);
+        return MountableFile.forHostPath(jarFile);
+    }
+
+    /**
+     * Locates the data/models directory below the working directory.
+     *
+     * @return the directory path as a string
+     * @throws IllegalStateException if the directory does not exist
+     */
+    public static String modelsPath() {
+        Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models");
+        if (!Files.isDirectory(modelsPath)) {
+            throw new IllegalStateException("Could not find models, looked in " + modelsPath.toAbsolutePath());
+        }
+        return modelsPath.toString();
+    }
+
+    /** Host directory that is mounted read-write as /crawl/ in the crawler container. */
+    private Path getCrawlPath() {
+        return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl");
+    }
+
+    /**
+     * Prepares the fixtures for the crawl: extracts every article of the test ZIM file to
+     * build/tmp/wikipedia as script-free HTML, and writes crawl.plan and crawl.spec into
+     * the crawl directory.
+     *
+     * @return the directory holding the extracted HTML files
+     * @throws java.io.UncheckedIOException if the fixtures cannot be prepared
+     */
+    private String getWikipediaFiles() {
+        Path wikipediaFiles = Path.of(System.getProperty("user.dir")).resolve("build/tmp/wikipedia");
+        Path crawlFiles = getCrawlPath();
+        Path zimFile = Path.of(System.getProperty("user.dir")).resolve("data/test/wikipedia_en_100_nopic.zim");
+
+        List<String> urls = new ArrayList<>();
+        try {
+            // Files.deleteIfExists refuses to remove a populated directory, which made every
+            // run after the first one fail here; clear the previous output recursively instead.
+            deleteRecursively(wikipediaFiles);
+            Files.createDirectories(wikipediaFiles);
+            Files.createDirectories(crawlFiles);
+
+            Files.writeString(crawlFiles.resolve("crawl.plan"), """
+                    jobSpec: "/crawl/crawl.spec"
+                    crawl:
+                      dir: "/crawl/crawl"
+                      logName: "crawl.log"
+                    process:
+                      dir: "/crawl/process"
+                      logName: "process.log"
+                    """);
+
+            Files.createDirectories(crawlFiles.resolve("crawl"));
+            Files.createDirectories(crawlFiles.resolve("process"));
+            Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log"));
+            Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log"));
+
+            var zr = new ZIMReader(new ZIMFile(zimFile.toString()));
+            zr.forEachArticles((url, art) -> {
+                urls.add("http://wikipedia/" + url + ".html");
+
+                if (art != null) {
+                    try {
+                        var doc = Jsoup.parse(art);
+                        doc.getElementsByTag("script").remove();
+                        Files.writeString(wikipediaFiles.resolve(url+".html"), doc.html());
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                }
+            }, pred -> true);
+            urls.forEach(System.out::println);
+            Files.writeString(wikipediaFiles.resolve("index.html"), "");
+            CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"), "wikipedia", urls);
+        }
+        catch (IOException ex) {
+            // Fail the test setup instead of continuing with missing or stale fixtures.
+            throw new java.io.UncheckedIOException("Failed to prepare the mock wikipedia files in " + wikipediaFiles, ex);
+        }
+        return wikipediaFiles.toString();
+    }
+
+    /** Deletes {@code root} and everything below it; does nothing if it does not exist. */
+    private static void deleteRecursively(Path root) throws IOException {
+        if (!Files.exists(root)) {
+            return;
+        }
+        try (var paths = Files.walk(root)) {
+            // Deepest entries first so that directories are empty when they are removed.
+            for (Path path : paths.sorted(java.util.Comparator.reverseOrder()).toList()) {
+                Files.delete(path);
+            }
+        }
+    }
+
+    /** Loads the front page and one search through the proxy and prints what the browser sees. */
+    @Test
+    public void run() {
+        var driver = chrome.getWebDriver();
+
+        driver.get("http://proxyNginx/");
+        System.out.println(driver.getTitle());
+        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+
+        driver.get("http://proxyNginx/search?query=bird&profile=corpo");
+        System.out.println(driver.getTitle());
+        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+    }
+}
diff --git a/marginalia_nu/src/e2e/resources/crawl.sh b/marginalia_nu/src/e2e/resources/crawl.sh
new file mode 100644
index 00000000..411fe708
--- /dev/null
+++ b/marginalia_nu/src/e2e/resources/crawl.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+mkdir -p /var/lib/wmsa/conf/
+
+cat > /var/lib/wmsa/db.properties < /var/lib/wmsa/conf/hosts < /var/lib/wmsa/suggestions.txt < /var/lib/wmsa/db.properties < /var/lib/wmsa/conf/ranking-settings.yaml < /var/lib/wmsa/conf/hosts < hostsMap = new HashMap<>(ServiceDescriptor.values().length);
+
+ /**
+  * Reads a hosts file with one "service-name host-name" entry per line.
+  * Blank lines and lines starting with '#' are skipped.
+  *
+  * @param fileName the hosts file to read
+  * @throws IOException if the file cannot be read
+  * @throws IllegalArgumentException if an entry does not have exactly two fields or
+  *         names an unknown service
+  */
+ public HostsFile(Path fileName) throws IOException {
+     for (var line : Files.readAllLines(fileName)) {
+         // Strip before classifying the line so indented comments are skipped as well.
+         String entry = line.strip();
+         if (entry.isEmpty() || entry.startsWith("#")) {
+             continue;
+         }
+         // Accept tabs and repeated spaces between the two fields, not only a single space.
+         String[] parts = entry.split("\\s+");
+         if (parts.length != 2) throw new IllegalArgumentException("Invalid hosts file entry " + line);
+         String descriptorName = parts[0];
+         String hostName = parts[1];
+
+         try {
+             hostsMap.put(ServiceDescriptor.byName(descriptorName), hostName);
+         }
+         catch (IllegalArgumentException ex) {
+             // Keep the original exception as the cause.
+             throw new IllegalArgumentException("ServiceDescriptor " + descriptorName + " invalid", ex);
+         }
+     }
+ }
+
+ /** Creates the default mapping, in which every service descriptor resolves to "localhost". */
+ public HostsFile() {
+ for (var sd : ServiceDescriptor.values()) {
+ hostsMap.put(sd, "localhost");
+ }
+ }
+
+ /**
+  * Looks up the host name for a service.
+  *
+  * @param sd the service to look up
+  * @return the mapped host name, or null if the map has no entry for the service
+  */
+ public String getHost(ServiceDescriptor sd) {
+ return hostsMap.get(sd);
+ }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
index 9bfe2a2b..165272de 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
@@ -21,6 +21,9 @@ import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
import org.apache.logging.log4j.core.lookup.MainMapLookup;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -49,13 +52,21 @@ public enum ServiceDescriptor {
TEST_1("test-1", 0, null),
TEST_2("test-2", 0, null);
+ private static HostsFile hostsFile;
+
+ /**
+  * Returns the host name for this service, loading the hosts file through
+  * WmsaHome.getHostsFile() on first use.
+  *
+  * @return whatever the loaded hosts file maps this service to
+  */
+ public String getHost() {
+     // The cache is a static field shared by every constant, so the lock has to be
+     // class-wide; a synchronized instance method locks each constant separately.
+     synchronized (ServiceDescriptor.class) {
+         if (hostsFile == null) {
+             hostsFile = WmsaHome.getHostsFile();
+         }
+         return hostsFile.getHost(this);
+     }
+ }
+
public static ServiceDescriptor byName(String name) {
for (var v : values()) {
if (v.name.equals(name)) {
return v;
}
}
- throw new IllegalArgumentException(name);
+ throw new IllegalArgumentException("Invalid ServiceDescriptor " + name);
}
public final String name;
public final Class> mainClass;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java
index f749c9a6..2a96de20 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java
@@ -1,16 +1,31 @@
package nu.marginalia.wmsa.configuration;
+import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
public class WmsaHome {
private static final String DEFAULT = "/var/lib/wmsa";
- public static Path get() {
+ public static Path getHomePath() {
var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT));
if (!Files.isDirectory(ret)) {
throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists");
}
return ret;
}
+
+ /**
+  * Loads conf/hosts from the WMSA home directory.
+  *
+  * @return the parsed hosts file, or the default mapping when conf/hosts is not a regular file
+  * @throws RuntimeException if the file exists but cannot be read
+  */
+ public static HostsFile getHostsFile() {
+     final Path hostsPath = getHomePath().resolve("conf/hosts");
+
+     // No hosts file configured: fall back to the default mapping.
+     if (!Files.isRegularFile(hostsPath)) {
+         return new HostsFile();
+     }
+
+     try {
+         return new HostsFile(hostsPath);
+     }
+     catch (IOException e) {
+         throw new RuntimeException("Failed to load hosts file " + hostsPath, e);
+     }
+ }
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java
index 8ce96c3a..609058ed 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java
@@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule {
}
private Properties loadDbProperties() {
- Path propDir = WmsaHome.get().resolve("db.properties");
+ Path propDir = WmsaHome.getHomePath().resolve("db.properties");
if (!Files.isRegularFile(propDir)) {
throw new IllegalStateException("Database properties file " + propDir + " does not exist");
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java
index 84d44326..6fe88d08 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java
@@ -33,7 +33,7 @@ public class LoaderMain {
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
private final LoaderFactory loaderFactory;
private final EdgeIndexClient indexClient;
- private final boolean running = true;
+ private volatile boolean running = true;
final Thread processorThread = new Thread(this::processor, "Processor Thread");
@@ -82,8 +82,11 @@ public class LoaderMain {
load(entry.path(), entry.cnt());
});
+ running = false;
processorThread.join();
indexClient.close();
+
+ System.exit(0);
}
private volatile static int loadTotal;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java
new file mode 100644
index 00000000..050152bc
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java
@@ -0,0 +1,81 @@
+package nu.marginalia.wmsa.edge.converting;
+
+import nu.marginalia.wmsa.configuration.ServiceDescriptor;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import okhttp3.MediaType;
+import okhttp3.OkHttpClient;
+import okhttp3.Request;
+import okhttp3.RequestBody;
+import okio.BufferedSink;
+import org.jetbrains.annotations.Nullable;
+
+import java.io.IOException;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.sql.SQLException;
+import java.util.concurrent.TimeUnit;
+
+import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
+
+/**
+ * Command line tool that prints a sample of the domain and URL tables, refreshes
+ * DOMAIN_METADATA, and then asks the index service to repartition, preconvert and reindex.
+ */
+public class ReindexTriggerMain {
+
+    /**
+     * @param args args[0] is the host name of the index service
+     * @throws IllegalArgumentException if the host name argument is missing
+     * @throws IOException if a request to the index service fails
+     * @throws SQLException if a database statement fails
+     */
+    public static void main(String... args) throws IOException, SQLException {
+        // Validate up front so a missing argument is reported before the database is modified.
+        if (args.length < 1) {
+            throw new IllegalArgumentException("Expected the index service host name as the first argument");
+        }
+        final String indexHost = args[0];
+
+        var db = new DatabaseModule();
+        var client = new OkHttpClient.Builder()
+                .connectTimeout(100, TimeUnit.MILLISECONDS)
+                .readTimeout(15, TimeUnit.MINUTES)
+                .retryOnConnectionFailure(true)
+                .followRedirects(true)
+                .build();
+
+        try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) {
+            try (var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100")) {
+                while (rs.next()) {
+                    System.out.printf("%d %s %s %d\n",
+                            rs.getInt(1),
+                            rs.getString(2),
+                            rs.getString(3),
+                            rs.getInt(4));
+                }
+            }
+
+            try (var rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100")) {
+                while (rs.next()) {
+                    System.out.printf("%d %d %s %d %s\n",
+                            rs.getInt(1),
+                            rs.getInt(2),
+                            rs.getString(3),
+                            rs.getInt(4),
+                            rs.getString(5));
+                }
+            }
+
+            stmt.executeUpdate("INSERT IGNORE INTO DOMAIN_METADATA(ID,GOOD_URLS,KNOWN_URLS,VISITED_URLS) SELECT ID,0,0,0 FROM EC_DOMAIN WHERE INDEXED>0");
+            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED AND STATE='ok' GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET GOOD_URLS=CNT");
+            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET KNOWN_URLS=CNT");
+            stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET VISITED_URLS=CNT");
+        }
+
+        var rb = new RequestBody() {
+
+            @Nullable
+            @Override
+            public MediaType contentType() {
+                return MediaType.parse("text/plain");
+            }
+
+            @Override
+            public void writeTo(BufferedSink sink) throws IOException {
+                sink.writeString("NOOP", Charset.defaultCharset());
+            }
+        };
+
+        post(client, rb, indexHost, "/ops/repartition");
+        post(client, rb, indexHost, "/ops/preconvert");
+        for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
+            post(client, rb, indexHost, "/ops/reindex/" + i);
+        }
+
+    }
+
+    /** Posts {@code body} to the given path on the index service and releases the response. */
+    private static void post(OkHttpClient client, RequestBody body, String host, String path) throws IOException {
+        var request = new Request.Builder()
+                .post(body)
+                .url(new URL("http", host, ServiceDescriptor.EDGE_INDEX.port, path))
+                .build();
+
+        // The response is not inspected, but it must be closed to release its connection.
+        try (var response = client.newCall(request).execute()) {
+        }
+    }
+
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java
index d370b3c0..46d71505 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java
@@ -25,6 +25,8 @@ public class IndexLoadKeywords implements Runnable {
private final Thread runThread;
private volatile boolean canceled = false;
+ private static final int index = Integer.getInteger("keyword-index", 1);
+
@Inject
public IndexLoadKeywords(EdgeIndexClient client) {
this.client = client;
@@ -37,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
while (!canceled) {
var data = insertQueue.poll(1, TimeUnit.SECONDS);
if (data != null) {
- client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, 1).blockingSubscribe();
+ client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
}
}
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
index 6f1037ba..324085fc 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
@@ -79,15 +79,21 @@ public class DocumentProcessor {
ret.url = new EdgeUrl(crawledDocument.url);
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
- if (ret.state == EdgeUrlState.OK && isAcceptedContentType(crawledDocument)) {
- var detailsWords = createDetails(crawledDomain, crawledDocument);
+ if (ret.state == EdgeUrlState.OK) {
- if (detailsWords.details().quality < minDocumentQuality) {
- throw new DisqualifiedException(DisqualificationReason.QUALITY);
+ if (isAcceptedContentType(crawledDocument)) {
+ var detailsWords = createDetails(crawledDomain, crawledDocument);
+
+ if (detailsWords.details().quality < minDocumentQuality) {
+ throw new DisqualifiedException(DisqualificationReason.QUALITY);
+ }
+
+ ret.details = detailsWords.details();
+ ret.words = detailsWords.words();
+ }
+ else {
+ throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
}
-
- ret.details = detailsWords.details();
- ret.words = detailsWords.words();
}
else {
throw new DisqualifiedException(DisqualificationReason.STATUS);
@@ -95,7 +101,7 @@ public class DocumentProcessor {
}
catch (DisqualifiedException ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
- logger.debug("Disqualified {}: {}", ret.url, ex.reason);
+ logger.info("Disqualified {}: {}", ret.url, ex.reason);
}
catch (Exception ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java
index c407fbe8..2f25d6d7 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java
@@ -73,7 +73,7 @@ public class CrawlJobExtractorMain {
private final EdgeDomainBlacklistImpl blacklist;
private final Connection conn;
- private final HashFunction hasher = Hashing.murmur3_128(0);
+ private static final HashFunction hasher = Hashing.murmur3_128(0);
public static void main(String... args) throws SQLException, IOException {
Driver driver = new Driver();
@@ -97,6 +97,19 @@ public class CrawlJobExtractorMain {
}
}
+ /**
+  * Writes a single crawl job for {@code domain} as one line of JSON, zstd-compressed.
+  *
+  * @param outFile the file to create or overwrite
+  * @param domain the domain the job belongs to; also used to derive the job id
+  * @param urls the URLs to crawl; the crawl depth is set to their number
+  * @throws IOException if the file cannot be opened or written
+  */
+ public static void writeSpec(Path outFile, String domain, List<String> urls) throws IOException {
+     Gson gson = new GsonBuilder().create();
+
+     // Use an explicit charset so the output does not depend on the platform default.
+     try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))),
+             false, java.nio.charset.StandardCharsets.UTF_8)) {
+         var job = new CrawlingSpecification();
+         job.crawlDepth = urls.size();
+         job.domain = domain;
+         job.id = createId(new EdgeDomain(domain));
+         job.urls = urls;
+         out.println(gson.toJson(job));
+
+         // PrintWriter swallows write errors; surface them as the declared IOException.
+         if (out.checkError()) {
+             throw new IOException("Failed to write crawl specification to " + outFile);
+         }
+     }
+ }
+
private record DomainWithId(String domainName, int id) {}
private Stream extractDomains() {
@@ -186,11 +199,11 @@ public class CrawlJobExtractorMain {
return spec;
}
- private String createId(DomainWithId domainWithId) {
+ private static String createId(DomainWithId domainWithId) {
return hasher.hashUnencodedChars(domainWithId.domainName).toString();
}
- private String createId(EdgeDomain domain) {
+ private static String createId(EdgeDomain domain) {
return hasher.hashUnencodedChars(domain.toString()).toString();
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java
index ea62e742..d81e348b 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java
@@ -79,6 +79,9 @@ public class CrawlerMain implements AutoCloseable {
try (var crawler = new CrawlerMain(plan)) {
crawler.run();
}
+
+ // TODO (2022-05-24): Some thread isn't set to daemon mode, need to explicitly harakiri the process, find why?
+ System.exit(0);
}
private CrawledDomain fetchDomain(CrawlingSpecification specification) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
index 467376f5..87d4f3df 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java
@@ -20,7 +20,7 @@ import java.time.LocalDateTime;
import java.util.*;
public class CrawlerRetreiver {
- private static final long DEFAULT_CRAWL_DELAY_MS = 1000;
+ private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000);
private final LinkedList queue = new LinkedList<>();
private final HttpFetcher fetcher;
private final HashSet visited;
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
index 3c65464e..b590af55 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java
@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException;
public class EdgeIndexControl {
@@ -19,11 +20,15 @@ public class EdgeIndexControl {
System.gc();
for (IndexBlock block : IndexBlock.values()) {
+ try {
+ servicesFactory.getIndexConverter(id, block);
- servicesFactory.getIndexConverter(id, block);
+ System.runFinalization();
+ System.gc();
+ }
+ catch (ConversionUnnecessaryException unnecessary) {
- System.runFinalization();
- System.gc();
+ }
}
System.runFinalization();
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java
index d06a7b22..986f1874 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java
@@ -13,12 +13,18 @@ public class EdgeIndexModule extends AbstractModule {
public void configure() {
- bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
+ // The "small-ram" system property selects the 2^27 dictionary size instead of the default 2^31.
+ final long dictionarySize = Boolean.getBoolean("small-ram") ? (1L << 27) : (1L << 31);
+ bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(dictionarySize);
+
}
@Provides
public RankingSettings rankingSettings() {
- Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml");
+ Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml");
return RankingSettings.from(dir);
}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
index 9b26989c..fb58ac0e 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java
@@ -88,8 +88,8 @@ public class IndexServicesFactory {
return new DictionaryReader(getDictionaryWriter());
}
- @SneakyThrows
- public SearchIndexConverter getIndexConverter(int id, IndexBlock block) {
+
+ public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException {
return new SearchIndexConverter(block, id, tmpFileDir,
preconverterOutputFile.get(id),
indexWriteWordsFile.get(id, block.id),
@@ -146,14 +146,17 @@ public class IndexServicesFactory {
public Callable switchFilesJob(int id) {
return () -> {
for (int block = 0; block < IndexBlock.values().length; block++) {
- Files.move(
- indexWriteWordsFile.get(id, block).toPath(),
- indexReadWordsFile.get(id, block).toPath(),
- StandardCopyOption.REPLACE_EXISTING);
- Files.move(
- indexWriteUrlsFile.get(id, block).toPath(),
- indexReadUrlsFile.get(id, block).toPath(),
- StandardCopyOption.REPLACE_EXISTING);
+ if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) &&
+ Files.exists(indexWriteUrlsFile.get(id, block).toPath())) {
+ Files.move(
+ indexWriteWordsFile.get(id, block).toPath(),
+ indexReadWordsFile.get(id, block).toPath(),
+ StandardCopyOption.REPLACE_EXISTING);
+ Files.move(
+ indexWriteUrlsFile.get(id, block).toPath(),
+ indexReadUrlsFile.get(id, block).toPath(),
+ StandardCopyOption.REPLACE_EXISTING);
+ }
}
return true;
};
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java
new file mode 100644
index 00000000..fd7f529f
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java
@@ -0,0 +1,10 @@
+package nu.marginalia.wmsa.edge.index.service.index;
+
+/** Checked exception signalling that a conversion is unnecessary; it carries no message, cause or stack trace. */
+public class ConversionUnnecessaryException extends Exception {
+ public ConversionUnnecessaryException() {
+ // writableStackTrace=false: no frames are captured, so getStackTrace() is empty and
+ // printStackTrace() prints none. Overriding getStackTrace() alone does not achieve that.
+ super(null, null, true, false);
+ }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java
index 95a47a69..c9b69386 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java
@@ -61,6 +61,7 @@ public class SearchIndexConverter {
@Named("edge-index-write-urls-file") File outputFileUrls,
SearchIndexPartitioner partitioner,
EdgeDomainBlacklist blacklist)
+ throws ConversionUnnecessaryException
{
this.block = block;
this.bucketId = bucketId;
@@ -77,16 +78,21 @@ public class SearchIndexConverter {
this.fileLength = raf.readLong();
this.wordCount = raf.readInt();
+ if (fileLength <= FILE_HEADER_SIZE) {
+ throw new ConversionUnnecessaryException();
+ }
+
var inputChannel = raf.getChannel();
ByteBuffer buffer = ByteBuffer.allocateDirect(10_000);
- urlsFileSize = getUrlsSize(buffer, raf);
+ urlsFileSize = getUrlsSize(buffer, inputChannel);
var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
+
var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
- urlsTmpFileChannel = new RandomAccessFile(tmpUrlsFile.toFile(), "rw").getChannel();
+ urlsTmpFileChannel = urlsTmpFileRaf.getChannel();
urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false);
urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256);
@@ -114,6 +120,139 @@ public class SearchIndexConverter {
}
+ private long getUrlsSize(ByteBuffer buffer, FileChannel channel) throws IOException {
+ channel.position(FILE_HEADER_SIZE);
+
+ var reader = new IndexReader(buffer, channel) {
+ public long size;
+
+ @Override
+ public void eachWord(long urlId, int wordId) {
+ size++;
+ }
+ };
+
+ reader.read();
+
+ logger.info("Blacklist filtered {} URLs", reader.filtered);
+ logger.debug("URLs Size {} Mb", channel.position()/(1024*1024));
+
+ return reader.size;
+ }
+
+ private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException {
+ logger.debug("Table size = {}", wordIndexTable.length);
+ int[] wordIndex = new int[wordIndexTable.length];
+ raf.seek(FILE_HEADER_SIZE);
+ // Pass 1: re-read the input after the header; each accepted (url, word) pair is put at wordIndexTable[wordId - 1] plus a per-word counter.
+ var channel = raf.getChannel();
+
+ try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
+ var reader = new IndexReader(buffer, channel) {
+ @Override
+ public void eachWord(long urlId, int wordId) throws IOException {
+ if (wordId >= wordIndex.length)
+ return;
+
+ if (wordId != 0) {
+ if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) {
+ logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}",
+ wordId,
+ wordIndex[wordId],
+ wordIndexTable[wordId - 1],
+ wordIndexTable[wordId]);
+ throw new IllegalStateException();
+ }
+ }
+ if (wordId > 0) {
+ rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId));
+ } else {
+ rwf.put(wordIndex[wordId]++, translateUrl(urlId));
+ }
+ }
+ };
+
+ reader.read();
+
+ rwf.write(urlsTmpFileChannel);
+ }
+
+ urlsTmpFileChannel.force(false);
+
+ logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024));
+
+ if (wordIndexTable.length > 0) {
+ logger.debug("Sorting urls table");
+ sortUrls(wordIndexTable);
+ urlsTmpFileMap.force();
+ }
+ else {
+ logger.warn("urls table empty -- nothing to sort");
+ }
+ // Pass 2: for every non-empty range of wordIndexTable, call writer.write with a callback that copies that range from the temp channel.
+ // idx sums the values returned by writer.write and is passed as the first argument of the next call.
+ long idx = 0;
+
+ try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
+ var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
+
+ if (wordIndexTable.length > 0 && wordIndexTable[0] != 0) { // length guard: an empty table has no element 0 to read
+ int start = 0;
+ int end = (int) wordIndexTable[0];
+
+ idx += writer.write(idx, (int) wordIndexTable[0],
+ offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
+ }
+
+ for (int i = 1; i < wordIndexTable.length; i++) {
+ if (wordIndexTable[i] != wordIndexTable[i - 1]) {
+ long start = wordIndexTable[i-1];
+ long end = wordIndexTable[i];
+
+ idx += writer.write(idx, (int) (end-start),
+ offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
+ }
+ }
+ } catch (Exception e) {
+ logger.error("Failed to write urls file {}", urlsFile, e); // still logged rather than rethrown, now via the logger
+ }
+ }
+
+ @SneakyThrows
+ private void sortUrls(long[] wordIndices) {
+ urlTmpFileSorter.sort( 0, (int) wordIndices[0]);
+
+ for (int i = 1; i < wordIndices.length; i++) {
+ urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1]));
+ }
+ }
+
+ private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception {
+ inputChannel.position(FILE_HEADER_SIZE);
+
+ logger.debug("Table size = {}", wordCount);
+ WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount);
+ ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE);
+
+ logger.debug("Reading words");
+
+ var reader = new IndexReader(buffer, inputChannel) {
+ @Override
+ public void eachWord(long urlId, int wordId) {
+ wordsTableWriter.acceptWord(wordId);
+ }
+ };
+ reader.read();
+
+ logger.debug("Rearranging table");
+
+ inputChannel.position(FILE_HEADER_SIZE);
+
+ wordsTableWriter.write(outputFileWords);
+
+ return wordsTableWriter.getTable();
+ }
+
@RequiredArgsConstructor
private class IndexReader {
private final ByteBuffer buffer;
@@ -193,7 +332,7 @@ public class SearchIndexConverter {
public void eachUrl(Lock lock, int count, long urlId) throws IOException {
for (int i = 0; i < count; i++) {
int wordId = buffer.getInt();
- if (acceptWord(lock, urlId, wordId, i, block.id)) {
+ if (acceptWord(lock, urlId)) {
eachWord(urlId, wordId);
}
}
@@ -201,183 +340,16 @@ public class SearchIndexConverter {
public void eachWord(long urlId, int wordId) throws IOException {
}
- }
- private long getUrlsSize(ByteBuffer buffer, RandomAccessFile raf) throws IOException {
- raf.seek(FILE_HEADER_SIZE);
+ boolean acceptWord(Lock lock, long urlId) {
+ int domainId = (int) (urlId >>> 32L);
- var channel = raf.getChannel();
-
- var reader = new IndexReader(buffer, channel) {
- public long size;
-
- @Override
- public void eachWord(long urlId, int wordId) {
- size++;
- }
- };
-
- reader.read();
-
- logger.info("Blacklist filtered {} URLs", reader.filtered);
- logger.debug("URLs Size {} Mb", channel.position()/(1024*1024));
-
- return reader.size;
- }
-
- private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException {
- logger.debug("Table size = {}", wordIndexTable.length);
- int[] wordIndex = new int[wordIndexTable.length];
- raf.seek(FILE_HEADER_SIZE);
-
- var channel = raf.getChannel();
-
- try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) {
- var reader = new IndexReader(buffer, channel) {
- @Override
- public void eachWord(long urlId, int wordId) throws IOException {
- if (wordId >= wordIndex.length)
- return;
-
- if (wordId != 0) {
- if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) {
- logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}",
- wordId,
- wordIndex[wordId],
- wordIndexTable[wordId - 1],
- wordIndexTable[wordId]);
- throw new IllegalStateException();
- }
- }
- if (wordId > 0) {
- rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId));
- } else {
- rwf.put(wordIndex[wordId]++, translateUrl(urlId));
- }
- }
- };
-
- reader.read();
-
- rwf.write(urlsTmpFileChannel);
- }
-
- urlsTmpFileChannel.force(false);
-
- logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024));
-
- if (wordIndexTable.length > 0) {
- logger.debug("Sorting urls table");
- sortUrls(wordIndexTable);
- urlsTmpFileMap.force();
- }
- else {
- logger.warn("urls table empty -- nothing to sort");
- }
-
-
- long idx = 0;
-
- var copyBuffer = ByteBuffer.allocateDirect(4096);
- try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) {
- var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext);
-
- if (wordIndexTable[0] != 0) {
- int start = 0;
- int end = (int) wordIndexTable[0];
-
- idx += writer.write(idx, (int) wordIndexTable[0],
- offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
+ if (!partitioner.filterUnsafe(lock, domainId, bucketId)) {
+ return false;
}
- for (int i = 1; i < wordIndexTable.length; i++) {
- if (wordIndexTable[i] != wordIndexTable[i - 1]) {
- long start = wordIndexTable[i-1];
- long end = wordIndexTable[i];
-
- idx += writer.write(idx, (int) (end-start),
- offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end));
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
+ return true;
}
-
- logger.warn("BTrees generated");
- }
-
- public void transfer(ByteBuffer buffer, MultimapFileLong dest, FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException {
- int tbw = 0;
-
- buffer.limit(Math.min(buffer.capacity(), (int)(sourceEnd - sourceStart)*8));
- while (sourceEnd - sourceStart - tbw > buffer.limit()/8) {
- int bw = 0;
- while (buffer.position() < buffer.limit()) {
- int r = sourceChannel.read(buffer, sourceStart*8 + bw);
- if (r < 0) {
- throw new IOException("");
- }
- bw += r;
- }
- buffer.flip();
- dest.write(buffer.asLongBuffer(), destOffset + tbw);
- tbw += bw/8;
- buffer.clear();
- buffer.limit(Math.min(buffer.capacity(), (int)(sourceEnd*8 - sourceStart*8 - tbw)));
- }
- buffer.clear();
- buffer.limit((int)(sourceEnd - (sourceStart + tbw))*8);
- int bw = 0;
- while (bw < buffer.limit()) {
- bw += sourceChannel.read(buffer, sourceStart + bw);
- }
- buffer.flip();
- dest.write(buffer.asLongBuffer(), destOffset + tbw);
- }
-
- @SneakyThrows
- private void sortUrls(long[] wordIndices) {
- urlTmpFileSorter.sort( 0, (int) wordIndices[0]);
-
- for (int i = 1; i < wordIndices.length; i++) {
- urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1]));
- }
- }
-
- private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception {
- inputChannel.position(FILE_HEADER_SIZE);
-
- logger.debug("Table size = {}", wordCount);
- WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount);
- ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE);
-
- logger.debug("Reading words");
-
- var reader = new IndexReader(buffer, inputChannel) {
- @Override
- public void eachWord(long urlId, int wordId) {
- wordsTableWriter.acceptWord(wordId);
- }
- };
- reader.read();
-
- logger.debug("Rearranging table");
-
- inputChannel.position(FILE_HEADER_SIZE);
-
- wordsTableWriter.write(outputFileWords);
-
- return wordsTableWriter.getTable();
- }
-
- boolean acceptWord(Lock lock, long urlId, int wordId, int wordIdx, int block) {
- int domainId = (int) (urlId >>> 32L);
-
- if (!partitioner.filterUnsafe(lock, domainId, bucketId)) {
- return false;
- }
-
- return true;
}
}
diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
index 2f706bba..1460375c 100644
--- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
+++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql
@@ -18,7 +18,14 @@ DROP VIEW IF EXISTS EC_URL_PART_HASH;
DROP TABLE IF EXISTS EC_URL_WORD;
DROP TABLE IF EXISTS EC_DICTIONARY;
+DROP TABLE IF EXISTS DOMAIN_METADATA;
+CREATE TABLE IF NOT EXISTS DOMAIN_METADATA (
+ ID INT PRIMARY KEY,
+ KNOWN_URLS INT DEFAULT 0,
+ VISITED_URLS INT DEFAULT 0,
+ GOOD_URLS INT DEFAULT 0
+);
CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN (
ID INT PRIMARY KEY AUTO_INCREMENT,
diff --git a/marginalia_nu/src/test/java/EmptyTest.java b/marginalia_nu/src/test/java/EmptyTest.java
deleted file mode 100644
index e789f2cf..00000000
--- a/marginalia_nu/src/test/java/EmptyTest.java
+++ /dev/null
@@ -1,8 +0,0 @@
-import org.junit.jupiter.api.Test;
-
-public class EmptyTest {
- @Test
- public void test() {
-
- }
-}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java
index 2d83c3c9..2340492e 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java
@@ -1,5 +1,6 @@
package nu.marginalia.util;
+import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
import java.nio.file.Files;
@@ -7,10 +8,9 @@ import java.nio.file.Path;
import java.util.Optional;
public class TestLanguageModels {
- private static final Path LANGUAGE_MODELS_DEFAULT = Path.of("/home/vlofgren/Work/ngrams/");
-
- public static LanguageModels getLanguageModels() {
+ private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");
+ public static Path getLanguageModelsPath() {
final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME"))
.map(Path::of)
.orElse(LANGUAGE_MODELS_DEFAULT);
@@ -18,14 +18,20 @@ public class TestLanguageModels {
if (!Files.isDirectory(languageModelsHome)) {
throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
}
+ return languageModelsHome;
+ }
+
+ public static LanguageModels getLanguageModels() {
+
+ var languageModelsHome = getLanguageModelsPath();
return new LanguageModels(
languageModelsHome.resolve("ngrams-generous-emstr.bin"),
languageModelsHome.resolve("tfreq-generous-emstr.bin"),
- languageModelsHome.resolve("opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"),
+ languageModelsHome.resolve("opennlp-sentence.bin"),
languageModelsHome.resolve("English.RDR"),
languageModelsHome.resolve("English.DICT"),
- languageModelsHome.resolve("opennlp-tok.bin")
+ languageModelsHome.resolve("opennlp-tokens.bin")
);
}
}
diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java
index 2e9c6bc4..84b9f165 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java
@@ -18,8 +18,13 @@ public class TestUtil {
@SneakyThrows
public static HikariDataSource getConnection() {
+ return getConnection("jdbc:mysql://localhost:3306/WMSA_test");
+ }
+
+ @SneakyThrows
+ public static HikariDataSource getConnection(String connString) {
HikariConfig config = new HikariConfig();
- config.setJdbcUrl("jdbc:mysql://localhost:3306/WMSA_test");
+ config.setJdbcUrl(connString);
config.setUsername("wmsa");
config.setPassword("wmsa");
config.setMaximumPoolSize(16);
@@ -29,6 +34,7 @@ public class TestUtil {
return new HikariDataSource(config);
}
+
@SneakyThrows
public static void evalScript(HikariDataSource hds, String scriptFile) {
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java
new file mode 100644
index 00000000..2670039e
--- /dev/null
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java
@@ -0,0 +1,69 @@
+package nu.marginalia.wmsa.configuration;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+class HostsFileTest {
+ Path tempFile;
+
+ @BeforeEach
+ public void setUp() throws IOException {
+ tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp");
+ }
+
+ @AfterEach
+ public void tearDown() throws IOException {
+ Files.deleteIfExists(tempFile); // remove the temp file created in setUp() so test runs do not accumulate files
+ }
+
+ @Test
+ public void testParseSunnyDay() throws IOException {
+ Files.writeString(tempFile, """
+ # Comment
+ edge-index 192.168.0.1
+ edge-search 192.168.1.1
+
+ auth 127.0.0.55
+
+
+ """);
+ var hf = new HostsFile(tempFile);
+
+ Assertions.assertEquals("192.168.0.1", hf.getHost(ServiceDescriptor.EDGE_INDEX));
+ }
+
+ @Test
+ public void testTooLong() throws IOException {
+ Files.writeString(tempFile, """
+ edge-index 192.168.0.1 this is where my homie lives
+ """);
+
+ assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
+ }
+
+ @Test
+ public void testTooShort() throws IOException {
+ Files.writeString(tempFile, """
+ edge-index
+ """);
+
+ assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
+ }
+
+ @Test
+ public void testBadName() throws IOException {
+ Files.writeString(tempFile, """
+ garum-factory 127.0.0.1
+ """);
+
+ assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile));
+ }
+}
\ No newline at end of file
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
index cf497193..180576fc 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java
@@ -42,7 +42,7 @@ class DictionaryWriterTest {
System.out.println(hitsTotal);
}
*/
- @Test @Disabled
+ @Test @Disabled @SneakyThrows
public void convert() {
new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"),
new File("/home/vlofgren/page-index-0.dat"),
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java
index abf7c4a6..f42f2d36 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java
@@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.index.service;
+import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner;
@@ -16,7 +17,7 @@ class SearchIndexConverterTest {
private final Logger logger = LoggerFactory.getLogger(getClass());
- @Test @Disabled
+ @Test @Disabled @SneakyThrows
public void test() {
// File dictFile = new File("/home/vlofgren/dictionary.dat");
File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat");
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
index 4a1e3e0d..f9cd8a6a 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java
@@ -63,7 +63,7 @@ class SearchIndexWriterTest {
return reader.findWord(block, budget, lv->true, dictionaryWriter.getReadOnly(word)).stream().toArray();
}
- @Test
+ @Test @SneakyThrows
void put() throws IOException {
writer.put(new EdgeId<>(0), new EdgeId<>(1), IndexBlock.Words, Arrays.asList("Hello", "Salvete", "everyone!", "This", "is", "Bob"));
writer.put(new EdgeId<>(0), new EdgeId<>(2), IndexBlock.Words, Arrays.asList("Salvete", "omnes!", "Bob", "sum", "Hello"));
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java
index 20d11538..ce9f59ea 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java
@@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.search.query;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels;
-import org.junit.BeforeClass;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -17,7 +17,7 @@ class BodyQueryParserTest {
private static EnglishDictionary englishDictionary;
private static final LanguageModels lm = TestLanguageModels.getLanguageModels();
- @BeforeClass
+ @BeforeAll
public static void init() {
dict = new NGramDict(lm);
englishDictionary = new EnglishDictionary(dict);
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java
index 27e3a925..9699bcf9 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java
@@ -12,8 +12,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
import nu.marginalia.wmsa.memex.system.MemexFileWriter;
import nu.marginalia.wmsa.memex.system.MemexGitRepo;
import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
-import org.junit.BeforeClass;
import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
@@ -40,7 +40,7 @@ class GemtextChangeTest {
static final Logger logger = LoggerFactory.getLogger(GemtextChangeTest.class);
- @BeforeClass
+ @BeforeAll
public static void init() {
RxJavaPlugins.setErrorHandler(e -> {
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java
index 65cff6d6..8aefc613 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java
@@ -14,8 +14,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
import nu.marginalia.wmsa.memex.system.MemexFileWriter;
import nu.marginalia.wmsa.memex.system.MemexGitRepo;
import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
-import org.junit.BeforeClass;
import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
@@ -47,7 +47,7 @@ class GemtextTaskUpdateTest {
static final Logger logger = LoggerFactory.getLogger(GemtextTaskUpdateTest.class);
- @BeforeClass
+ @BeforeAll
public static void init() {
RxJavaPlugins.setErrorHandler(e -> {
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java
index c2be2141..bfe3b104 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java
@@ -11,8 +11,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes;
import nu.marginalia.wmsa.memex.system.MemexFileWriter;
import nu.marginalia.wmsa.memex.system.MemexGitRepo;
import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem;
-import org.junit.BeforeClass;
import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
@@ -38,7 +38,7 @@ class GemtextTombstoneUpdateCaclulatorTest {
static final Logger logger = LoggerFactory.getLogger(GemtextTombstoneUpdateCaclulatorTest.class);
- @BeforeClass
+ @BeforeAll
public static void init() {
RxJavaPlugins.setErrorHandler(e -> {