From cd3cae0ad53de0a58f20f92adda1cbba22a8220c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 25 May 2022 18:02:19 +0200 Subject: [PATCH] Create first E2E-test with TestContainers --- marginalia_nu/build.gradle | 113 ++++-- marginalia_nu/data/.gitignore | 1 + marginalia_nu/data/models/.gitignore | 1 + marginalia_nu/data/test/.gitignore | 1 + .../wmsa/edge/EdgeSearchE2ETest.java | 197 +++++++++++ marginalia_nu/src/e2e/resources/crawl.sh | 78 +++++ marginalia_nu/src/e2e/resources/init.sh | 61 ++++ .../src/e2e/resources/log4j2.properties | 15 + .../src/e2e/resources/nginx/search.conf | 25 ++ .../wmsa/client/AbstractClient.java | 2 +- .../wmsa/client/AbstractDynamicClient.java | 2 +- .../wmsa/configuration/HostsFile.java | 45 +++ .../wmsa/configuration/ServiceDescriptor.java | 13 +- .../wmsa/configuration/WmsaHome.java | 17 +- .../configuration/module/DatabaseModule.java | 2 +- .../wmsa/edge/converting/LoaderMain.java | 5 +- .../edge/converting/ReindexTriggerMain.java | 81 +++++ .../converting/loader/IndexLoadKeywords.java | 4 +- .../processor/DocumentProcessor.java | 22 +- .../edge/crawling/CrawlJobExtractorMain.java | 19 +- .../wmsa/edge/crawling/CrawlerMain.java | 3 + .../crawling/retreival/CrawlerRetreiver.java | 2 +- .../wmsa/edge/index/EdgeIndexControl.java | 11 +- .../wmsa/edge/index/EdgeIndexModule.java | 10 +- .../wmsa/edge/index/IndexServicesFactory.java | 23 +- .../index/ConversionUnnecessaryException.java | 10 + .../service/index/SearchIndexConverter.java | 322 ++++++++---------- .../main/resources/sql/edge-crawler-cache.sql | 7 + marginalia_nu/src/test/java/EmptyTest.java | 8 - .../marginalia/util/TestLanguageModels.java | 16 +- .../java/nu/marginalia/util/TestUtil.java | 8 +- .../wmsa/configuration/HostsFileTest.java | 69 ++++ .../index/service/DictionaryWriterTest.java | 2 +- .../service/SearchIndexConverterTest.java | 3 +- .../index/service/SearchIndexWriterTest.java | 2 +- .../search/query/BodyQueryParserTest.java | 4 +- .../wmsa/memex/change/GemtextChangeTest.java | 4 +- .../memex/change/GemtextTaskUpdateTest.java | 4 +- .../GemtextTombstoneUpdateCaclulatorTest.java | 4 +- 39 files changed, 961 insertions(+), 255 deletions(-) create mode 100644 marginalia_nu/data/.gitignore create mode 100644 marginalia_nu/data/models/.gitignore create mode 100644 marginalia_nu/data/test/.gitignore create mode 100644 marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java create mode 100644 marginalia_nu/src/e2e/resources/crawl.sh create mode 100644 marginalia_nu/src/e2e/resources/init.sh create mode 100644 marginalia_nu/src/e2e/resources/log4j2.properties create mode 100644 marginalia_nu/src/e2e/resources/nginx/search.conf create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/HostsFile.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java delete mode 100644 marginalia_nu/src/test/java/EmptyTest.java create mode 100644 marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index 522d81b6..2fecceae 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -3,6 +3,7 @@ plugins { id "io.freefair.lombok" version "5.3.3.3" id "me.champeau.jmh" version "0.6.6" + id "de.undercouch.download" version "5.1.0" } repositories { @@ -24,6 +25,19 @@ repositories { } } +sourceSets { + e2eTest { + java { + java { + compileClasspath += main.output + test.output + runtimeClasspath += main.output + test.output + srcDir file('src/e2e/java') + } + resources.srcDir file('src/e2e/resources') + } + } +} + java { toolchain { languageVersion.set(JavaLanguageVersion.of(17)) @@ -33,16 +47,9 @@ java { dependencies { implementation project(':third_party') - implementation 'junit:junit:4.13.2' - testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' - testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' - implementation 'org.projectlombok:lombok:1.18.22' - annotationProcessor 'org.projectlombok:lombok:1.18.22' - - testCompileOnly 'org.projectlombok:lombok:1.18.22' - testImplementation 'org.projectlombok:lombok:1.18.22' - testAnnotationProcessor 'org.projectlombok:lombok:1.18.22' + implementation 'org.projectlombok:lombok:1.18.24' + annotationProcessor 'org.projectlombok:lombok:1.18.24' implementation 'com.github.jknack:handlebars:4.3.0' implementation 'com.github.jknack:handlebars-markdown:4.2.1' @@ -63,7 +70,7 @@ dependencies { implementation 'com.google.guava:guava:31.1-jre' implementation 'com.google.inject:guice:5.1.0' - implementation 'com.github.jnr:jnr-ffi:2.1.1' + implementation 'com.github.jnr:jnr-ffi:2.2.12' implementation 'org.apache.httpcomponents:httpcore:4.4.15' implementation 'org.apache.httpcomponents:httpclient:4.5.13' implementation 'com.github.ThatJavaNerd:JRAW:1.1.0' @@ -74,29 +81,23 @@ dependencies { implementation 'org.jsoup:jsoup:1.14.3' implementation group: 'com.github.crawler-commons', name: 'crawler-commons', version: '1.2' - implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.3' + implementation 'org.mariadb.jdbc:mariadb-java-client:3.0.4' implementation group: 'net.sf.trove4j', name: 'trove4j', version: '3.0.3' implementation 'com.zaxxer:HikariCP:5.0.1' - implementation 'org.apache.opennlp:opennlp-tools:1.9.4' + implementation 'org.apache.opennlp:opennlp-tools:1.9.3' implementation 'io.prometheus:simpleclient:0.15.0' implementation 'io.prometheus:simpleclient_servlet:0.15.0' implementation 'io.prometheus:simpleclient_httpserver:0.15.0' implementation 'io.prometheus:simpleclient_hotspot:0.15.0' - implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1' - implementation 'org.apache.opennlp:opennlp-tools:1.9.4' - implementation 'io.prometheus:simpleclient:0.15.0' - implementation 'io.prometheus:simpleclient_servlet:0.15.0' - implementation 'io.prometheus:simpleclient_httpserver:0.15.0' - implementation 'io.prometheus:simpleclient_hotspot:0.15.0' - implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.2.1' + implementation 'com.fasterxml.jackson.core:jackson-databind:2.13.3' implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30' implementation 'com.syncthemall:boilerpipe:1.2.2' implementation 'com.github.luben:zstd-jni:1.5.2-2' - implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.3.0' + implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0' implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14' implementation 'org.imgscalr:imgscalr-lib:4.2' @@ -111,10 +112,33 @@ dependencies { implementation 'edu.stanford.nlp:stanford-corenlp:4.4.0' implementation group: 'it.unimi.dsi', name: 'fastutil', version: '8.5.8' - implementation 'org.roaringbitmap:RoaringBitmap:[0.6,)' + implementation 'org.roaringbitmap:RoaringBitmap:0.9.27' implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29' implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0' + + testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' + testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' + testCompileOnly 'org.projectlombok:lombok:1.18.24' + testImplementation 'org.projectlombok:lombok:1.18.24' + testAnnotationProcessor 'org.projectlombok:lombok:1.18.24' + + e2eTestImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2' + e2eTestRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' + e2eTestImplementation 'org.projectlombok:lombok:1.18.24' + e2eTestAnnotationProcessor 'org.projectlombok:lombok:1.18.22' + e2eTestImplementation 'org.testcontainers:mariadb:1.17.1' + e2eTestImplementation 'org.testcontainers:nginx:1.17.1' + e2eTestImplementation 'org.testcontainers:testcontainers:1.17.1' + e2eTestImplementation "org.testcontainers:junit-jupiter:1.17.1" + e2eTestImplementation "org.testcontainers:selenium:1.17.1" + e2eTestImplementation 'org.seleniumhq.selenium:selenium-remote-driver:4.1.4' + e2eTestImplementation 'org.seleniumhq.selenium:selenium-chrome-driver:4.1.4' +} + +configurations { + e2eTestImplementation.extendsFrom(testImplementation) + } test { @@ -136,4 +160,51 @@ task dbTest(type: Test) { } } +task e2eTest(type: Test) { + maxParallelForks = 1 + forkEvery = 1 + maxHeapSize = "8G" + dependsOn ':shadowJar' + dependsOn 'downloadTestData' + dependsOn 'downloadRDRModelData' + dependsOn 'downloadSentenceModelData' + dependsOn 'downloadTokenModelData' + dependsOn 'downloadTermFreqData' + + classpath = sourceSets.e2eTest.runtimeClasspath + testClassesDirs = sourceSets.e2eTest.output.classesDirs + useJUnitPlatform { + includeTags "e2e" + } +} + +task downloadTestData(type: Download) { + src 'http://hammurabi.acc.umu.se/mirror/kiwix.org/zim/wikipedia/wikipedia_en_100_nopic_2022-05.zim' + dest file('data/test/wikipedia_en_100_nopic.zim') + overwrite false +} + +task downloadRDRModelData(type: Download) { + src (['https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT', + 'https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR']) + dest file('data/models/') + overwrite false +} + +task downloadSentenceModelData(type: Download) { + src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin' + dest file('data/models/opennlp-sentence.bin') + overwrite false +} +task downloadTokenModelData(type: Download) { + src 'https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin' + dest file('data/models/opennlp-tokens.bin') + overwrite false +} + +task downloadTermFreqData(type: Copy) { + // TODO: Need hosting for this file + from '/var/lib/wmsa/model/tfreq-new-algo3.bin' + into 'data/models' +} diff --git a/marginalia_nu/data/.gitignore b/marginalia_nu/data/.gitignore new file mode 100644 index 00000000..f59ec20a --- /dev/null +++ b/marginalia_nu/data/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/marginalia_nu/data/models/.gitignore b/marginalia_nu/data/models/.gitignore new file mode 100644 index 00000000..f59ec20a --- /dev/null +++ b/marginalia_nu/data/models/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/marginalia_nu/data/test/.gitignore b/marginalia_nu/data/test/.gitignore new file mode 100644 index 00000000..f59ec20a --- /dev/null +++ b/marginalia_nu/data/test/.gitignore @@ -0,0 +1 @@ +* \ No newline at end of file diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java new file mode 100644 index 00000000..915b1b76 --- /dev/null +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -0,0 +1,197 @@ +package nu.marginalia.wmsa.edge; + + +import nu.marginalia.wmsa.configuration.ServiceDescriptor; +import nu.marginalia.wmsa.edge.crawling.CrawlJobExtractorMain; +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.openqa.selenium.By; +import org.openqa.selenium.chrome.ChromeOptions; +import org.openzim.ZIMTypes.ZIMFile; +import org.openzim.ZIMTypes.ZIMReader; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.*; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.utility.MountableFile; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; + +import static nu.marginalia.wmsa.configuration.ServiceDescriptor.*; +import static org.testcontainers.containers.BrowserWebDriverContainer.VncRecordingMode.RECORD_ALL; + +@Tag("e2e") +@Testcontainers +public class EdgeSearchE2ETest { + Network network = Network.newNetwork(); + + @Container + public GenericContainer mariaDB = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withInitScript("sql/edge-crawler-cache.sql") + .withNetwork(network) + .withNetworkAliases("mariadb"); + + @Container + public GenericContainer searchContainer = forService(EDGE_SEARCH); + @Container + public GenericContainer assistantContainer = forService(EDGE_ASSISTANT); + @Container + public GenericContainer indexContainer = forService(EDGE_INDEX); + + @Container + public NginxContainer mockWikipedia = new NginxContainer<>("nginx:stable") + .dependsOn(searchContainer) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("wikipedia"))) + .withFileSystemBind(getWikipediaFiles(), "/usr/share/nginx/html/", BindMode.READ_ONLY) + .withNetwork(network) + .withNetworkAliases("wikipedia"); + + + @Container + public BrowserWebDriverContainer chrome = new BrowserWebDriverContainer<>() + .withNetwork(network) + .withCapabilities(new ChromeOptions()); + + @Container + public GenericContainer crawlerContainer = new GenericContainer<>("openjdk:17-alpine") + .dependsOn(mockWikipedia) + .dependsOn(indexContainer) + .withNetwork(network) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler"))) + .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) + .withCopyFileToContainer(jarFile(), "/WMSA.jar") + .withCopyFileToContainer(MountableFile.forClasspathResource("crawl.sh"), "/crawl.sh") + .withFileSystemBind(getCrawlPath().toString(), "/crawl/", BindMode.READ_WRITE) + .withCommand("sh", "crawl.sh") + .waitingFor(Wait.forLogMessage(".*ALL DONE.*", 1).withStartupTimeout(Duration.ofMinutes(10))); + + @Container + public NginxContainer proxyNginx = new NginxContainer<>("nginx:stable") + .dependsOn(searchContainer) + .dependsOn(crawlerContainer) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("nginx"))) + .withCopyFileToContainer(MountableFile.forClasspathResource("nginx/search.conf"), "/etc/nginx/conf.d/default.conf") + .withNetwork(network) + .withNetworkAliases("proxyNginx"); + ; + public GenericContainer forService(ServiceDescriptor service) { + return new GenericContainer<>("openjdk:17-alpine") + .dependsOn(mariaDB) + .withCopyFileToContainer(jarFile(), "/WMSA.jar") + .withCopyFileToContainer(MountableFile.forClasspathResource("init.sh"), "/init.sh") + .withExposedPorts(service.port) + .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) + .withNetwork(network) + .withNetworkAliases(service.name) + .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger(service.name))) + .withCommand("sh", "init.sh", service.name) + .waitingFor(Wait.forHttp("/internal/ping") + .forPort(service.port) + .withReadTimeout(Duration.ofSeconds(15))) + ; + } + + public static MountableFile jarFile() { + Path cwd = Path.of(System.getProperty("user.dir")); + + cwd = cwd.resolve(".."); + var jarFile = cwd.resolve("build/libs/wmsa-SNAPSHOT-all.jar"); + if (!Files.exists(jarFile)) { + System.err.println("Could not find jarFile " + jarFile); + throw new RuntimeException(); + } + else { + System.out.println("jar file = " + jarFile); + } + return MountableFile.forHostPath(jarFile); + } + + public static String modelsPath() { + Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models"); + if (!Files.isDirectory(modelsPath)) { + System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); + throw new RuntimeException(); + } + return modelsPath.toString(); + } + + private Path getCrawlPath() { + return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl"); + } + + private String getWikipediaFiles() { + Path wikipediaFiles = Path.of(System.getProperty("user.dir")).resolve("build/tmp/wikipedia"); + Path crawlFiles = getCrawlPath(); + Path zimFile = Path.of(System.getProperty("user.dir")).resolve("data/test/wikipedia_en_100_nopic.zim"); + + + List urls = new ArrayList<>(); + try { + Files.deleteIfExists(wikipediaFiles); + Files.createDirectories(wikipediaFiles); + Files.createDirectories(crawlFiles); + + Files.writeString(crawlFiles.resolve("crawl.plan"), """ + jobSpec: "/crawl/crawl.spec" + crawl: + dir: "/crawl/crawl" + logName: "crawl.log" + process: + dir: "/crawl/process" + logName: "process.log" + """); + + Files.createDirectories(crawlFiles.resolve("crawl")); + Files.createDirectories(crawlFiles.resolve("process")); + Files.deleteIfExists(crawlFiles.resolve("process").resolve("process.log")); + Files.deleteIfExists(crawlFiles.resolve("crawl").resolve("crawl.log")); + + var zr = new ZIMReader(new ZIMFile(zimFile.toString())); + zr.forEachArticles((url, art) -> { + urls.add("http://wikipedia/" + url + ".html"); + + if (art != null) { + try { + var doc = Jsoup.parse(art); + doc.getElementsByTag("script").remove(); + Files.writeString(wikipediaFiles.resolve(url+".html"), doc.html()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + }, pred -> true); + urls.forEach(System.out::println); + Files.writeString(wikipediaFiles.resolve("index.html"), ""); + CrawlJobExtractorMain.writeSpec(crawlFiles.resolve("crawl.spec"), "wikipedia", urls); + } + catch (IOException ex) { + ex.printStackTrace(); + } + return wikipediaFiles.toString(); + } + + @Test + public void run() { + var driver = chrome.getWebDriver(); + + driver.get("http://proxyNginx/"); + System.out.println(driver.getTitle()); + System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + + driver.get("http://proxyNginx/search?query=bird&profile=corpo"); + System.out.println(driver.getTitle()); + System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML")); + } +} diff --git a/marginalia_nu/src/e2e/resources/crawl.sh b/marginalia_nu/src/e2e/resources/crawl.sh new file mode 100644 index 00000000..411fe708 --- /dev/null +++ b/marginalia_nu/src/e2e/resources/crawl.sh @@ -0,0 +1,78 @@ +#!/bin/bash +mkdir -p /var/lib/wmsa/conf/ + +cat > /var/lib/wmsa/db.properties < /var/lib/wmsa/conf/hosts < /var/lib/wmsa/suggestions.txt < /var/lib/wmsa/db.properties < /var/lib/wmsa/conf/ranking-settings.yaml < /var/lib/wmsa/conf/hosts < hostsMap = new HashMap<>(ServiceDescriptor.values().length); + + public HostsFile(Path fileName) throws IOException { + var lines = Files.readAllLines(fileName); + for (var line : lines) { + if (line.startsWith("#") || line.isBlank()) { + continue; + } + String[] parts = line.strip().split(" "); + if (parts.length != 2) throw new IllegalArgumentException("Invalid hosts file entry " + line); + String descriptorName = parts[0]; + String hostName = parts[1]; + + try { + hostsMap.put(ServiceDescriptor.byName(descriptorName), hostName); + } + catch (IllegalArgumentException ex) { + throw new IllegalArgumentException("ServiceDescriptor " + descriptorName + " invalid"); + } + } + } + + public HostsFile() { + for (var sd : ServiceDescriptor.values()) { + hostsMap.put(sd, "localhost"); + } + } + + public String getHost(ServiceDescriptor sd) { + return hostsMap.get(sd); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java index 9bfe2a2b..165272de 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java @@ -21,6 +21,9 @@ import nu.marginalia.wmsa.resource_store.ResourceStoreMain; import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain; import org.apache.logging.log4j.core.lookup.MainMapLookup; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -49,13 +52,21 @@ public enum ServiceDescriptor { TEST_1("test-1", 0, null), TEST_2("test-2", 0, null); + private static HostsFile hostsFile; + public synchronized String getHost() { + if (hostsFile == null) { + hostsFile = WmsaHome.getHostsFile(); + } + return hostsFile.getHost(this); + } + public static ServiceDescriptor byName(String name) { for (var v : values()) { if (v.name.equals(name)) { return v; } } - throw new IllegalArgumentException(name); + throw new IllegalArgumentException("Invalid ServiceDescriptor " + name); } public final String name; public final Class mainClass; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java index f749c9a6..2a96de20 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java @@ -1,16 +1,31 @@ package nu.marginalia.wmsa.configuration; +import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; public class WmsaHome { private static final String DEFAULT = "/var/lib/wmsa"; - public static Path get() { + public static Path getHomePath() { var ret = Path.of(System.getProperty("WMSA_HOME", DEFAULT)); if (!Files.isDirectory(ret)) { throw new IllegalStateException("Could not find WMSA_HOME, either set environment variable or ensure " + DEFAULT + " exists"); } return ret; } + + public static HostsFile getHostsFile() { + Path hostsFile = getHomePath().resolve("conf/hosts"); + if (Files.isRegularFile(hostsFile)) { + try { + return new HostsFile(hostsFile); + } catch (IOException e) { + throw new RuntimeException("Failed to load hosts file " + hostsFile, e); + } + } + else { + return new HostsFile(); + } + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java index 8ce96c3a..609058ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/module/DatabaseModule.java @@ -35,7 +35,7 @@ public class DatabaseModule extends AbstractModule { } private Properties loadDbProperties() { - Path propDir = WmsaHome.get().resolve("db.properties"); + Path propDir = WmsaHome.getHomePath().resolve("db.properties"); if (!Files.isRegularFile(propDir)) { throw new IllegalStateException("Database properties file " + propDir + " does not exist"); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java index 84d44326..6fe88d08 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java @@ -33,7 +33,7 @@ public class LoaderMain { private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); private final LoaderFactory loaderFactory; private final EdgeIndexClient indexClient; - private final boolean running = true; + private volatile boolean running = true; final Thread processorThread = new Thread(this::processor, "Processor Thread"); @@ -82,8 +82,11 @@ public class LoaderMain { load(entry.path(), entry.cnt()); }); + running = false; processorThread.join(); indexClient.close(); + + System.exit(0); } private volatile static int loadTotal; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java new file mode 100644 index 00000000..050152bc --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ReindexTriggerMain.java @@ -0,0 +1,81 @@ +package nu.marginalia.wmsa.edge.converting; + +import nu.marginalia.wmsa.configuration.ServiceDescriptor; +import nu.marginalia.wmsa.configuration.module.DatabaseModule; +import okhttp3.MediaType; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import okhttp3.RequestBody; +import okio.BufferedSink; +import org.jetbrains.annotations.Nullable; + +import java.io.IOException; +import java.net.URL; +import java.nio.charset.Charset; +import java.sql.SQLException; +import java.util.concurrent.TimeUnit; + +import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH; + +public class ReindexTriggerMain { + + public static void main(String... args) throws IOException, SQLException { + var db = new DatabaseModule(); + var client = new OkHttpClient.Builder() + .connectTimeout(100, TimeUnit.MILLISECONDS) + .readTimeout(15, TimeUnit.MINUTES) + .retryOnConnectionFailure(true) + .followRedirects(true) + .build(); + + try (var ds = db.provideConnection(); var conn = ds.getConnection(); var stmt = conn.createStatement()) { + var rs = stmt.executeQuery("SELECT ID, URL_PART, STATE, INDEXED FROM EC_DOMAIN LIMIT 100"); + while (rs.next()) { + System.out.printf("%d %s %s %d\n", + rs.getInt(1), + rs.getString(2), + rs.getString(3), + rs.getInt(4)); + } + + rs = stmt.executeQuery("SELECT ID, DOMAIN_ID, URL, VISITED, STATE FROM EC_URL LIMIT 100"); + while (rs.next()) { + System.out.printf("%d %d %s %d %s\n", + rs.getInt(1), + rs.getInt(2), + rs.getString(3), + rs.getInt(4), + rs.getString(5)); + + } + + stmt.executeUpdate("INSERT IGNORE INTO DOMAIN_METADATA(ID,GOOD_URLS,KNOWN_URLS,VISITED_URLS) SELECT ID,0,0,0 FROM EC_DOMAIN WHERE INDEXED>0"); + stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED AND STATE='ok' GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET GOOD_URLS=CNT"); + stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET KNOWN_URLS=CNT"); + stmt.executeUpdate("UPDATE DOMAIN_METADATA INNER JOIN (SELECT DOMAIN_ID,COUNT(*) CNT FROM EC_URL WHERE VISITED GROUP BY DOMAIN_ID) T ON T.DOMAIN_ID=ID SET VISITED_URLS=CNT"); + } + + var rb = new RequestBody() { + + @Nullable + @Override + public MediaType contentType() { + return MediaType.parse("text/plain"); + } + + @Override + public void writeTo(BufferedSink sink) throws IOException { + sink.writeString("NOOP", Charset.defaultCharset()); + } + }; + + client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute(); + client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute(); + for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) { + client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute(); + } + + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java index d370b3c0..46d71505 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java @@ -25,6 +25,8 @@ public class IndexLoadKeywords implements Runnable { private final Thread runThread; private volatile boolean canceled = false; + private static final int index = Integer.getInteger("keyword-index", 1); + @Inject public IndexLoadKeywords(EdgeIndexClient client) { this.client = client; @@ -37,7 +39,7 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, 1).blockingSubscribe(); + client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe(); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 6f1037ba..324085fc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -79,15 +79,21 @@ public class DocumentProcessor { ret.url = new EdgeUrl(crawledDocument.url); ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); - if (ret.state == EdgeUrlState.OK && isAcceptedContentType(crawledDocument)) { - var detailsWords = createDetails(crawledDomain, crawledDocument); + if (ret.state == EdgeUrlState.OK) { - if (detailsWords.details().quality < minDocumentQuality) { - throw new DisqualifiedException(DisqualificationReason.QUALITY); + if (isAcceptedContentType(crawledDocument)) { + var detailsWords = createDetails(crawledDomain, crawledDocument); + + if (detailsWords.details().quality < minDocumentQuality) { + throw new DisqualifiedException(DisqualificationReason.QUALITY); + } + + ret.details = detailsWords.details(); + ret.words = detailsWords.words(); + } + else { + throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE); } - - ret.details = detailsWords.details(); - ret.words = detailsWords.words(); } else { throw new DisqualifiedException(DisqualificationReason.STATUS); @@ -95,7 +101,7 @@ public class DocumentProcessor { } catch (DisqualifiedException ex) { ret.state = EdgeUrlState.DISQUALIFIED; - logger.debug("Disqualified {}: {}", ret.url, ex.reason); + logger.info("Disqualified {}: {}", ret.url, ex.reason); } catch (Exception ex) { ret.state = EdgeUrlState.DISQUALIFIED; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java index c407fbe8..2f25d6d7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlJobExtractorMain.java @@ -73,7 +73,7 @@ public class CrawlJobExtractorMain { private final EdgeDomainBlacklistImpl blacklist; private final Connection conn; - private final HashFunction hasher = Hashing.murmur3_128(0); + private static final HashFunction hasher = Hashing.murmur3_128(0); public static void main(String... args) throws SQLException, IOException { Driver driver = new Driver(); @@ -97,6 +97,19 @@ public class CrawlJobExtractorMain { } } + public static void writeSpec(Path outFile, String domain, List urls) throws IOException { + Gson gson = new GsonBuilder().create(); + + try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) { + var job = new CrawlingSpecification(); + job.crawlDepth = urls.size(); + job.domain = domain; + job.id = createId(new EdgeDomain(domain)); + job.urls = urls; + out.println(gson.toJson(job)); + } + } + private record DomainWithId(String domainName, int id) {} private Stream extractDomains() { @@ -186,11 +199,11 @@ public class CrawlJobExtractorMain { return spec; } - private String createId(DomainWithId domainWithId) { + private static String createId(DomainWithId domainWithId) { return hasher.hashUnencodedChars(domainWithId.domainName).toString(); } - private String createId(EdgeDomain domain) { + private static String createId(EdgeDomain domain) { return hasher.hashUnencodedChars(domain.toString()).toString(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java index ea62e742..d81e348b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/CrawlerMain.java @@ -79,6 +79,9 @@ public class CrawlerMain implements AutoCloseable { try (var crawler = new CrawlerMain(plan)) { crawler.run(); } + + // TODO (2022-05-24): Some thread isn't set to daemon mode, need to explicitly harakiri the process, find why? + System.exit(0); } private CrawledDomain fetchDomain(CrawlingSpecification specification) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 467376f5..87d4f3df 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -20,7 +20,7 @@ import java.time.LocalDateTime; import java.util.*; public class CrawlerRetreiver { - private static final long DEFAULT_CRAWL_DELAY_MS = 1000; + private static final long DEFAULT_CRAWL_DELAY_MS = Long.getLong("defaultCrawlDelay", 1000); private final LinkedList queue = new LinkedList<>(); private final HttpFetcher fetcher; private final HashSet visited; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java index 3c65464e..b590af55 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexControl.java @@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.index; import com.google.inject.Inject; import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.service.index.ConversionUnnecessaryException; public class EdgeIndexControl { @@ -19,11 +20,15 @@ public class EdgeIndexControl { System.gc(); for (IndexBlock block : IndexBlock.values()) { + try { + servicesFactory.getIndexConverter(id, block); - servicesFactory.getIndexConverter(id, block); + System.runFinalization(); + System.gc(); + } + catch (ConversionUnnecessaryException unnecessary) { - System.runFinalization(); - System.gc(); + } } System.runFinalization(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java index d06a7b22..986f1874 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexModule.java @@ -13,12 +13,18 @@ public class EdgeIndexModule extends AbstractModule { public void configure() { - bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31); + if (Boolean.getBoolean("small-ram")) { + bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27); + } + else { + bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31); + } + } @Provides public RankingSettings rankingSettings() { - Path dir = WmsaHome.get().resolve("conf/ranking-settings.yaml"); + Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml"); return RankingSettings.from(dir); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java index 9b26989c..fb58ac0e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/IndexServicesFactory.java @@ -88,8 +88,8 @@ public class IndexServicesFactory { return new DictionaryReader(getDictionaryWriter()); } - @SneakyThrows - public SearchIndexConverter getIndexConverter(int id, IndexBlock block) { + + public SearchIndexConverter getIndexConverter(int id, IndexBlock block) throws ConversionUnnecessaryException { return new SearchIndexConverter(block, id, tmpFileDir, preconverterOutputFile.get(id), indexWriteWordsFile.get(id, block.id), @@ -146,14 +146,17 @@ public class IndexServicesFactory { public Callable switchFilesJob(int id) { return () -> { for (int block = 0; block < IndexBlock.values().length; block++) { - Files.move( - indexWriteWordsFile.get(id, block).toPath(), - indexReadWordsFile.get(id, block).toPath(), - StandardCopyOption.REPLACE_EXISTING); - Files.move( - indexWriteUrlsFile.get(id, block).toPath(), - indexReadUrlsFile.get(id, block).toPath(), - StandardCopyOption.REPLACE_EXISTING); + if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) && + Files.exists(indexWriteUrlsFile.get(id, block).toPath())) { + Files.move( + indexWriteWordsFile.get(id, block).toPath(), + indexReadWordsFile.get(id, block).toPath(), + StandardCopyOption.REPLACE_EXISTING); + Files.move( + indexWriteUrlsFile.get(id, block).toPath(), + indexReadUrlsFile.get(id, block).toPath(), + StandardCopyOption.REPLACE_EXISTING); + } } return true; }; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java new file mode 100644 index 00000000..fd7f529f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/ConversionUnnecessaryException.java @@ -0,0 +1,10 @@ +package nu.marginalia.wmsa.edge.index.service.index; + +public class ConversionUnnecessaryException extends Exception { + public ConversionUnnecessaryException() { + + } + + @Override + public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java index 95a47a69..c9b69386 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/service/index/SearchIndexConverter.java @@ -61,6 +61,7 @@ public class SearchIndexConverter { @Named("edge-index-write-urls-file") File outputFileUrls, SearchIndexPartitioner partitioner, EdgeDomainBlacklist blacklist) + throws ConversionUnnecessaryException { this.block = block; this.bucketId = bucketId; @@ -77,16 +78,21 @@ public class SearchIndexConverter { this.fileLength = raf.readLong(); this.wordCount = raf.readInt(); + if (fileLength <= FILE_HEADER_SIZE) { + throw new ConversionUnnecessaryException(); + } + var inputChannel = raf.getChannel(); ByteBuffer buffer = ByteBuffer.allocateDirect(10_000); - urlsFileSize = getUrlsSize(buffer, raf); + urlsFileSize = getUrlsSize(buffer, inputChannel); var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat"); + var urlsTmpFileRaf = new RandomAccessFile(tmpUrlsFile.toFile(), "rw"); - urlsTmpFileChannel = new RandomAccessFile(tmpUrlsFile.toFile(), "rw").getChannel(); + urlsTmpFileChannel = urlsTmpFileRaf.getChannel(); urlsTmpFileMap = new MultimapFileLong(urlsTmpFileRaf, FileChannel.MapMode.READ_WRITE, urlsFileSize, 8*1024*1024, false); urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, 1024*1024*256); @@ -114,6 +120,139 @@ public class SearchIndexConverter { } + private long getUrlsSize(ByteBuffer buffer, FileChannel channel) throws IOException { + channel.position(FILE_HEADER_SIZE); + + var reader = new IndexReader(buffer, channel) { + public long size; + + @Override + public void eachWord(long urlId, int wordId) { + size++; + } + }; + + reader.read(); + + logger.info("Blacklist filtered {} URLs", reader.filtered); + logger.debug("URLs Size {} Mb", channel.position()/(1024*1024)); + + return reader.size; + } + + private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException { + logger.debug("Table size = {}", wordIndexTable.length); + int[] wordIndex = new int[wordIndexTable.length]; + raf.seek(FILE_HEADER_SIZE); + + var channel = raf.getChannel(); + + try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) { + var reader = new IndexReader(buffer, channel) { + @Override + public void eachWord(long urlId, int wordId) throws IOException { + if (wordId >= wordIndex.length) + return; + + if (wordId != 0) { + if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) { + logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}", + wordId, + wordIndex[wordId], + wordIndexTable[wordId - 1], + wordIndexTable[wordId]); + throw new IllegalStateException(); + } + } + if (wordId > 0) { + rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId)); + } else { + rwf.put(wordIndex[wordId]++, translateUrl(urlId)); + } + } + }; + + reader.read(); + + rwf.write(urlsTmpFileChannel); + } + + urlsTmpFileChannel.force(false); + + logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024)); + + if (wordIndexTable.length > 0) { + logger.debug("Sorting urls table"); + sortUrls(wordIndexTable); + urlsTmpFileMap.force(); + } + else { + logger.warn("urls table empty -- nothing to sort"); + } + + + long idx = 0; + + try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) { + var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); + + if (wordIndexTable[0] != 0) { + int start = 0; + int end = (int) wordIndexTable[0]; + + idx += writer.write(idx, (int) wordIndexTable[0], + offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); + } + + for (int i = 1; i < wordIndexTable.length; i++) { + if (wordIndexTable[i] != wordIndexTable[i - 1]) { + long start = wordIndexTable[i-1]; + long end = wordIndexTable[i]; + + idx += writer.write(idx, (int) (end-start), + offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); + } + } + } catch (Exception e) { + e.printStackTrace(); + } + } + + @SneakyThrows + private void sortUrls(long[] wordIndices) { + urlTmpFileSorter.sort( 0, (int) wordIndices[0]); + + for (int i = 1; i < wordIndices.length; i++) { + urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1])); + } + } + + private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception { + inputChannel.position(FILE_HEADER_SIZE); + + logger.debug("Table size = {}", wordCount); + WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount); + ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE); + + logger.debug("Reading words"); + + var reader = new IndexReader(buffer, inputChannel) { + @Override + public void eachWord(long urlId, int wordId) { + wordsTableWriter.acceptWord(wordId); + } + }; + reader.read(); + + logger.debug("Rearranging table"); + + inputChannel.position(FILE_HEADER_SIZE); + + wordsTableWriter.write(outputFileWords); + + return wordsTableWriter.getTable(); + } + @RequiredArgsConstructor private class IndexReader { private final ByteBuffer buffer; @@ -193,7 +332,7 @@ public class SearchIndexConverter { public void eachUrl(Lock lock, int count, long urlId) throws IOException { for (int i = 0; i < count; i++) { int wordId = buffer.getInt(); - if (acceptWord(lock, urlId, wordId, i, block.id)) { + if (acceptWord(lock, urlId)) { eachWord(urlId, wordId); } } @@ -201,183 +340,16 @@ public class SearchIndexConverter { public void eachWord(long urlId, int wordId) throws IOException { } - } - private long getUrlsSize(ByteBuffer buffer, RandomAccessFile raf) throws IOException { - raf.seek(FILE_HEADER_SIZE); + boolean acceptWord(Lock lock, long urlId) { + int domainId = (int) (urlId >>> 32L); - var channel = raf.getChannel(); - - var reader = new IndexReader(buffer, channel) { - public long size; - - @Override - public void eachWord(long urlId, int wordId) { - size++; - } - }; - - reader.read(); - - logger.info("Blacklist filtered {} URLs", reader.filtered); - logger.debug("URLs Size {} Mb", channel.position()/(1024*1024)); - - return reader.size; - } - - private void createUrlTable(Path tmpFileDir, ByteBuffer buffer, RandomAccessFile raf, long[] wordIndexTable) throws IOException { - logger.debug("Table size = {}", wordIndexTable.length); - int[] wordIndex = new int[wordIndexTable.length]; - raf.seek(FILE_HEADER_SIZE); - - var channel = raf.getChannel(); - - try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, urlsFileSize, 10_000_000)) { - var reader = new IndexReader(buffer, channel) { - @Override - public void eachWord(long urlId, int wordId) throws IOException { - if (wordId >= wordIndex.length) - return; - - if (wordId != 0) { - if (!(wordIndexTable[wordId - 1] + wordIndex[wordId] <= wordIndexTable[wordId])) { - logger.error("Crazy state: wordId={}, index={}, lower={}, upper={}", - wordId, - wordIndex[wordId], - wordIndexTable[wordId - 1], - wordIndexTable[wordId]); - throw new IllegalStateException(); - } - } - if (wordId > 0) { - rwf.put(wordIndexTable[wordId - 1] + wordIndex[wordId]++, translateUrl(urlId)); - } else { - rwf.put(wordIndex[wordId]++, translateUrl(urlId)); - } - } - }; - - reader.read(); - - rwf.write(urlsTmpFileChannel); - } - - urlsTmpFileChannel.force(false); - - logger.debug("URL TMP Table: {} Mb", channel.position()/(1024*1024)); - - if (wordIndexTable.length > 0) { - logger.debug("Sorting urls table"); - sortUrls(wordIndexTable); - urlsTmpFileMap.force(); - } - else { - logger.warn("urls table empty -- nothing to sort"); - } - - - long idx = 0; - - var copyBuffer = ByteBuffer.allocateDirect(4096); - try (var urlsFileMap = MultimapFileLong.forOutput(urlsFile.toPath(), 1024)) { - var writer = new BTreeWriter(urlsFileMap, urlsBTreeContext); - - if (wordIndexTable[0] != 0) { - int start = 0; - int end = (int) wordIndexTable[0]; - - idx += writer.write(idx, (int) wordIndexTable[0], - offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); + if (!partitioner.filterUnsafe(lock, domainId, bucketId)) { + return false; } - for (int i = 1; i < wordIndexTable.length; i++) { - if (wordIndexTable[i] != wordIndexTable[i - 1]) { - long start = wordIndexTable[i-1]; - long end = wordIndexTable[i]; - - idx += writer.write(idx, (int) (end-start), - offset -> urlsFileMap.transferFromFileChannel(urlsTmpFileChannel, offset, start, end)); - } - } - } catch (Exception e) { - e.printStackTrace(); + return true; } - - logger.warn("BTrees generated"); - } - - public void transfer(ByteBuffer buffer, MultimapFileLong dest, FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException { - int tbw = 0; - - buffer.limit(Math.min(buffer.capacity(), (int)(sourceEnd - sourceStart)*8)); - while (sourceEnd - sourceStart - tbw > buffer.limit()/8) { - int bw = 0; - while (buffer.position() < buffer.limit()) { - int r = sourceChannel.read(buffer, sourceStart*8 + bw); - if (r < 0) { - throw new IOException(""); - } - bw += r; - } - buffer.flip(); - dest.write(buffer.asLongBuffer(), destOffset + tbw); - tbw += bw/8; - buffer.clear(); - buffer.limit(Math.min(buffer.capacity(), (int)(sourceEnd*8 - sourceStart*8 - tbw))); - } - buffer.clear(); - buffer.limit((int)(sourceEnd - (sourceStart + tbw))*8); - int bw = 0; - while (bw < buffer.limit()) { - bw += sourceChannel.read(buffer, sourceStart + bw); - } - buffer.flip(); - dest.write(buffer.asLongBuffer(), destOffset + tbw); - } - - @SneakyThrows - private void sortUrls(long[] wordIndices) { - urlTmpFileSorter.sort( 0, (int) wordIndices[0]); - - for (int i = 1; i < wordIndices.length; i++) { - urlTmpFileSorter.sort(wordIndices[i-1], (int) (wordIndices[i] - wordIndices[i-1])); - } - } - - private long[] createWordIndexTable(File outputFileWords, FileChannel inputChannel) throws Exception { - inputChannel.position(FILE_HEADER_SIZE); - - logger.debug("Table size = {}", wordCount); - WordsTableWriter wordsTableWriter = new WordsTableWriter(wordCount); - ByteBuffer buffer = ByteBuffer.allocateDirect(8*SearchIndexWriterImpl.MAX_BLOCK_SIZE); - - logger.debug("Reading words"); - - var reader = new IndexReader(buffer, inputChannel) { - @Override - public void eachWord(long urlId, int wordId) { - wordsTableWriter.acceptWord(wordId); - } - }; - reader.read(); - - logger.debug("Rearranging table"); - - inputChannel.position(FILE_HEADER_SIZE); - - wordsTableWriter.write(outputFileWords); - - return wordsTableWriter.getTable(); - } - - boolean acceptWord(Lock lock, long urlId, int wordId, int wordIdx, int block) { - int domainId = (int) (urlId >>> 32L); - - if (!partitioner.filterUnsafe(lock, domainId, bucketId)) { - return false; - } - - return true; } } diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index 2f706bba..1460375c 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -18,7 +18,14 @@ DROP VIEW IF EXISTS EC_URL_PART_HASH; DROP TABLE IF EXISTS EC_URL_WORD; DROP TABLE IF EXISTS EC_DICTIONARY; +DROP TABLE IF EXISTS DOMAIN_METADATA; +CREATE TABLE IF NOT EXISTS DOMAIN_METADATA ( + ID INT PRIMARY KEY, + KNOWN_URLS INT DEFAULT 0, + VISITED_URLS INT DEFAULT 0, + GOOD_URLS INT DEFAULT 0 +); CREATE TABLE IF NOT EXISTS EC_TOP_DOMAIN ( ID INT PRIMARY KEY AUTO_INCREMENT, diff --git a/marginalia_nu/src/test/java/EmptyTest.java b/marginalia_nu/src/test/java/EmptyTest.java deleted file mode 100644 index e789f2cf..00000000 --- a/marginalia_nu/src/test/java/EmptyTest.java +++ /dev/null @@ -1,8 +0,0 @@ -import org.junit.jupiter.api.Test; - -public class EmptyTest { - @Test - public void test() { - - } -} diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java index 2d83c3c9..2340492e 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestLanguageModels.java @@ -1,5 +1,6 @@ package nu.marginalia.util; +import nu.marginalia.wmsa.configuration.WmsaHome; import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels; import java.nio.file.Files; @@ -7,10 +8,9 @@ import java.nio.file.Path; import java.util.Optional; public class TestLanguageModels { - private static final Path LANGUAGE_MODELS_DEFAULT = Path.of("/home/vlofgren/Work/ngrams/"); - - public static LanguageModels getLanguageModels() { + private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model"); + public static Path getLanguageModelsPath() { final Path languageModelsHome = Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME")) .map(Path::of) .orElse(LANGUAGE_MODELS_DEFAULT); @@ -18,14 +18,20 @@ public class TestLanguageModels { if (!Files.isDirectory(languageModelsHome)) { throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md"); } + return languageModelsHome; + } + + public static LanguageModels getLanguageModels() { + + var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( languageModelsHome.resolve("ngrams-generous-emstr.bin"), languageModelsHome.resolve("tfreq-generous-emstr.bin"), - languageModelsHome.resolve("opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"), + languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), - languageModelsHome.resolve("opennlp-tok.bin") + languageModelsHome.resolve("opennlp-tokens.bin") ); } } diff --git a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java index 2e9c6bc4..84b9f165 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java +++ b/marginalia_nu/src/test/java/nu/marginalia/util/TestUtil.java @@ -18,8 +18,13 @@ public class TestUtil { @SneakyThrows public static HikariDataSource getConnection() { + return getConnection("jdbc:mysql://localhost:3306/WMSA_test"); + } + + @SneakyThrows + public static HikariDataSource getConnection(String connString) { HikariConfig config = new HikariConfig(); - config.setJdbcUrl("jdbc:mysql://localhost:3306/WMSA_test"); + config.setJdbcUrl(connString); config.setUsername("wmsa"); config.setPassword("wmsa"); config.setMaximumPoolSize(16); @@ -29,6 +34,7 @@ public class TestUtil { return new HikariDataSource(config); } + @SneakyThrows public static void evalScript(HikariDataSource hds, String scriptFile) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java new file mode 100644 index 00000000..2670039e --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/configuration/HostsFileTest.java @@ -0,0 +1,69 @@ +package nu.marginalia.wmsa.configuration; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.assertThrows; + +class HostsFileTest { + Path tempFile; + + @BeforeEach + public void setUp() throws IOException { + tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp"); + } + + @AfterEach + public void tearDown() throws IOException { + tempFile = Files.createTempFile(getClass().getSimpleName(), ".tmp"); + } + + @Test + public void testParseSunnyDay() throws IOException { + Files.writeString(tempFile, """ + # Comment + edge-index 192.168.0.1 + edge-search 192.168.1.1 + + auth 127.0.0.55 + + + """); + var hf = new HostsFile(tempFile); + + Assertions.assertEquals("192.168.0.1", hf.getHost(ServiceDescriptor.EDGE_INDEX)); + } + + @Test + public void testTooLong() throws IOException { + Files.writeString(tempFile, """ + edge-index 192.168.0.1 this is where my homie lives + """); + + assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile)); + } + + @Test + public void testTooShort() throws IOException { + Files.writeString(tempFile, """ + edge-index + """); + + assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile)); + } + + @Test + public void testBadName() throws IOException { + Files.writeString(tempFile, """ + garum-factory 127.0.0.1 + """); + + assertThrows(IllegalArgumentException.class, () -> new HostsFile(tempFile)); + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java index cf497193..180576fc 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/DictionaryWriterTest.java @@ -42,7 +42,7 @@ class DictionaryWriterTest { System.out.println(hitsTotal); } */ - @Test @Disabled + @Test @Disabled @SneakyThrows public void convert() { new SearchIndexConverter(IndexBlock.Title, 0, Path.of("/tmp"), new File("/home/vlofgren/page-index-0.dat"), diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java index abf7c4a6..f42f2d36 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexConverterTest.java @@ -1,5 +1,6 @@ package nu.marginalia.wmsa.edge.index.service; +import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.service.index.SearchIndexConverter; import nu.marginalia.wmsa.edge.index.service.query.SearchIndexPartitioner; @@ -16,7 +17,7 @@ class SearchIndexConverterTest { private final Logger logger = LoggerFactory.getLogger(getClass()); - @Test @Disabled + @Test @Disabled @SneakyThrows public void test() { // File dictFile = new File("/home/vlofgren/dictionary.dat"); File inFile = new File("/home/vlofgren/Work/converter/3/page-index.dat"); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java index 4a1e3e0d..f9cd8a6a 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexWriterTest.java @@ -63,7 +63,7 @@ class SearchIndexWriterTest { return reader.findWord(block, budget, lv->true, dictionaryWriter.getReadOnly(word)).stream().toArray(); } - @Test + @Test @SneakyThrows void put() throws IOException { writer.put(new EdgeId<>(0), new EdgeId<>(1), IndexBlock.Words, Arrays.asList("Hello", "Salvete", "everyone!", "This", "is", "Bob")); writer.put(new EdgeId<>(0), new EdgeId<>(2), IndexBlock.Words, Arrays.asList("Salvete", "omnes!", "Bob", "sum", "Hello")); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java index 20d11538..ce9f59ea 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/BodyQueryParserTest.java @@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.search.query; import nu.marginalia.util.TestLanguageModels; import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.crawler.domain.language.conf.LanguageModels; -import org.junit.BeforeClass; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -17,7 +17,7 @@ class BodyQueryParserTest { private static EnglishDictionary englishDictionary; private static final LanguageModels lm = TestLanguageModels.getLanguageModels(); - @BeforeClass + @BeforeAll public static void init() { dict = new NGramDict(lm); englishDictionary = new EnglishDictionary(dict); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java index 27e3a925..9699bcf9 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextChangeTest.java @@ -12,8 +12,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; import nu.marginalia.wmsa.memex.system.MemexFileWriter; import nu.marginalia.wmsa.memex.system.MemexGitRepo; import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; -import org.junit.BeforeClass; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; @@ -40,7 +40,7 @@ class GemtextChangeTest { static final Logger logger = LoggerFactory.getLogger(GemtextChangeTest.class); - @BeforeClass + @BeforeAll public static void init() { RxJavaPlugins.setErrorHandler(e -> { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java index 65cff6d6..8aefc613 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTaskUpdateTest.java @@ -14,8 +14,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; import nu.marginalia.wmsa.memex.system.MemexFileWriter; import nu.marginalia.wmsa.memex.system.MemexGitRepo; import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; -import org.junit.BeforeClass; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; @@ -47,7 +47,7 @@ class GemtextTaskUpdateTest { static final Logger logger = LoggerFactory.getLogger(GemtextTaskUpdateTest.class); - @BeforeClass + @BeforeAll public static void init() { RxJavaPlugins.setErrorHandler(e -> { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java index c2be2141..bfe3b104 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/memex/change/GemtextTombstoneUpdateCaclulatorTest.java @@ -11,8 +11,8 @@ import nu.marginalia.wmsa.memex.system.MemexFileSystemModifiedTimes; import nu.marginalia.wmsa.memex.system.MemexFileWriter; import nu.marginalia.wmsa.memex.system.MemexGitRepo; import nu.marginalia.wmsa.memex.system.MemexSourceFileSystem; -import org.junit.BeforeClass; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; @@ -38,7 +38,7 @@ class GemtextTombstoneUpdateCaclulatorTest { static final Logger logger = LoggerFactory.getLogger(GemtextTombstoneUpdateCaclulatorTest.class); - @BeforeClass + @BeforeAll public static void init() { RxJavaPlugins.setErrorHandler(e -> {