diff --git a/code/common/config/java/nu/marginalia/WmsaHome.java b/code/common/config/java/nu/marginalia/WmsaHome.java index 91fe49d4..4a10951e 100644 --- a/code/common/config/java/nu/marginalia/WmsaHome.java +++ b/code/common/config/java/nu/marginalia/WmsaHome.java @@ -75,6 +75,10 @@ public class WmsaHome { return ret; } + /** Returns the {@code data} directory, resolved directly under the path given by {@code getHomePath()}. */ public static Path getDataPath() { + return getHomePath().resolve("data"); + } + public static Path getAdsDefinition() { return getHomePath().resolve("data").resolve("adblock.txt"); } diff --git a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java index ba10c54e..d826a24b 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java +++ b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java @@ -3,22 +3,33 @@ package nu.marginalia.control; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.WmsaHome; import nu.marginalia.service.MainClass; -import nu.marginalia.service.discovery.ServiceRegistryIf; -import nu.marginalia.service.module.ServiceConfiguration; -import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; -import nu.marginalia.service.module.ServiceConfigurationModule; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.service.module.ServiceConfigurationModule; +import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.server.Initialization; +import java.io.BufferedInputStream; +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Collection; +import java.util.List; +import java.util.zip.GZIPInputStream; +import
java.util.zip.ZipFile; + public class ControlMain extends MainClass { @Inject public ControlMain(ControlService service) { } - public static void main(String... args) { + public static void main(String... args) throws Exception { init(ServiceId.Control, args); Injector injector = Guice.createInjector( @@ -30,9 +41,97 @@ public class ControlMain extends MainClass { // Orchestrate the boot order for the services var registry = injector.getInstance(ServiceRegistryIf.class); var configuration = injector.getInstance(ServiceConfiguration.class); + + // This must be run before orchestrateBoot, so that the other services don't + // start up until we're done + downloadAncillaryFiles(WmsaHome.getDataPath()); + orchestrateBoot(registry, configuration); + injector.getInstance(ControlMain.class); injector.getInstance(Initialization.class).setReady(); } + + static void downloadAncillaryFiles(Path dataPath) throws Exception { + Path adblockFile = dataPath.resolve("adblock.txt"); + if (!Files.exists(adblockFile)) { + download(adblockFile, new URI("https://downloads.marginalia.nu/data/adblock.txt")); + } + + Path suggestionsFile = dataPath.resolve("suggestions.txt"); + if (!Files.exists(suggestionsFile)) { + downloadGzipped(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz")); + } + + Path asnRawData = dataPath.resolve("asn-data-raw-table"); + if (!Files.exists(asnRawData)) { + download(asnRawData, new URI("https://thyme.apnic.net/current/data-raw-table")); + } + + Path asnUsedAutnums = dataPath.resolve("asn-used-autnums"); + if (!Files.exists(asnUsedAutnums)) { + download(asnUsedAutnums, new URI("https://thyme.apnic.net/current/data-used-autnums")); + } + + Path ip2Location = dataPath.resolve("IP2LOCATION-LITE-DB1.CSV"); + Path ip2LocationZip = dataPath.resolve("IP2LOCATION-LITE-DB1.CSV.ZIP"); + + if (!Files.exists(ip2Location)) { + if (Files.exists(ip2LocationZip)) { + Files.delete(ip2LocationZip); + } + + download(ip2LocationZip, new 
URI("https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP")); + unzip(ip2LocationZip, dataPath, List.of("IP2LOCATION-LITE-DB1.CSV", "README_LITE.TXT", "LICENSE-CC-BY-SA-4.0.TXT")); + Files.deleteIfExists(ip2LocationZip); + } + } + + private static void download(Path dest, URI source) throws IOException { + System.out.println("Downloading " + source + " to " + dest); + try { + if (!Files.exists(dest)) { + try (var in = new BufferedInputStream(source.toURL().openStream())) { + Files.copy(in, dest); + } + } + } + catch (IOException e) { + Files.deleteIfExists(dest); + throw e; + } + } + + private static void downloadGzipped(Path dest, URI source) throws IOException { + System.out.println("Downloading " + source + " to " + dest); + try { + if (!Files.exists(dest)) { + try (var in = new GZIPInputStream(new BufferedInputStream(source.toURL().openStream()))) { + Files.copy(in, dest); + } + } + } + catch (IOException e) { + Files.deleteIfExists(dest); + throw e; + } + } + + + private static void unzip(Path inputZip, Path outputDir, Collection fileNames) throws IOException { + try (ZipFile zipFile = new ZipFile(inputZip.toFile())) { + zipFile.stream().forEach(entry -> { + try { + if (fileNames.contains(entry.getName())) { + System.out.println("Extracting " + entry.getName()); + Files.copy(zipFile.getInputStream(entry), outputDir.resolve(entry.getName())); + } + } + catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } } diff --git a/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java b/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java new file mode 100644 index 00000000..aca73872 --- /dev/null +++ b/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java @@ -0,0 +1,33 @@ +package nu.marginalia.control; + +import nu.marginalia.test.TestUtil; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; +import 
org.junit.jupiter.api.Test; + +import java.nio.file.Files; +import java.nio.file.Path; + +/** Test for {@code ControlMain.downloadAncillaryFiles}; its only test method is disabled (see the reason given on {@code @Disabled}). */ class ControlMainTest { + + /** Runs downloadAncillaryFiles against a fresh temp directory, then asserts that every expected data file exists and that the intermediate IP2LOCATION zip does not. NOTE(review): TestUtil.clearTempDir is only reached when all assertions pass, so a failing run leaves the temp directory behind; consider try/finally. */ @Test + @Disabled("We don't want to rudely hammer 3rd party services with chonky downloads on every build") + void downloadAncillaryFiles() throws Exception { + Path tempDir = Files.createTempDirectory("test"); + + ControlMain.downloadAncillaryFiles(tempDir); + + Assertions.assertTrue(Files.exists(tempDir.resolve("adblock.txt"))); + Assertions.assertTrue(Files.exists(tempDir.resolve("suggestions.txt"))); + Assertions.assertTrue(Files.exists(tempDir.resolve("asn-data-raw-table"))); + Assertions.assertTrue(Files.exists(tempDir.resolve("asn-used-autnums"))); + Assertions.assertTrue(Files.exists(tempDir.resolve("LICENSE-CC-BY-SA-4.0.TXT"))); + Assertions.assertTrue(Files.exists(tempDir.resolve("README_LITE.TXT"))); + Assertions.assertTrue(Files.exists(tempDir.resolve("IP2LOCATION-LITE-DB1.CSV"))); + + // We don't want to leave a mess + Assertions.assertFalse(Files.exists(tempDir.resolve("IP2LOCATION-LITE-DB1.CSV.ZIP"))); + + TestUtil.clearTempDir(tempDir); + } +} \ No newline at end of file diff --git a/run/install-noninteractive.sh b/run/install-noninteractive.sh new file mode 100755 index 00000000..063c8a7a --- /dev/null +++ b/run/install-noninteractive.sh @@ -0,0 +1,114 @@ +#!/bin/bash + +### Settings that are passed as environment variables: + +# $INSTANCE_TYPE can be 1, 2, 3, or 4 + +# * 1: barebones interface (1 index node) +# * 2: barebones interface (2 index nodes) +# * 3: full Marginalia Search production-like instance +# * 4: non-docker install + +INSTANCE_TYPE=${INSTANCE_TYPE:-4} + +# $MARIADB_USER and $MARIADB_PASSWORD are used to set up the MariaDB database, +# by default we use the user 'marginalia' and a randomly generated strong password + +MARIADB_USER=${MARIADB_USER:-marginalia} +MARIADB_PASSWORD=${MARIADB_PASSWORD:-$(tr -dc 'A-Za-z0-9' < /dev/urandom | head -c 32)} + +### + +## Check for envsubst +if !
command -v envsubst &> /dev/null +then + echo "The envsubst command could not be found, please install it. It is usually part of GNU gettext." + exit +fi + +## Move to the directory of the script +pushd $(dirname $0) + + +## Check for the install directory +INSTALL_DIR=$(realpath ${1}) + +if [ -z ${INSTALL_DIR} ]; then + echo "Usage: $0 " + exit 1 +fi + +mkdir -p ${INSTALL_DIR} + +echo "** Copying files to ${INSTALL_DIR}" + +for dir in model data conf conf/properties env; do + if [ ! -d ${dir} ]; then + echo "ERROR: ${dir} does not exist" + exit 1 + fi + echo "Copying ${dir}/" + mkdir -p ${INSTALL_DIR}/${dir} + find ${dir} -maxdepth 1 -type f -exec cp -v {} ${INSTALL_DIR}/{} \; +done + +# for barebones, tell the control service to hide the marginalia app specific stuff +if [ "${INSTANCE_TYPE}" == "1" ]; then + echo "control.hideMarginaliaApp=true" > ${INSTALL_DIR}/conf/properties/control-service.properties +elif [ "${INSTANCE_TYPE}" == "2" ]; then + echo "control.hideMarginaliaApp=true" > ${INSTALL_DIR}/conf/properties/control-service.properties +elif [ "${INSTANCE_TYPE}" == "4" ]; then + echo "control.hideMarginaliaApp=true" > ${INSTALL_DIR}/conf/properties/control-service.properties + # (leading with a blank newline is important, as we cannot trust that the source file ends with a new-line) + cat >>${INSTALL_DIR}/conf/properties/system.properties < ${INSTALL_DIR}/env/mariadb.env +envsubst < install/db.properties.template > ${INSTALL_DIR}/conf/db.properties + +echo "** Creating docker-compose.yml" + +## Hack to get around envstubst substituting these values, which we want to be verbatim +export uval="\$\$MARIADB_USER" +export pval="\$\$MARIADB_PASSWORD" + +export INSTALL_DIR + +if [ "${INSTANCE_TYPE}" == "1" ]; then + envsubst < install/barebones-1/docker-compose.yml.template >${INSTALL_DIR}/docker-compose.yml +elif [ "${INSTANCE_TYPE}" == "2" ]; then + envsubst < install/barebones-2/docker-compose.yml.template >${INSTALL_DIR}/docker-compose.yml +elif [ 
"${INSTANCE_TYPE}" == "3" ]; then + envsubst < install/marginalia-prod-like/docker-compose.yml.template >${INSTALL_DIR}/docker-compose.yml +elif [ "${INSTANCE_TYPE}" == "4" ]; then + envsubst < install/no-docker/docker-compose.yml.template >${INSTALL_DIR}/docker-compose.yml + cp install/no-docker/README ${INSTALL_DIR}/README + + echo + echo "=====" + cat ${INSTALL_DIR}/README + echo + echo "=====" + echo "To read this again, look in ${INSTALL_DIR}/README" + echo +fi + +popd \ No newline at end of file diff --git a/run/install.sh b/run/install.sh index 636ee473..7a5a44e4 100755 --- a/run/install.sh +++ b/run/install.sh @@ -19,12 +19,6 @@ set -e -if ! command -v envsubst &> /dev/null -then - echo "The envsubst command could not be found, please install it. It is usually part of GNU gettext." - exit -fi - if [ -z "${1}" ]; then echo "Usage: $0 " exit 1 @@ -41,7 +35,7 @@ echo echo "1) barebones instance (1 node)" echo "2) barebones instance (2 nodes)" echo "3) full Marginalia Search instance?" -echo "4) non-docker install? (proof-of-concept, not recommended)" +echo "4) non-docker install? (not recommended)" echo read -p "Enter 1, 2, 3, or 4: " INSTANCE_TYPE @@ -57,10 +51,13 @@ echo echo "We're going to set up a Mariadb database in docker, please enter some details" read -p "MariaDB user (e.g. marginalia): " MARIADB_USER -read -s -p "MariaDB password (e.g. hunter2 ;-): " MARIADB_PASSWORD -echo -read -s -p "MariaDB password (again): " MARIADB_PASSWORD2 +read -s -p "MariaDB password (e.g. hunter2, or leave blank to generate one): " MARIADB_PASSWORD echo +if [ ! 
-z "${MARIADB_PASSWORD}" ]; then + echo + read -s -p "MariaDB password (again): " MARIADB_PASSWORD2 + echo +fi export MARIADB_USER export MARIADB_PASSWORD @@ -79,130 +76,10 @@ echo echo "Will install to ${INSTALL_DIR}" read -p "Press enter to continue, or Ctrl-C to abort" -pushd $(dirname $0) - -./setup.sh ## Ensure that the setup script has been run - -mkdir -p ${INSTALL_DIR} - -echo "** Copying files to ${INSTALL_DIR}" - -for dir in model data conf conf/properties env; do - if [ ! -d ${dir} ]; then - echo "ERROR: ${dir} does not exist" - exit 1 - fi - echo "Copying ${dir}/" - mkdir -p ${INSTALL_DIR}/${dir} - find ${dir} -maxdepth 1 -type f -exec cp -v {} ${INSTALL_DIR}/{} \; -done - -# for barebones, tell the control service to hide the marginalia app specific stuff -if [ "${INSTANCE_TYPE}" == "1" ]; then - echo "control.hideMarginaliaApp=true" > ${INSTALL_DIR}/conf/properties/control-service.properties -elif [ "${INSTANCE_TYPE}" == "2" ]; then - echo "control.hideMarginaliaApp=true" > ${INSTALL_DIR}/conf/properties/control-service.properties -elif [ "${INSTANCE_TYPE}" == "4" ]; then - echo "control.hideMarginaliaApp=true" > ${INSTALL_DIR}/conf/properties/control-service.properties - # (leading with a blank newline is important, as we cannot trust that the source file ends with a new-line) - cat >>${INSTALL_DIR}/conf/properties/system.properties < ${INSTALL_DIR}/env/mariadb.env -envsubst < install/db.properties.template > ${INSTALL_DIR}/conf/db.properties - -echo "** Creating docker-compose.yml" - -## Hack to get around envstubst substituting these values, which we want to be verbatim -export uval="\$\$MARIADB_USER" -export pval="\$\$MARIADB_PASSWORD" - export INSTALL_DIR +export INSTANCE_TYPE +export MARIADB_USER +export MARIADB_PASSWORD +export MARIADB_HOST -if [ "${INSTANCE_TYPE}" == "1" ]; then - envsubst < install/docker-compose-barebones-1.yml.template >${INSTALL_DIR}/docker-compose.yml -elif [ "${INSTANCE_TYPE}" == "2" ]; then - envsubst < 
install/docker-compose-barebones-2.yml.template >${INSTALL_DIR}/docker-compose.yml -elif [ "${INSTANCE_TYPE}" == "3" ]; then - envsubst < install/docker-compose-marginalia.yml.template >${INSTALL_DIR}/docker-compose.yml -elif [ "${INSTANCE_TYPE}" == "4" ]; then - envsubst < install/docker-compose-scaffold.yml.template >${INSTALL_DIR}/docker-compose.yml - -cat < ${INSTALL_DIR}/README -Quick note about running Marginalia Search in a non-docker environment. - -Beware that this installation mode is more of a proof-of-concept and demonstration that the -system is not unhealthily dependent on docker, than a production-ready setup, and is not -recommended for production use! The container setup is much more robust and easier to manage. - -Note: This script only sets up an install directory, and does not build the system. -You will need to build the system with "gradlew assemble" before you can run it. - -Each service is spawned by the same launcher. After building the project with -"gradlew assemble", the launcher is put in "code/services-core/single-service-runner/build/distributions/marginalia.tar". -This needs to be extracted! - -Note: The template sets up a sample (in-docker) setup for mariadb and zookeeper. These can also be run outside -of docker, but you will need to update the db.properties file and "zookeeper-hosts" in the system.properties -file to point to the correct locations/addresses. - -Running: - -To launch a process you need to unpack it, and then run the launcher with the -appropriate arguments. For example: - -WMSA_HOME=/path/to/install/dir marginalia control:1 127.0.0.1:7000:7001 127.0.0.2 - -This command will start the control partition 1 on ports 7000 (HTTP) and 7001 (GRPC), -bound to 127.0.0.1, and it will announce its presence to the local zookeeper -instance on 127.0.0.2. 
- -A working setup needs at all the services - -* control [ http port is the control GUI ] -* query [ http port is the query GUI ] -* index [ http port is internal ] -* executor [ http port is internal ] - -Since you will need to manage ports yourself, you must assign distinct ports-pairs to each service. - -* An index and executor services should exist on the same partition e.g. index:1 and executor:1. The partition -number is the last digit of the service name, and should be positive. You can have multiple pairs of index -and executor partitions, but the pair should run on the same physical machine with the same install directory. - -* The query service can use any partition number. - -* The control service should be on partition 1. - -EOF - -echo -echo "=====" -cat ${INSTALL_DIR}/README -echo -echo "=====" -echo "To read this again, look in ${INSTALL_DIR}/README" -echo -fi - -popd \ No newline at end of file +bash $(dirname $0)/install-noninteractive.sh ${INSTALL_DIR} \ No newline at end of file diff --git a/run/install/docker-compose-barebones-1.yml.template b/run/install/barebones-1/docker-compose.yml.template similarity index 100% rename from run/install/docker-compose-barebones-1.yml.template rename to run/install/barebones-1/docker-compose.yml.template diff --git a/run/install/docker-compose-barebones-2.yml.template b/run/install/barebones-2/docker-compose.yml.template similarity index 100% rename from run/install/docker-compose-barebones-2.yml.template rename to run/install/barebones-2/docker-compose.yml.template diff --git a/run/install/docker-compose-marginalia.yml.template b/run/install/marginalia-prod-like/docker-compose.yml.template similarity index 100% rename from run/install/docker-compose-marginalia.yml.template rename to run/install/marginalia-prod-like/docker-compose.yml.template diff --git a/run/install/no-docker/README b/run/install/no-docker/README new file mode 100644 index 00000000..7894855a --- /dev/null +++ b/run/install/no-docker/README 
@@ -0,0 +1,44 @@ +Quick note about running Marginalia Search in a non-docker environment. + +Beware that this installation mode is more of a proof-of-concept and demonstration that the +system is not unhealthily dependent on docker, than a production-ready setup, and is not +recommended for production use! The container setup is much more robust and easier to manage. + +Note: This script only sets up an install directory, and does not build the system. +You will need to build the system with "gradlew assemble" before you can run it. + +Each service is spawned by the same launcher. After building the project with +"gradlew assemble", the launcher is put in "code/services-core/single-service-runner/build/distributions/marginalia.tar". +This needs to be extracted! + +Note: The template sets up a sample (in-docker) setup for mariadb and zookeeper. These can also be run outside +of docker, but you will need to update the db.properties file and "zookeeper-hosts" in the system.properties +file to point to the correct locations/addresses. + +Running: + +To launch a process you need to unpack it, and then run the launcher with the +appropriate arguments. For example: + +WMSA_HOME=/path/to/install/dir marginalia control:1 127.0.0.1:7000:7001 127.0.0.2 + +This command will start the control partition 1 on ports 7000 (HTTP) and 7001 (GRPC), +bound to 127.0.0.1, and it will announce its presence to the local zookeeper +instance on 127.0.0.2. + +A working setup needs all of the following services: + +* control [ http port is the control GUI ] +* query [ http port is the query GUI ] +* index [ http port is internal ] +* executor [ http port is internal ] + +Since you will need to manage ports yourself, you must assign distinct port pairs to each service. + +* An index service and an executor service should exist on the same partition, e.g. index:1 and executor:1. The partition +number is the last digit of the service name, and should be positive.
You can have multiple pairs of index +and executor partitions, but the pair should run on the same physical machine with the same install directory. + +* The query service can use any partition number. + +* The control service should be on partition 1. diff --git a/run/install/docker-compose-scaffold.yml.template b/run/install/no-docker/docker-compose.yml.template similarity index 100% rename from run/install/docker-compose-scaffold.yml.template rename to run/install/no-docker/docker-compose.yml.template diff --git a/run/prometheus.yml b/run/install/prometheus.yml similarity index 100% rename from run/prometheus.yml rename to run/install/prometheus.yml diff --git a/run/setup.sh b/run/setup.sh index 1e9fc1b4..d2a46834 100755 --- a/run/setup.sh +++ b/run/setup.sh @@ -31,7 +31,7 @@ function download_model { if [ ! -f $model ]; then echo "** $model absent, downloading $url" - curl -s -o $model.tmp $url + curl -L --progress-bar -o $model.tmp $url mv $model.tmp $model fi } @@ -64,24 +64,14 @@ else mkdir -p {node-1,node-2}/{work,index,backup,samples/export,uploads} fi -download_model model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.DICT -download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR -download_model model/opennlp-sentence.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin -download_model model/opennlp-tokens.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin -download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin a2650796c77968b1bd9db0d7c01e3150 -download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin a38f0809f983723001dfc784d88ebb6d -download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz 340156704bb8c8e50c4abf35a7ec2569 +download_model 
model/English.DICT https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/e0fa60db14eae90b66dc67691f0f519eb19e3e66/Models/POS/English.DICT 356d96a8832b62eb5e0ddac6f0301ada +download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/e0fa60db14eae90b66dc67691f0f519eb19e3e66/Models/POS/English.RDR bec40a1160e12c33a1dd0563677104e4 -download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP -unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP +download_model model/opennlp-sentence.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin 5965ada99a2ca77beb8632bb47741b7a +download_model model/opennlp-tokens.bin https://downloads.apache.org/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin f097e14bce9edb3f558f6aaf2c3f7622 -download_model data/asn-data-raw-table https://thyme.apnic.net/current/data-raw-table -download_model data/asn-used-autnums https://thyme.apnic.net/current/data-used-autnums - -download_model data/adblock.txt https://downloads.marginalia.nu/data/adblock.txt -if [ ! -f data/suggestions.txt ]; then - download_model data/suggestions.txt.gz https://downloads.marginalia.nu/data/suggestions.txt.gz - gunzip data/suggestions.txt.gz -fi +download_model model/segments.bin https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/segments.bin?download=true a2650796c77968b1bd9db0d7c01e3150 +download_model model/tfreq-new-algo3.bin https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/tfreq-new-algo3.bin?download=true a38f0809f983723001dfc784d88ebb6d +download_model model/lid.176.ftz https://huggingface.co/MarginaliaNu/MarginaliaModelData/resolve/c9339e4224f1dfad7f628809c32687e748198ae3/lid.176.ftz?download=true 340156704bb8c8e50c4abf35a7ec2569 popd