From 76e9053dd0ea19bc931a1438b761198ab67205c4 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Wed, 6 Nov 2024 15:28:20 +0100
Subject: [PATCH] (setup) Move some file-downloads from setup script to the
 first boot of the control node of the system

We can only do this for files that are not required for unit tests.

As it is illegal to run more than one instance of the control service, this
should be fine with regard to race conditions. The boot orchestration will
also ensure that no other services will boot up before the downloading is
complete.
---
Review notes for this revision:
 - download(), downloadGzipped() and unzip() now write to a ".part" sibling
   and rename it into place, so an interrupted boot cannot leave a truncated
   file that later boots would mistake for a finished download.
 - unzip() replaces files that already exist instead of failing with
   FileAlreadyExistsException, closes every entry stream, and takes a
   Collection<String> instead of a raw Collection.
 - downloadGzipped() closes the network stream even if the gzip header is bad.
 - The test removes its temp directory in a finally block.
 - The "index" lines still carry the blob ids of the previous revision.

 .../config/java/nu/marginalia/WmsaHome.java   |   5 +
 .../nu/marginalia/control/ControlMain.java    | 123 +++++++++++++++++-
 .../marginalia/control/ControlMainTest.java   |  36 +++++
 run/setup.sh                                  |  12 --
 4 files changed, 159 insertions(+), 17 deletions(-)
 create mode 100644 code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java

diff --git a/code/common/config/java/nu/marginalia/WmsaHome.java b/code/common/config/java/nu/marginalia/WmsaHome.java
index 91fe49d4..4a10951e 100644
--- a/code/common/config/java/nu/marginalia/WmsaHome.java
+++ b/code/common/config/java/nu/marginalia/WmsaHome.java
@@ -75,6 +75,11 @@ public class WmsaHome {
         return ret;
     }
 
+    /** Returns the "data" directory located under the path given by getHomePath(). */
+    public static Path getDataPath() {
+        return getHomePath().resolve("data");
+    }
+
     public static Path getAdsDefinition() {
         return getHomePath().resolve("data").resolve("adblock.txt");
     }
diff --git a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
index ba10c54e..d826a24b 100644
--- a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
+++ b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
@@ -3,22 +3,35 @@ package nu.marginalia.control;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
+import nu.marginalia.WmsaHome;
 import nu.marginalia.service.MainClass;
-import nu.marginalia.service.discovery.ServiceRegistryIf;
-import nu.marginalia.service.module.ServiceConfiguration;
-import nu.marginalia.service.module.ServiceDiscoveryModule;
 import nu.marginalia.service.ServiceId;
-import nu.marginalia.service.module.ServiceConfigurationModule;
+import nu.marginalia.service.discovery.ServiceRegistryIf;
 import nu.marginalia.service.module.DatabaseModule;
+import nu.marginalia.service.module.ServiceConfiguration;
+import nu.marginalia.service.module.ServiceConfigurationModule;
+import nu.marginalia.service.module.ServiceDiscoveryModule;
 import nu.marginalia.service.server.Initialization;
 
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.Collection;
+import java.util.List;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.ZipFile;
+
 public class ControlMain extends MainClass {
 
     @Inject
     public ControlMain(ControlService service) {
     }
 
-    public static void main(String... args) {
+    public static void main(String... args) throws Exception {
         init(ServiceId.Control, args);
 
         Injector injector = Guice.createInjector(
@@ -30,9 +43,109 @@ public class ControlMain extends MainClass {
         // Orchestrate the boot order for the services
         var registry = injector.getInstance(ServiceRegistryIf.class);
         var configuration = injector.getInstance(ServiceConfiguration.class);
+
+        // This must be run before orchestrateBoot, so that the other services don't
+        // start up until we're done
+        downloadAncillaryFiles(WmsaHome.getDataPath());
+
         orchestrateBoot(registry, configuration);
 
         injector.getInstance(ControlMain.class);
         injector.getInstance(Initialization.class).setReady();
     }
+
+    /**
+     * Downloads the ancillary data files that are not yet present in the data directory.
+     * Files that already exist are left as they are.
+     *
+     * @param dataPath directory the files are stored in
+     * @throws Exception if a file cannot be downloaded or extracted
+     */
+    static void downloadAncillaryFiles(Path dataPath) throws Exception {
+        download(dataPath.resolve("adblock.txt"),
+                new URI("https://downloads.marginalia.nu/data/adblock.txt"));
+
+        downloadGzipped(dataPath.resolve("suggestions.txt"),
+                new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz"));
+
+        download(dataPath.resolve("asn-data-raw-table"),
+                new URI("https://thyme.apnic.net/current/data-raw-table"));
+
+        download(dataPath.resolve("asn-used-autnums"),
+                new URI("https://thyme.apnic.net/current/data-used-autnums"));
+
+        Path ip2Location = dataPath.resolve("IP2LOCATION-LITE-DB1.CSV");
+        Path ip2LocationZip = dataPath.resolve("IP2LOCATION-LITE-DB1.CSV.ZIP");
+
+        if (!Files.exists(ip2Location)) {
+            // Start from a clean slate if an earlier attempt left the archive behind
+            Files.deleteIfExists(ip2LocationZip);
+
+            download(ip2LocationZip, new URI("https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP"));
+            unzip(ip2LocationZip, dataPath, List.of("IP2LOCATION-LITE-DB1.CSV", "README_LITE.TXT", "LICENSE-CC-BY-SA-4.0.TXT"));
+            Files.deleteIfExists(ip2LocationZip);
+        }
+    }
+
+    /** Downloads source to dest; does nothing if dest already exists. */
+    private static void download(Path dest, URI source) throws IOException {
+        if (Files.exists(dest)) {
+            return;
+        }
+
+        System.out.println("Downloading " + source + " to " + dest);
+        try (var in = new BufferedInputStream(source.toURL().openStream())) {
+            writeAtomically(in, dest);
+        }
+    }
+
+    /** Like download(), but gunzips the stream on its way to dest. */
+    private static void downloadGzipped(Path dest, URI source) throws IOException {
+        if (Files.exists(dest)) {
+            return;
+        }
+
+        System.out.println("Downloading " + source + " to " + dest);
+        try (var raw = new BufferedInputStream(source.toURL().openStream());
+             var in = new GZIPInputStream(raw)) {
+            writeAtomically(in, dest);
+        }
+    }
+
+    /**
+     * Extracts the entries whose names are listed in fileNames from inputZip into outputDir,
+     * replacing files that are already there. Names missing from the archive are skipped.
+     */
+    private static void unzip(Path inputZip, Path outputDir, Collection<String> fileNames) throws IOException {
+        try (ZipFile zipFile = new ZipFile(inputZip.toFile())) {
+            var entries = zipFile.entries();
+            while (entries.hasMoreElements()) {
+                var entry = entries.nextElement();
+                if (!fileNames.contains(entry.getName())) {
+                    continue;
+                }
+
+                System.out.println("Extracting " + entry.getName());
+                try (var in = zipFile.getInputStream(entry)) {
+                    writeAtomically(in, outputDir.resolve(entry.getName()));
+                }
+            }
+        }
+    }
+
+    /**
+     * Writes the stream to a ".part" file next to dest and renames it into place once it is
+     * complete. The mere existence of dest is what marks it as downloaded on later boots,
+     * so a transfer that is cut short must never leave a file behind under that name.
+     */
+    private static void writeAtomically(InputStream in, Path dest) throws IOException {
+        Path partial = dest.resolveSibling(dest.getFileName() + ".part");
+        try {
+            Files.copy(in, partial, StandardCopyOption.REPLACE_EXISTING);
+            Files.move(partial, dest, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
+        }
+        finally {
+            Files.deleteIfExists(partial);
+        }
+    }
 }
diff --git a/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java b/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java
new file mode 100644
index 00000000..aca73872
--- /dev/null
+++ b/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java
@@ -0,0 +1,36 @@
+package nu.marginalia.control;
+
+import nu.marginalia.test.TestUtil;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+class ControlMainTest {
+
+    @Test
+    @Disabled("We don't want to rudely hammer 3rd party services with chonky downloads on every build")
+    void downloadAncillaryFiles() throws Exception {
+        Path tempDir = Files.createTempDirectory("test");
+
+        try {
+            ControlMain.downloadAncillaryFiles(tempDir);
+
+            Assertions.assertTrue(Files.exists(tempDir.resolve("adblock.txt")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("suggestions.txt")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("asn-data-raw-table")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("asn-used-autnums")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("LICENSE-CC-BY-SA-4.0.TXT")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("README_LITE.TXT")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("IP2LOCATION-LITE-DB1.CSV")));
+
+            // We don't want to leave a mess
+            Assertions.assertFalse(Files.exists(tempDir.resolve("IP2LOCATION-LITE-DB1.CSV.ZIP")));
+        }
+        finally {
+            TestUtil.clearTempDir(tempDir);
+        }
+    }
+}
\ No newline at end of file
diff --git a/run/setup.sh b/run/setup.sh
index 1e9fc1b4..3517411a 100755
--- a/run/setup.sh
+++ b/run/setup.sh
@@ -72,16 +72,4 @@ download_model model/segments.bin https://downloads.marginalia.nu/model/segments
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin a38f0809f983723001dfc784d88ebb6d
 download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz 340156704bb8c8e50c4abf35a7ec2569
 
-download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
-unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP
-
-download_model data/asn-data-raw-table https://thyme.apnic.net/current/data-raw-table
-download_model data/asn-used-autnums https://thyme.apnic.net/current/data-used-autnums
-
-download_model data/adblock.txt https://downloads.marginalia.nu/data/adblock.txt
-if [ ! -f data/suggestions.txt ]; then
-  download_model data/suggestions.txt.gz https://downloads.marginalia.nu/data/suggestions.txt.gz
-  gunzip data/suggestions.txt.gz
-fi
-
 popd