diff --git a/code/common/config/java/nu/marginalia/WmsaHome.java b/code/common/config/java/nu/marginalia/WmsaHome.java
index 91fe49d4..4a10951e 100644
--- a/code/common/config/java/nu/marginalia/WmsaHome.java
+++ b/code/common/config/java/nu/marginalia/WmsaHome.java
@@ -75,6 +75,11 @@ public class WmsaHome {
         return ret;
     }
 
+    /** Returns the {@code data} directory below the WMSA home path. */
+    public static Path getDataPath() {
+        return getHomePath().resolve("data");
+    }
+
     public static Path getAdsDefinition() {
         return getHomePath().resolve("data").resolve("adblock.txt");
     }
diff --git a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
index ba10c54e..d826a24b 100644
--- a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
+++ b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java
@@ -3,22 +3,35 @@ package nu.marginalia.control;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
+import nu.marginalia.WmsaHome;
 import nu.marginalia.service.MainClass;
-import nu.marginalia.service.discovery.ServiceRegistryIf;
-import nu.marginalia.service.module.ServiceConfiguration;
-import nu.marginalia.service.module.ServiceDiscoveryModule;
 import nu.marginalia.service.ServiceId;
-import nu.marginalia.service.module.ServiceConfigurationModule;
+import nu.marginalia.service.discovery.ServiceRegistryIf;
 import nu.marginalia.service.module.DatabaseModule;
+import nu.marginalia.service.module.ServiceConfiguration;
+import nu.marginalia.service.module.ServiceConfigurationModule;
+import nu.marginalia.service.module.ServiceDiscoveryModule;
 import nu.marginalia.service.server.Initialization;
 
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.Collection;
+import java.util.List;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.ZipFile;
+
 public class ControlMain extends MainClass {
 
     @Inject
     public ControlMain(ControlService service) {
     }
 
-    public static void main(String... args) {
+    public static void main(String... args) throws Exception {
         init(ServiceId.Control, args);
 
         Injector injector = Guice.createInjector(
@@ -30,9 +43,132 @@ public class ControlMain extends MainClass {
         // Orchestrate the boot order for the services
         var registry = injector.getInstance(ServiceRegistryIf.class);
         var configuration = injector.getInstance(ServiceConfiguration.class);
+
+        // This must be run before orchestrateBoot, so that the other services don't
+        // start up until we're done
+        downloadAncillaryFiles(WmsaHome.getDataPath());
+
         orchestrateBoot(registry, configuration);
 
+
         injector.getInstance(ControlMain.class);
         injector.getInstance(Initialization.class).setReady();
     }
+
+    /**
+     * Fetches the ancillary data files into {@code dataPath}, skipping each
+     * file that is already present there.
+     *
+     * @param dataPath directory the files are stored in
+     * @throws Exception if a download or the archive extraction fails
+     */
+    static void downloadAncillaryFiles(Path dataPath) throws Exception {
+        Path adblockFile = dataPath.resolve("adblock.txt");
+        if (!Files.exists(adblockFile)) {
+            download(adblockFile, new URI("https://downloads.marginalia.nu/data/adblock.txt"));
+        }
+
+        Path suggestionsFile = dataPath.resolve("suggestions.txt");
+        if (!Files.exists(suggestionsFile)) {
+            downloadGzipped(suggestionsFile, new URI("https://downloads.marginalia.nu/data/suggestions.txt.gz"));
+        }
+
+        Path asnRawData = dataPath.resolve("asn-data-raw-table");
+        if (!Files.exists(asnRawData)) {
+            download(asnRawData, new URI("https://thyme.apnic.net/current/data-raw-table"));
+        }
+
+        Path asnUsedAutnums = dataPath.resolve("asn-used-autnums");
+        if (!Files.exists(asnUsedAutnums)) {
+            download(asnUsedAutnums, new URI("https://thyme.apnic.net/current/data-used-autnums"));
+        }
+
+        Path ip2Location = dataPath.resolve("IP2LOCATION-LITE-DB1.CSV");
+        Path ip2LocationZip = dataPath.resolve("IP2LOCATION-LITE-DB1.CSV.ZIP");
+
+        if (!Files.exists(ip2Location)) {
+            // A leftover archive may be incomplete, so always fetch a fresh one
+            Files.deleteIfExists(ip2LocationZip);
+
+            download(ip2LocationZip, new URI("https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP"));
+            try {
+                unzip(ip2LocationZip, dataPath, List.of("IP2LOCATION-LITE-DB1.CSV", "README_LITE.TXT", "LICENSE-CC-BY-SA-4.0.TXT"));
+            }
+            finally {
+                // Don't leave the archive behind, even if the extraction failed
+                Files.deleteIfExists(ip2LocationZip);
+            }
+        }
+    }
+
+    /**
+     * Downloads {@code source} to {@code dest}; does nothing if {@code dest} already exists.
+     *
+     * @throws IOException if the transfer fails; no file is left at {@code dest} in that case
+     */
+    private static void download(Path dest, URI source) throws IOException {
+        if (Files.exists(dest)) {
+            return;
+        }
+
+        System.out.println("Downloading " + source + " to " + dest);
+        try (var in = new BufferedInputStream(source.toURL().openStream())) {
+            copyToFile(in, dest);
+        }
+    }
+
+    /**
+     * Downloads the gzip-compressed {@code source} and stores it decompressed at {@code dest};
+     * does nothing if {@code dest} already exists.
+     *
+     * @throws IOException if the transfer or decompression fails; no file is left at {@code dest} in that case
+     */
+    private static void downloadGzipped(Path dest, URI source) throws IOException {
+        if (Files.exists(dest)) {
+            return;
+        }
+
+        System.out.println("Downloading " + source + " to " + dest);
+        // Two resources, so the connection is closed even if the gzip header is invalid
+        try (var raw = new BufferedInputStream(source.toURL().openStream());
+             var in = new GZIPInputStream(raw)) {
+            copyToFile(in, dest);
+        }
+    }
+
+    /**
+     * Writes {@code in} to a temporary sibling of {@code dest} and moves it into place once
+     * complete, so an interrupted transfer never leaves a truncated file at {@code dest}.
+     */
+    private static void copyToFile(InputStream in, Path dest) throws IOException {
+        Path partial = dest.resolveSibling(dest.getFileName() + ".part");
+        try {
+            Files.copy(in, partial, StandardCopyOption.REPLACE_EXISTING);
+            Files.move(partial, dest, StandardCopyOption.ATOMIC_MOVE);
+        }
+        finally {
+            Files.deleteIfExists(partial);
+        }
+    }
+
+    /**
+     * Extracts the entries of {@code inputZip} whose names are listed in {@code fileNames}
+     * into {@code outputDir}, overwriting files left behind by an earlier attempt.
+     */
+    private static void unzip(Path inputZip, Path outputDir, Collection<String> fileNames) throws IOException {
+        try (ZipFile zipFile = new ZipFile(inputZip.toFile())) {
+            var entries = zipFile.entries();
+            while (entries.hasMoreElements()) {
+                var entry = entries.nextElement();
+                if (!fileNames.contains(entry.getName())) {
+                    continue;
+                }
+
+                System.out.println("Extracting " + entry.getName());
+                try (var in = zipFile.getInputStream(entry)) {
+                    Files.copy(in, outputDir.resolve(entry.getName()), StandardCopyOption.REPLACE_EXISTING);
+                }
+            }
+        }
+    }
 }
diff --git a/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java b/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java
new file mode 100644
index 00000000..aca73872
--- /dev/null
+++ b/code/services-core/control-service/test/nu/marginalia/control/ControlMainTest.java
@@ -0,0 +1,37 @@
+package nu.marginalia.control;
+
+import nu.marginalia.test.TestUtil;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+class ControlMainTest {
+
+    @Test
+    @Disabled("We don't want to rudely hammer 3rd party services with chonky downloads on every build")
+    void downloadAncillaryFiles() throws Exception {
+        Path tempDir = Files.createTempDirectory("test");
+
+        try {
+            ControlMain.downloadAncillaryFiles(tempDir);
+
+            Assertions.assertTrue(Files.exists(tempDir.resolve("adblock.txt")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("suggestions.txt")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("asn-data-raw-table")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("asn-used-autnums")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("LICENSE-CC-BY-SA-4.0.TXT")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("README_LITE.TXT")));
+            Assertions.assertTrue(Files.exists(tempDir.resolve("IP2LOCATION-LITE-DB1.CSV")));
+
+            // We don't want to leave a mess
+            Assertions.assertFalse(Files.exists(tempDir.resolve("IP2LOCATION-LITE-DB1.CSV.ZIP")));
+        }
+        finally {
+            // Clean up even when an assertion fails
+            TestUtil.clearTempDir(tempDir);
+        }
+    }
+}
\ No newline at end of file
diff --git a/run/setup.sh b/run/setup.sh
index 1e9fc1b4..3517411a 100755
--- a/run/setup.sh
+++ b/run/setup.sh
@@ -72,16 +72,4 @@ download_model model/segments.bin https://downloads.marginalia.nu/model/segments
 download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin a38f0809f983723001dfc784d88ebb6d
 download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz 340156704bb8c8e50c4abf35a7ec2569
 
-download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
-unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP
-
-download_model data/asn-data-raw-table https://thyme.apnic.net/current/data-raw-table
-download_model data/asn-used-autnums https://thyme.apnic.net/current/data-used-autnums
-
-download_model data/adblock.txt https://downloads.marginalia.nu/data/adblock.txt
-if [ ! -f data/suggestions.txt ]; then
-  download_model data/suggestions.txt.gz https://downloads.marginalia.nu/data/suggestions.txt.gz
-  gunzip data/suggestions.txt.gz
-fi
-
 popd