diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index 2fecceae..b2115fb0 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -170,6 +170,7 @@ task e2eTest(type: Test) { dependsOn 'downloadSentenceModelData' dependsOn 'downloadTokenModelData' dependsOn 'downloadTermFreqData' + dependsOn 'IP2LocationFile' classpath = sourceSets.e2eTest.runtimeClasspath testClassesDirs = sourceSets.e2eTest.output.classesDirs @@ -201,10 +202,23 @@ task downloadTokenModelData(type: Download) { dest file('data/models/opennlp-tokens.bin') overwrite false } +task downloadIP2LocationFile(type: Download) { + src 'https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP' + dest file('data/models/IP2LOCATION-LITE-DB1.CSV.ZIP') + overwrite false +} +task IP2LocationFile(type: Copy) { + dependsOn 'downloadIP2LocationFile' + def zipFile = file('data/models/IP2LOCATION-LITE-DB1.CSV.ZIP') + def outputDir = file("data/models/IP2LOC") + + from zipTree(zipFile) + into outputDir +} task downloadTermFreqData(type: Copy) { // TODO: Need hosting for this file from '/var/lib/wmsa/model/tfreq-new-algo3.bin' - into 'data/models' + into 'data/models/' } diff --git a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java index b5bf4c45..8c68b272 100644 --- a/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java +++ b/marginalia_nu/src/e2e/java/nu/marginalia/wmsa/edge/EdgeSearchE2ETest.java @@ -72,6 +72,7 @@ public class EdgeSearchE2ETest { .withNetwork(network) .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler"))) .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) + .withCopyFileToContainer(ipDatabasePath(), "/var/lib/wmsa/data/IP2LOCATION-LITE-DB1.CSV") .withCopyFileToContainer(jarFile(), "/WMSA.jar") .withCopyFileToContainer(MountableFile.forClasspathResource("crawl.sh"), "/crawl.sh") .withFileSystemBind(getCrawlPath().toString(), "/crawl/", BindMode.READ_WRITE) @@ -127,6 +128,14 @@ public class EdgeSearchE2ETest { } return modelsPath.toString(); } + public static MountableFile ipDatabasePath() { + Path modelsPath = Path.of(System.getProperty("user.dir")).resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV"); + if (!Files.isRegularFile(modelsPath)) { + System.err.println("Could not find models, looked in " + modelsPath.toAbsolutePath()); + throw new RuntimeException(); + } + return MountableFile.forHostPath(modelsPath.toString()); + } private Path getCrawlPath() { return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl"); diff --git a/marginalia_nu/src/e2e/resources/crawl.sh b/marginalia_nu/src/e2e/resources/crawl.sh index e8fa729a..bf503759 100644 --- a/marginalia_nu/src/e2e/resources/crawl.sh +++ b/marginalia_nu/src/e2e/resources/crawl.sh @@ -1,5 +1,7 @@ #!/bin/bash + mkdir -p /var/lib/wmsa/conf/ +mkdir -p /var/lib/wmsa/data/ cat > /var/lib/wmsa/db.properties < blacklist = Set.of("CN", "HK"); private final Set graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA"); - private final Cache countryCache = CacheBuilder.newBuilder().maximumSize(100_000).build(); - private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class); @AllArgsConstructor @@ -36,10 +36,9 @@ public class GeoIpBlocklist { } public GeoIpBlocklist() throws IOException, CsvValidationException { - var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("IP2LOCATION-LITE-DB1.CSV"), - "Could not load IP location db"); + var resource = WmsaHome.getIPLocationDatabse(); - try (var reader = new CSVReader(new InputStreamReader(resource, StandardCharsets.UTF_8))) { + try (var reader = new CSVReader(new FileReader(resource.toFile()))) { for (;;) { String[] vals = reader.readNext(); if (vals == null) {