Move IP Location database out of classpath and into WMSA_HOME/data

This commit is contained in:
vlofgren 2022-05-27 14:27:44 +02:00
parent 056dec5506
commit 61ef2b06b0
5 changed files with 34 additions and 6 deletions

View File

@ -170,6 +170,7 @@ task e2eTest(type: Test) {
dependsOn 'downloadSentenceModelData' dependsOn 'downloadSentenceModelData'
dependsOn 'downloadTokenModelData' dependsOn 'downloadTokenModelData'
dependsOn 'downloadTermFreqData' dependsOn 'downloadTermFreqData'
dependsOn 'IP2LocationFile'
classpath = sourceSets.e2eTest.runtimeClasspath classpath = sourceSets.e2eTest.runtimeClasspath
testClassesDirs = sourceSets.e2eTest.output.classesDirs testClassesDirs = sourceSets.e2eTest.output.classesDirs
@ -201,10 +202,23 @@ task downloadTokenModelData(type: Download) {
dest file('data/models/opennlp-tokens.bin') dest file('data/models/opennlp-tokens.bin')
overwrite false overwrite false
} }
// Fetches the zipped IP2Location LITE DB1 CSV database and stores the archive
// under data/models.
task downloadIP2LocationFile(type: Download) {
    src('https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP')
    dest(file('data/models/IP2LOCATION-LITE-DB1.CSV.ZIP'))
    // Overwriting is switched off, so an archive that is already present is kept.
    overwrite(false)
}
// Unpacks the IP2Location archive fetched by downloadIP2LocationFile into
// data/models/IP2LOC.
task IP2LocationFile(type: Copy) {
    // The archive has to exist before it can be unpacked.
    dependsOn('downloadIP2LocationFile')

    from(zipTree(file('data/models/IP2LOCATION-LITE-DB1.CSV.ZIP')))
    into(file("data/models/IP2LOC"))
}
task downloadTermFreqData(type: Copy) { task downloadTermFreqData(type: Copy) {
// TODO: Need hosting for this file // TODO: Need hosting for this file
from '/var/lib/wmsa/model/tfreq-new-algo3.bin' from '/var/lib/wmsa/model/tfreq-new-algo3.bin'
into 'data/models' into 'data/models/'
} }

View File

@ -72,6 +72,7 @@ public class EdgeSearchE2ETest {
.withNetwork(network) .withNetwork(network)
.withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler"))) .withLogConsumer(new Slf4jLogConsumer(LoggerFactory.getLogger("crawler")))
.withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY) .withFileSystemBind(modelsPath(), "/var/lib/wmsa/model", BindMode.READ_ONLY)
.withCopyFileToContainer(ipDatabasePath(), "/var/lib/wmsa/data/IP2LOCATION-LITE-DB1.CSV")
.withCopyFileToContainer(jarFile(), "/WMSA.jar") .withCopyFileToContainer(jarFile(), "/WMSA.jar")
.withCopyFileToContainer(MountableFile.forClasspathResource("crawl.sh"), "/crawl.sh") .withCopyFileToContainer(MountableFile.forClasspathResource("crawl.sh"), "/crawl.sh")
.withFileSystemBind(getCrawlPath().toString(), "/crawl/", BindMode.READ_WRITE) .withFileSystemBind(getCrawlPath().toString(), "/crawl/", BindMode.READ_WRITE)
@ -127,6 +128,14 @@ public class EdgeSearchE2ETest {
} }
return modelsPath.toString(); return modelsPath.toString();
} }
/**
 * Locates the IP2Location CSV database on the host so it can be copied into a container.
 *
 * <p>The file is expected at {@code data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV},
 * resolved against the {@code user.dir} system property.
 *
 * @return a mountable handle to the host-side CSV file
 * @throws IllegalStateException if no regular file exists at the expected location
 */
public static MountableFile ipDatabasePath() {
    Path ipDatabase = Path.of(System.getProperty("user.dir"))
            .resolve("data/models/IP2LOC/IP2LOCATION-LITE-DB1.CSV");
    if (!Files.isRegularFile(ipDatabase)) {
        // Put the searched location in the exception itself so the failure is
        // self-explanatory wherever the stack trace ends up.
        throw new IllegalStateException(
                "Could not find IP location database, looked in " + ipDatabase.toAbsolutePath());
    }
    return MountableFile.forHostPath(ipDatabase.toString());
}
private Path getCrawlPath() { private Path getCrawlPath() {
return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl"); return Path.of(System.getProperty("user.dir")).resolve("build/tmp/crawl");

View File

@ -1,5 +1,7 @@
#!/bin/bash #!/bin/bash
mkdir -p /var/lib/wmsa/conf/ mkdir -p /var/lib/wmsa/conf/
mkdir -p /var/lib/wmsa/data/
cat > /var/lib/wmsa/db.properties <<EOF cat > /var/lib/wmsa/db.properties <<EOF
db.user=wmsa db.user=wmsa

View File

@ -28,4 +28,8 @@ public class WmsaHome {
return new HostsFile(); return new HostsFile();
} }
} }
/**
 * Returns the expected location of the IP2Location CSV database:
 * {@code data/IP2LOCATION-LITE-DB1.CSV} beneath the path given by {@code getHomePath()}.
 *
 * <p>This method only builds the path; it does not check that the file exists.
 *
 * @return path to the IP location database file
 */
public static Path getIPLocationDatabse() {
    final Path dataDir = getHomePath().resolve("data");
    return dataDir.resolve("IP2LOCATION-LITE-DB1.CSV");
}
} }

View File

@ -6,10 +6,12 @@ import com.google.inject.Singleton;
import com.opencsv.CSVReader; import com.opencsv.CSVReader;
import com.opencsv.exceptions.CsvValidationException; import com.opencsv.exceptions.CsvValidationException;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.net.InetAddress; import java.net.InetAddress;
@ -24,8 +26,6 @@ public class GeoIpBlocklist {
private final Set<String> blacklist = Set.of("CN", "HK"); private final Set<String> blacklist = Set.of("CN", "HK");
private final Set<String> graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA"); private final Set<String> graylist = Set.of("RU", "TW", "IN", "ZA", "SG", "UA");
private final Cache<String, String> countryCache = CacheBuilder.newBuilder().maximumSize(100_000).build();
private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class); private static final Logger logger = LoggerFactory.getLogger(GeoIpBlocklist.class);
@AllArgsConstructor @AllArgsConstructor
@ -36,10 +36,9 @@ public class GeoIpBlocklist {
} }
public GeoIpBlocklist() throws IOException, CsvValidationException { public GeoIpBlocklist() throws IOException, CsvValidationException {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("IP2LOCATION-LITE-DB1.CSV"), var resource = WmsaHome.getIPLocationDatabse();
"Could not load IP location db");
try (var reader = new CSVReader(new InputStreamReader(resource, StandardCharsets.UTF_8))) { try (var reader = new CSVReader(new FileReader(resource.toFile()))) {
for (;;) { for (;;) {
String[] vals = reader.readNext(); String[] vals = reader.readNext();
if (vals == null) { if (vals == null) {