Merge pull request #101 from MarginaliaSearch/security-scan

Address security scan findings
2025-02-23 21:18:58 +00:00 · 2024-06-17 13:18:36 +02:00 · 2024-06-17 13:18:36 +02:00 · 69f88255e9
commit 69f88255e9
parent 67703e2274 08ff79827e
9 changed files with 39 additions and 18 deletions
--- a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
+++ b/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
@ -6,6 +6,7 @@ import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.DriverManager;
@ -24,6 +25,10 @@ public class AnchorTagsImpl implements AnchorTagsSource {

        logger.info("Loading atags from " + atagsPath);

+        if (!Files.exists(atagsPath)) {
+            throw new IllegalArgumentException("atags file does not exist: " + atagsPath);
+        }
+
        try (var stmt = duckdbConnection.createStatement()) {
            // Insert the domains into a temporary table, then use that to filter the atags table

@ -35,13 +40,18 @@ public class AnchorTagsImpl implements AnchorTagsSource {
                }
            }

-            // Project the atags table down to only the relevant domains.  This looks like an SQL injection
-            // vulnerability if you're a validation tool, but the string comes from a trusted source.
+            // This looks like a SQL injection vulnerability to a validation tool, but the string comes from a trusted source
+            // -- we validate nonetheless to present a better error message
+            String path = atagsPath.toAbsolutePath().toString();
+            if (path.contains("'")) {
+                throw new IllegalArgumentException("atags file path contains a single quote: " + path + " and would break the query.");
+            }
+
            stmt.executeUpdate("""
                create table atags as 
                    select * from '%s'  
                    where dest in (select * from domains)
-                """.formatted(atagsPath.toAbsolutePath()));
+                """.formatted(path));

            // Free up the memory used by the domains table
            stmt.executeUpdate("drop table domains");
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java
@ -32,7 +32,7 @@ public class NoSecuritySSL {
    @SneakyThrows
    public static SSLSocketFactory buildSocketFactory() {
        // Install the all-trusting trust manager
-        final SSLContext sslContext = SSLContext.getInstance("SSL");
+        final SSLContext sslContext = SSLContext.getInstance("TLS");
        sslContext.init(null, trustAllCerts, new java.security.SecureRandom());

        var clientSessionContext = sslContext.getClientSessionContext();
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java
@ -8,7 +8,7 @@ import java.security.NoSuchAlgorithmException;
 class WarcDigestBuilder {
    private final MessageDigest digest;

-    private static final String digestAlgorithm = "SHA-1";
+    private static final String digestAlgorithm = "SHA-256";

    public WarcDigestBuilder() throws NoSuchAlgorithmException {
        this.digest = MessageDigest.getInstance(digestAlgorithm);
--- a/code/services-application/search-service/resources/static/search/tts.js
+++ b/code/services-application/search-service/resources/static/search/tts.js
@ -27,7 +27,7 @@ function setupTypeahead() {

                for (i=0;i<items.length;i++) {
                    item = document.createElement('a');
-                    item.innerHTML=items[i];
+                    item.textContent=items[i];
                    item.setAttribute('href', '#')

                    function suggestionClickHandler(e) {
--- a/run/install.sh
+++ b/run/install.sh
@ -184,13 +184,16 @@ A working setup needs at all the services
 * index [ http port is internal ]
 * executor [ http port is internal ]

-The index and executor services should be on the same partition e.g. index:1 and executor:1,
-which should be a number larger than 0.  You can have multiple pairs of index and executor partitions,
-but the pair should run on the same physical machine with the same install directory.
+Since you will need to manage ports yourself, you must assign distinct port pairs to each service.

-The query service can use any partition number.
+* An index service and an executor service should exist on the same partition, e.g. index:1 and executor:1. The partition
+number is the last digit of the service name, and should be positive.  You can have multiple pairs of index
+and executor partitions, but each pair should run on the same physical machine with the same install directory.
+
+* The query service can use any partition number.
+
+* The control service should be on partition 1.

-The control service should be on partition 1.
 EOF

 echo
--- a/run/readme.md
+++ b/run/readme.md
@ -3,11 +3,11 @@
 This directory is a staging area for running the system.  It contains scripts
 and templates for installing the system on a server, and for running it locally.

-See [https://docs.marginalia.nu/](https://docs.marginalia.nu/) for additional
-documentation.
-
 ## Requirements

+**x86-64 Linux** - The system is only tested on x86-64 Linux.  It may work on other
+platforms, but for lack of suitable hardware, this cannot be guaranteed.
+
 **Docker** - It is a bit of a pain to install, but if you follow
 [this guide](https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository) you're on the right track for ubuntu-like systems.

@ -15,7 +15,12 @@ documentation.
 The civilized way of installing this is to use [SDKMAN](https://sdkman.io/);
 graalce is a good distribution choice but it doesn't matter too much.

-## Set up
+## Quick Set up
+
+[https://docs.marginalia.nu/](https://docs.marginalia.nu/) has a more comprehensive guide for the install
+and operation of the search engine.  This is a quick guide for the impatient.
+
+---

 To go from a clean check out of the git repo to a running search engine,
 follow these steps. 
@ -51,6 +56,8 @@ you for which installation mode you want to use.  The options are:
 2. Full Marginalia Search instance - This will install an instance of the search engine
   configured like [search.marginalia.nu](https://search.marginalia.nu).  This is useful
   for local development and testing.
+3. Non-Docker installation - This will install the system outside of Docker.
+   This is still an experimental run mode.

 It will also prompt you for account details for a new mariadb instance, which will be
 created for you.  The database will be initialized with the schema and data required
--- a/settings.gradle
+++ b/settings.gradle
@ -208,8 +208,8 @@ dependencyResolutionManagement {
            library('sqlite','org.xerial','sqlite-jdbc').version('3.41.2.2')
            library('javax.annotation','javax.annotation','javax.annotation-api').version('1.3.2')

-            library('parquet-column', 'org.apache.parquet','parquet-column').version('1.13.1')
-            library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.13.1')
+            library('parquet-column', 'org.apache.parquet','parquet-column').version('1.14.0')
+            library('parquet-hadoop', 'org.apache.parquet','parquet-hadoop').version('1.14.0')

            library('curator-framework', 'org.apache.curator','curator-framework').version('5.6.0')
            library('curator-x-discovery', 'org.apache.curator','curator-x-discovery').version('5.6.0')
--- a/third-party/parquet-floor/build.gradle
+++ b/third-party/parquet-floor/build.gradle
@ -9,7 +9,7 @@ java {
 }

 dependencies {
-    implementation ('org.apache.parquet:parquet-column:1.13.1') {
+    implementation ('org.apache.parquet:parquet-column:1.14.0') {
        transitive = true
    }
    implementation('org.apache.parquet:parquet-hadoop:1.13.1') {
--- a/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java
+++ b/third-party/parquet-floor/src/main/java/org/apache/hadoop/conf/Configuration.java
@ -1,6 +1,7 @@
 package org.apache.hadoop.conf;

 public class Configuration {
+    public Configuration(boolean x) {}

    public boolean getBoolean(String x, boolean y) {
        return y;