/* priv/launcher/start.c */

/*
 * start.c — shell-free launcher for a `mix release` on distroless (no /bin/sh).
 *
 * Part of the `shellless_release` library. The consuming app's mix release step
 * compiles this with -DRELEASE_NAME / -DDIST_PORT and a generated
 * shellless_tasks.h (the verb whitelist), then installs the binary over the
 * release's bin/<name> entry points.
 *
 * Replaces the chain  bin/<name> -> bin/<release> -> releases/<v>/elixir
 * -> erts-<e>/bin/erl  (all #!/bin/sh scripts) with a single native binary that
 * does what they collectively do for `start`, then execve()s the BEAM. Unless
 * it is built with -DEPMDLESS (no epmd at all), it also pre-starts epmd so
 * that erlexec never reaches its
 *   system("<bindir>/epmd -daemon")
 * call (system(3) needs /bin/sh, which distroless :nonroot doesn't have).
 *
 * Commands are a fixed WHITELIST — `server` (default) plus the app's audited
 * task verbs (TASKS[], from the generated header). The generic `eval EXPR` /
 * `rpc` / `remote` RCE surface of the stock release CLI is deliberately NOT
 * implemented: no command accepts a caller-supplied Elixir expression, so
 * controlling the container's args cannot run arbitrary code.
 *
 * Multi-call binary (busybox-style): the same binary is installed at every
 * bin/<name> entry point and dispatches on basename(argv[0]). This preserves
 * the original mix-release interface exactly, so existing deploy manifests keep
 * invoking the same /app/bin/<name> paths unchanged. It also accepts the
 * subcommand form (/app/<launcher> <verb>) when invoked under any other name.
 *
 * Distribution is hardened: a pinned dist port (DIST_PORT, no ephemeral
 * fallback), mutual-TLS distribution whenever the binary is built with
 * -DREQUIRE_TLS (it then refuses to boot without the mounted cert bundle),
 * and a fail-closed cookie strength check.
 *
 * Version-independent: the ERTS and release versions are read from
 * releases/start_erl.data, and the release root is resolved from argv[0], so an
 * OTP/app/version bump needs no rebuild of this binary.
 *
 * Dynamic glibc link is fine: distroless/cc already ships it, same as beam.smp.
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>

/* Release root inside the image. Matches WORKDIR /app + COPY target.
 * resolve_root() uses this only as the fallback when the root cannot be
 * derived from argv[0]. */
#ifndef RELEASE_ROOT
#define RELEASE_ROOT "/app"
#endif

/* Set at build time via -DRELEASE_NAME=... by the mix release step (from the
 * Mix.Release struct, so it can't drift from the release name). The default is
 * only a placeholder for standalone syntax checks.
 *
 * Must expand to a string literal: it is pasted next to other literals to
 * build the node name ("<name>@<host>"), exported as the RELEASE_NAME
 * environment variable, and compared against the cookie to reject the
 * predictable "cookie == release name" case. */
#ifndef RELEASE_NAME
#define RELEASE_NAME "release"
#endif

/*
 * Pinned Erlang distribution port. Build-time constant (override per build with
 * -DDIST_PORT=<n>) because it is a deploy-wide fact, identical across every pod
 * — not a per-instance runtime setting. Pinning a single port (vs the BEAM's
 * default ephemeral port) is what makes the dist channel firewallable with a
 * NetworkPolicy. The launcher always emits min==max so there is NO ephemeral
 * fallback path.
 *
 * 24369: below the Linux ephemeral range (32768+, avoids kernel port-reuse
 * races), unregistered in IANA/known exporters (steers clear of 9090/9100/9200
 * Prometheus & friends), and self-documenting — "4369-flavoured" reads as
 * Erlang distribution to anyone debugging (epmd is 4369).
 *
 * Must expand to a plain decimal integer literal: STR(DIST_PORT) stringifies
 * the expansion verbatim, so an expression such as (24000+369) would be passed
 * to the VM as that literal text rather than as a number.
 */
#ifndef DIST_PORT
#define DIST_PORT 24369
#endif

/* stringify helpers so DIST_PORT (an int macro) can be passed as an argv string.
 * Two levels are required: STR(x) first macro-expands its argument, then STR2
 * applies the # operator. A single-level #x would yield the text "DIST_PORT"
 * instead of the port digits. */
#define STR2(x) #x
#define STR(x) STR2(x)

/* Fatal-error exit: prints "start: <what>: <strerror(errno)>" to stderr and
 * terminates the process with status 1. Never returns. The caller must invoke
 * it while errno still holds the failure being reported. */
static void die(const char *what) {
    const char *reason = strerror(errno);

    fprintf(stderr, "start: %s: %s\n", what, reason);
    exit(1);
}

/* Fallback for platforms whose <limits.h> does not expose PATH_MAX. */
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif

/*
 * Resolve the release root from the launcher's own location and store it in
 * `root` (capacity `root_n`, always NUL-terminated).
 *
 *   argv0   the process's argv[0]; may be NULL.
 *   root    output buffer for the release root.
 *   root_n  size of `root` in bytes.
 *
 * Resolution:
 *   1. realpath(argv0) gives <dir>/<name>. If <dir> is named "bin" (the normal
 *      <root>/bin/<name> layout) the root is the parent of <dir>.
 *   2. If <dir> is NOT named "bin" but contains releases/start_erl.data, <dir>
 *      itself is the root. This covers the subcommand form where the launcher
 *      sits directly in the release root (<root>/<launcher> <verb>); blindly
 *      stripping two components there would yield the wrong root.
 *   3. Any other resolvable location keeps the historical "two directories up"
 *      result.
 *   4. If argv0 is NULL, cannot be resolved (e.g. a bare name with no path
 *      info), or resolves to a file directly under "/", the compile-time
 *      RELEASE_ROOT default is used.
 *
 * Exits with status 1 if the chosen root does not fit in `root`; a silently
 * truncated root would make every derived path point at the wrong place.
 */
static void resolve_root(const char *argv0, char *root, size_t root_n) {
    static const char marker[] = "/releases/start_erl.data";
    /* realpath(3) may write up to PATH_MAX bytes into a caller-supplied
     * buffer; anything smaller is an overflow hazard and makes glibc's
     * fortified realpath abort. */
    char buf[PATH_MAX];
    const char *chosen = RELEASE_ROOT;

    if (argv0 && realpath(argv0, buf)) {
        /* strip /<name>: buf becomes the directory holding the binary */
        char *slash = strrchr(buf, '/');
        if (slash) *slash = '\0';

        slash = strrchr(buf, '/');
        if (slash) {
            int dir_is_root = 0;
            if (strcmp(slash + 1, "bin") != 0) {
                /* Not the <root>/bin layout: check whether the directory
                 * holding the binary is itself a release root. */
                char probe[PATH_MAX + sizeof marker];
                snprintf(probe, sizeof probe, "%s%s", buf, marker);
                dir_is_root = (access(probe, R_OK) == 0);
            }
            if (!dir_is_root) *slash = '\0';   /* strip /bin (or the parent) */
            chosen = buf;
        }
    }

    int len = snprintf(root, root_n, "%s", chosen);
    if (len < 0 || (size_t)len >= root_n) {
        fprintf(stderr, "start: release root path too long: %s\n", chosen);
        exit(1);
    }
}

/*
 * Load the two whitespace-separated tokens "<erts_vsn> <release_vsn>" from
 * <root>/releases/start_erl.data.
 *
 *   root           release root directory.
 *   erts, erts_n   output buffer (and its size) for the ERTS version.
 *   vsn,  vsn_n    output buffer (and its size) for the release version.
 *
 * Each token is read with a 127-character limit. Exits with status 1 when the
 * file cannot be opened (via die()) or does not contain two tokens.
 */
static void read_versions(const char *root,
                          char *erts, size_t erts_n, char *vsn, size_t vsn_n) {
    char data_path[2048];
    char erts_field[128] = {0};
    char vsn_field[128] = {0};

    snprintf(data_path, sizeof data_path, "%s/releases/start_erl.data", root);

    FILE *data = fopen(data_path, "r");
    if (data == NULL) {
        die("open start_erl.data");
    }

    if (fscanf(data, "%127s %127s", erts_field, vsn_field) != 2) {
        fprintf(stderr, "start: could not parse %s\n", data_path);
        exit(1);
    }
    fclose(data);

    snprintf(erts, erts_n, "%s", erts_field);
    snprintf(vsn, vsn_n, "%s", vsn_field);
}

/* Unconditionally export k=v, replacing any existing value. A setenv()
 * failure is fatal: die() reports it and exits. */
static void set(const char *k, const char *v) {
    const int rc = setenv(k, v, 1);

    if (rc != 0) {
        die("setenv");
    }
}

/* Export k=v only when k is absent from the environment, so values supplied
 * by the pod (K8s overrides) win. A variable that is present but empty counts
 * as provided and is left untouched. A setenv() failure is fatal. */
static void set_default(const char *k, const char *v) {
    if (getenv(k) != NULL) {
        return;
    }
    if (setenv(k, v, 1) != 0) {
        die("setenv");
    }
}

#ifndef EPMDLESS
/*
 * Pre-start epmd as a detached daemon via a native fork+exec (NO shell), so
 * that erlexec — started later with "-start_epmd false" — never reaches its
 * system() call.
 *
 *   bindir  the bundled ERTS bin directory; epmd is expected at <bindir>/epmd.
 *
 * `epmd -daemon` daemonises itself and the foreground process exits; we wait
 * for that foreground child so it does not linger as a zombie.
 * NOTE(review): whether the listener is already bound at the moment the
 * foreground process exits depends on epmd's daemonisation order — confirm
 * against the epmd in the bundled ERTS.
 *
 * Failure policy:
 *   - <bindir>/epmd missing: warn and return (erlexec is left to deal with it).
 *   - fork()/waitpid() failure, or a path that does not fit the buffer: fatal.
 *   - epmd exiting non-zero or being killed by a signal: warn and continue
 *     (a non-zero exit may just mean an epmd is already running, e.g. after a
 *     container restart or in a shared netns).
 *
 * Compiled out entirely in EPMD-less mode (there is no epmd).
 */
static void start_epmd(const char *bindir) {
    /* Sized to hold the caller's bindir plus "/epmd"; a smaller buffer would
     * truncate long paths and misreport them as "not found". */
    char epmd[1200];
    int len = snprintf(epmd, sizeof epmd, "%s/epmd", bindir);
    if (len < 0 || (size_t)len >= sizeof epmd) {
        fprintf(stderr, "start: epmd path under %s is too long\n", bindir);
        exit(1);
    }

    struct stat st;
    if (stat(epmd, &st) != 0) {
        /* No epmd in the bundled ERTS (unusual) — let erlexec deal with it. */
        fprintf(stderr, "start: warning: %s not found, skipping pre-start\n", epmd);
        return;
    }

    pid_t pid = fork();
    if (pid < 0) die("fork epmd");

    if (pid == 0) {
        /* child: become `epmd -daemon`. Only -daemon is passed; no
         * -relaxed_command_check, so epmd keeps its default command checks. */
        char *argv[] = { epmd, "-daemon", NULL };
        execv(epmd, argv);
        /* only reached on failure */
        fprintf(stderr, "start: execv epmd: %s\n", strerror(errno));
        _exit(127);
    }

    /* parent: reap the daemoniser (it forks the real daemon then exits).
     * Retry if the wait is interrupted by a signal rather than aborting. */
    int status = 0;
    pid_t reaped;
    do {
        reaped = waitpid(pid, &status, 0);
    } while (reaped < 0 && errno == EINTR);
    if (reaped < 0) die("waitpid epmd");

    if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
        fprintf(stderr, "start: epmd -daemon exited %d (continuing; "
                        "may already be running)\n", WEXITSTATUS(status));
    } else if (WIFSIGNALED(status)) {
        fprintf(stderr, "start: epmd -daemon killed by signal %d (continuing)\n",
                WTERMSIG(status));
    }
}
#endif /* !EPMDLESS */

/*
 * Every path the launcher derives at boot, computed once in main() and shared
 * read-only by the server and task code paths. The derived buffers are larger
 * than `root` to leave room for the fixed suffixes and version strings that
 * get appended.
 */
struct paths {
    /* release root, resolved from argv[0] */
    char root[1024];
    /* <root>/erts-<erts_vsn>/bin */
    char bindir[1100];
    /* <bindir>/erlexec — the binary that is finally exec'd */
    char erlexec[1200];
    /* <root>/releases/<vsn>/start — boot script for the server */
    char start_boot[1200];
    /* <root>/releases/<vsn>/start_clean — boot script for task verbs */
    char clean_boot[1200];
    /* <root>/releases/<vsn>/sys — passed as -config */
    char cfg[1200];
    /* <root>/releases/<vsn>/vm.args — passed as -args_file */
    char args[1200];
    /* <root>/lib — the RELEASE_LIB boot variable */
    char lib[1100];
    /* <root>/tmp — default for RELEASE_TMP */
    char tmp[1100];
    /* release version read from releases/start_erl.data */
    char vsn[128];
};

/*
 * Load the distribution cookie into `cookie` (capacity `n`).
 *
 * Sources, in order:
 *   1. the RELEASE_COOKIE environment variable, if set and non-empty;
 *   2. the first line of <root>/releases/COOKIE, with the trailing CR/LF
 *      removed.
 *
 * A value longer than n-1 bytes is truncated to fit.
 *
 * Returns 1 if a non-empty cookie was stored, 0 otherwise (including n == 0,
 * a missing or unreadable file, or an empty first line). On a 0 return with
 * n > 0 the buffer holds the empty string, so the result does not depend on
 * the caller having pre-zeroed it.
 *
 * Only the distributed `start` path needs a cookie; task verbs run
 * non-distributed and never call this.
 */
static int read_cookie(const struct paths *p, char *cookie, size_t n) {
    if (n == 0) return 0;
    cookie[0] = '\0';

    const char *env_cookie = getenv("RELEASE_COOKIE");
    if (env_cookie && *env_cookie) {
        snprintf(cookie, n, "%s", env_cookie);
        return 1;
    }

    char cpath[1200];
    snprintf(cpath, sizeof cpath, "%s/releases/COOKIE", p->root);
    FILE *cf = fopen(cpath, "r");
    if (!cf) return 0;

    /* n is a small fixed buffer (caller passes sizeof cookie); cap to INT_MAX
     * defensively so the size_t->int narrowing fgets() requires is explicit. */
    int cap = n > (size_t)INT_MAX ? INT_MAX : (int)n;
    if (fgets(cookie, cap, cf)) {
        cookie[strcspn(cookie, "\r\n")] = '\0';
    } else {
        /* EOF or read error: the buffer contents are not reliable. */
        cookie[0] = '\0';
    }
    fclose(cf);
    return cookie[0] != '\0';
}

/*
 * Export the environment shared by every subcommand: the variables the erts
 * `erl` script would export for erlexec (EMU, ROOTDIR, BINDIR, PROGNAME) and
 * the RELEASE_* variables read by the config provider.
 *
 * RELEASE_MODE and RELEASE_TMP are defaults only — a value already present in
 * the environment is kept. Everything else is overwritten. Distribution
 * settings and PHX_SERVER are per-subcommand and set by the callers.
 */
static void set_common_env(const struct paths *p) {
    const struct { const char *key; const char *value; } forced[] = {
        { "EMU",                "beam"       },
        { "ROOTDIR",            p->root      },
        { "BINDIR",             p->bindir    },
        { "PROGNAME",           "erl"        },
        { "RELEASE_ROOT",       p->root      },
        { "RELEASE_NAME",       RELEASE_NAME },
        { "RELEASE_VSN",        p->vsn       },
        { "RELEASE_SYS_CONFIG", p->cfg       },
    };

    for (size_t i = 0; i < sizeof forced / sizeof forced[0]; i++) {
        set(forced[i].key, forced[i].value);
    }

    set_default("RELEASE_MODE", "embedded");
    set_default("RELEASE_TMP", p->tmp);
}

/*
 * Validate the distribution cookie. The cookie is the shared secret guarding
 * distributed Erlang — a remote-code-execution channel. We fail CLOSED on a
 * missing or obviously-weak cookie rather than silently booting an insecure
 * node. (TLS distribution, when enabled, is the primary defence; this is the
 * belt to those suspenders.)
 *
 * "Weak" heuristics: empty, the literal release name, or shorter than 32 chars.
 * A real cookie should be a high-entropy 32+ char secret from a K8s Secret.
 *
 * Environment knobs:
 *   DIST_COOKIE_MIN_LEN     overrides the minimum length. Must be a positive
 *                           decimal integer with no trailing characters;
 *                           anything else is ignored with a warning and the
 *                           default of 32 stays in force.
 *   DIST_ALLOW_WEAK_COOKIE  bypasses every check (intended only for local/CI,
 *                           never production). Because this disables a
 *                           fail-closed check, an unset, empty, "0" or "false"
 *                           value does NOT bypass; any other value does.
 *
 * Returns 1 if the cookie is acceptable (or the bypass is active), 0 after
 * printing the reason to stderr otherwise.
 */
static int cookie_is_acceptable(const char *cookie) {
    const char *bypass = getenv("DIST_ALLOW_WEAK_COOKIE");
    if (bypass && *bypass
        && strcmp(bypass, "0") != 0 && strcmp(bypass, "false") != 0) {
        fprintf(stderr, "start: warning: DIST_ALLOW_WEAK_COOKIE is set; "
                        "cookie strength check bypassed\n");
        return 1;
    }

    size_t len = strlen(cookie);
    size_t min = 32;
    const char *env_min = getenv("DIST_COOKIE_MIN_LEN");
    if (env_min && *env_min) {
        char *end = NULL;
        errno = 0;
        long v = strtol(env_min, &end, 10);
        if (end != env_min && *end == '\0' && errno != ERANGE && v > 0) {
            min = (size_t)v;
        } else {
            fprintf(stderr, "start: warning: ignoring invalid "
                            "DIST_COOKIE_MIN_LEN '%s'; using %zu\n",
                    env_min, min);
        }
    }

    if (len == 0) {
        fprintf(stderr, "start: refusing to boot: distribution cookie is empty\n");
        return 0;
    }
    if (len < min) {
        fprintf(stderr, "start: refusing to boot: distribution cookie too short "
                        "(%zu < %zu). Use a high-entropy RELEASE_COOKIE from a "
                        "Secret, or set DIST_ALLOW_WEAK_COOKIE=1 for local/CI.\n",
                len, min);
        return 0;
    }
    if (strcmp(cookie, RELEASE_NAME) == 0) {
        fprintf(stderr, "start: refusing to boot: cookie equals the release "
                        "name (predictable)\n");
        return 0;
    }
    return 1;
}

/* `start`: the long-running, distributed Phoenix server. Mirrors
 * rel/overlays/bin/server + rel/env.sh.eex + the frozen `start` argv. */
static int run_start(const struct paths *p) {
    set("RELEASE_BOOT_SCRIPT", "start");

    /* Distribution (rel/env.sh.eex). POD_IP from the K8s downward API. */
    set("RELEASE_DISTRIBUTION", "name");
    const char *pod_ip = getenv("POD_IP");
    char node[256];
    if (pod_ip && *pod_ip) {
        snprintf(node, sizeof node, RELEASE_NAME "@%s", pod_ip);
    } else {
        fprintf(stderr, "start: warning: POD_IP unset, using 127.0.0.1\n");
        snprintf(node, sizeof node, RELEASE_NAME "@127.0.0.1");
    }
    set("RELEASE_NODE", node);

    /* Export the pinned dist port so the EpmdLess module (and anything else)
     * resolves to the SAME port the VM listens on. */
    set_default("DIST_PORT", STR(DIST_PORT));

    /* Phoenix: start the HTTP endpoint (runtime.exs gates on PHX_SERVER). */
    set("PHX_SERVER", "true");

    char cookie[256] = {0};
    if (!read_cookie(p, cookie, sizeof cookie)) {
        fprintf(stderr, "start: no RELEASE_COOKIE and no releases/COOKIE\n");
        return 1;
    }
    if (!cookie_is_acceptable(cookie)) {
        return 1;
    }

    /*
     * TLS distribution. When REQUIRE_TLS is set (the default; the mix step
     * passes -DREQUIRE_TLS=1 unless require_tls: false), the launcher ALWAYS
     * runs mutual-TLS distribution and refuses to boot if the cert bundle is
     * missing — there is no runtime switch to forget. This makes a stolen
     * cookie insufficient to join the cluster: a peer must also present a cert
     * signed by our dist CA. Certs are mounted read-only at <root>/dist-certs
     * (a K8s Secret); the SSL options live in <root>/dist/inet_tls.config.
     *
     * With require_tls: false (non-clustered apps), distribution is cookie-only.
     */
    char optfile[1300];
    snprintf(optfile, sizeof optfile, "%s/dist/inet_tls.config", p->root);

#ifdef REQUIRE_TLS
    if (access(optfile, R_OK) != 0) {
        fprintf(stderr, "start: refusing to boot: TLS dist options file %s is "
                        "unreadable: %s\n", optfile, strerror(errno));
        return 1;
    }
    static const char *const cert_files[] = {
        "dist-certs/dist-cert.pem",
        "dist-certs/dist-key.pem",
        "dist-certs/dist-ca.pem",
    };
    for (size_t i = 0; i < sizeof cert_files / sizeof cert_files[0]; i++) {
        char path[1300];
        snprintf(path, sizeof path, "%s/%s", p->root, cert_files[i]);
        if (access(path, R_OK) != 0) {
            fprintf(stderr, "start: refusing to boot: distribution cert %s is "
                            "unreadable: %s\n", path, strerror(errno));
            return 1;
        }
    }
#endif

#ifndef EPMDLESS
    /* Pre-start epmd natively so erlexec never shells out for it.
     * (Not needed in EPMD-less mode — there is no epmd at all.) */
    start_epmd(p->bindir);
#endif

    /*
     * Build the `start` argv:
     *   - frozen launcher argv (mode/boot/config/args)
     *   - cookie + -name (distribution identity)
     *   - "-start_epmd false" (epmd already up; erlexec must NOT system())
     *   - "-kernel inet_dist_listen_min/max DIST_PORT" (pin the dist port so it
     *     is firewallable; min==max => exactly one port, no ephemeral fallback)
     *   - "-proto_dist inet_tls -ssl_dist_optfile <optfile>" (mTLS; REQUIRE_TLS)
     */
    char *argv[40];
    int n = 0;
    argv[n++] = (char *)p->erlexec;
    argv[n++] = "-noshell";
    argv[n++] = "-s"; argv[n++] = "elixir"; argv[n++] = "start_cli";
    argv[n++] = "-mode"; argv[n++] = "embedded";
    argv[n++] = "-setcookie"; argv[n++] = cookie;
    argv[n++] = "-name"; argv[n++] = node;
    argv[n++] = "-start_epmd"; argv[n++] = "false";
#ifdef EPMDLESS
    /* EPMD-less: resolve every node's dist port to a constant via our module,
     * so no epmd (and no port 4369) is needed at all. The module reads the same
     * DIST_PORT we export below, so listener and resolver always agree. */
    argv[n++] = "-epmd_module"; argv[n++] = "Elixir.ShelllessRelease.EpmdLess";
#endif
    argv[n++] = "-kernel";
    argv[n++] = "inet_dist_listen_min"; argv[n++] = STR(DIST_PORT);
    argv[n++] = "inet_dist_listen_max"; argv[n++] = STR(DIST_PORT);
#ifdef REQUIRE_TLS
    argv[n++] = "-proto_dist"; argv[n++] = "inet_tls";
    argv[n++] = "-ssl_dist_optfile"; argv[n++] = optfile;
#endif
    argv[n++] = "-config"; argv[n++] = (char *)p->cfg;
    argv[n++] = "-boot"; argv[n++] = (char *)p->start_boot;
    argv[n++] = "-boot_var"; argv[n++] = "RELEASE_LIB"; argv[n++] = (char *)p->lib;
    argv[n++] = "-args_file"; argv[n++] = (char *)p->args;
    argv[n++] = "-extra"; argv[n++] = "--no-halt";
    argv[n] = NULL;

    execv(p->erlexec, argv);
    die("execv erlexec (start)");
    return 127;
}

/*
 * run_task: evaluate ONE compiled-in expression on a fresh, non-distributed
 * node booted from start_clean, then let it halt. This is the locked-down
 * stand-in for the stock release's generic `eval EXPR`.
 *
 *   p     resolved release paths.
 *   expr  the Elixir expression to evaluate; handed to the VM verbatim after
 *         `--eval`.
 *
 * SECURITY: main() only ever passes an `expr` taken from the compile-time
 * TASKS table — never from argv — so no caller-controlled string reaches the
 * BEAM through this path. (The stock `eval`/`rpc`/`remote` surface is simply
 * not implemented.)
 *
 * Non-distributed => no epmd and no /bin/sh involved: there is no
 * -mode/-name/-setcookie, and the argv ends with `-extra --eval EXPR --`.
 *
 * Does not return on success (execv replaces the process); if execv fails,
 * die() exits with status 1.
 */
static int run_task(const struct paths *p, const char *expr) {
    char *argv[20];
    int n = 0;

    set("RELEASE_BOOT_SCRIPT", "start_clean");
    /* Keep the node non-distributed and make sure runtime.exs does not start
     * the HTTP server for a one-shot task. */
    set("RELEASE_DISTRIBUTION", "none");
    unsetenv("PHX_SERVER");

    argv[n++] = (char *)p->erlexec;
    argv[n++] = "-noshell";
    argv[n++] = "-s";
    argv[n++] = "elixir";
    argv[n++] = "start_cli";
    argv[n++] = "-config";
    argv[n++] = (char *)p->cfg;
    argv[n++] = "-boot";
    argv[n++] = (char *)p->clean_boot;
    argv[n++] = "-boot_var";
    argv[n++] = "RELEASE_LIB";
    argv[n++] = (char *)p->lib;
    argv[n++] = "-args_file";
    argv[n++] = (char *)p->args;
    argv[n++] = "-extra";
    argv[n++] = "--eval";
    argv[n++] = (char *)expr;
    argv[n++] = "--";
    argv[n] = NULL;

    execv(p->erlexec, argv);
    die("execv erlexec (task)");
    return 127;
}

/*
 * The whitelist. Each verb maps to exactly one compiled-in Elixir expression.
 * To add an operation you change config and rebuild — it cannot be injected at
 * runtime.
 *
 * The table is GENERATED from the consuming app's config by the mix release
 * step (ShelllessRelease) into shellless_tasks.h, then #included here. This
 * keeps the whitelist a compile-time constant (the security property) while
 * letting each app declare its own verbs without forking this source.
 *
 * The generated header must define TASKS_INIT as a comma-separated list of
 * {verb, expr} initialisers (a trailing comma is optional), e.g.:
 *     #define TASKS_INIT \
 *       { "migrate", "MyApp.Release.migrate()" }, \
 *       { "seed",    "MyApp.Release.seed()" },
 * If no header is provided (e.g. compiling the source standalone for a syntax
 * check), TASKS_INIT defaults to empty — server-only, no task verbs.
 *
 * Tasks that need caller-supplied args (e.g. rollback(repo, version)) are
 * intentionally unsupported: a zero-arg fixed verb cannot take an argument, so
 * there is no path to smuggle an expression in.
 */
#if defined(__has_include)
#  if __has_include("shellless_tasks.h")
#    include "shellless_tasks.h"
#  endif
#endif

#ifndef TASKS_INIT
#define TASKS_INIT
#endif

struct task { const char *verb; const char *expr; };

/*
 * Backing storage. Slot 0 is an unused placeholder so the initialiser list is
 * never empty: with TASKS_INIT empty, `{ }` and the resulting zero-length
 * array are compiler extensions, not ISO C11. Code must go through TASKS and
 * N_TASKS, which skip the placeholder.
 */
static const struct task TASK_TABLE[] = { { NULL, NULL }, TASKS_INIT };

/* TASKS[0 .. N_TASKS-1] are the real entries; N_TASKS is 0 when no verbs are
 * configured. */
static const struct task *const TASKS = TASK_TABLE + 1;
static const int N_TASKS = (int)(sizeof TASK_TABLE / sizeof TASK_TABLE[0]) - 1;

/* Whether `name` is an entry-point name this binary dispatches on: one of the
 * built-in server aliases ("server", "start") or a verb from the TASKS
 * whitelist. main() uses it to recognise basename(argv[0]) when the binary is
 * invoked as <root>/bin/<name>. Returns 1 for a match, 0 otherwise. */
static int is_known_command(const char *name) {
    static const char *const server_aliases[] = { "server", "start" };
    const size_t n_aliases = sizeof server_aliases / sizeof server_aliases[0];

    for (size_t a = 0; a < n_aliases; a++) {
        if (strcmp(name, server_aliases[a]) == 0) {
            return 1;
        }
    }

    int idx = 0;
    while (idx < N_TASKS) {
        if (strcmp(name, TASKS[idx].verb) == 0) {
            return 1;
        }
        idx++;
    }
    return 0;
}

/* Exit with status 1 when a snprintf() result shows that the formatted path
 * did not fit its destination; `what` names the path in the error message.
 * A silently truncated path would point erlexec at the wrong file. */
static void require_fit(int written, size_t cap, const char *what) {
    if (written < 0 || (size_t)written >= cap) {
        fprintf(stderr, "start: path too long: %s\n", what);
        exit(1);
    }
}

/*
 * Entry point: resolve the release layout, export the common environment,
 * pick the command, and hand over to run_start() or run_task().
 *
 * Exit status: 1 for an unknown command, unexpected arguments, an over-long
 * path, or any refusal reported by the callee. On success the process image
 * is replaced by erlexec and this function never returns.
 */
int main(int argc, char **argv) {
    char erts[128];
    struct paths p;

    /* Resolve the release root from our own location (<root>/bin/<name>), so
     * the launcher works wherever the release is unpacked, not just /app. */
    resolve_root(argv[0], p.root, sizeof p.root);
    read_versions(p.root, erts, sizeof erts, p.vsn, sizeof p.vsn);

    require_fit(snprintf(p.bindir, sizeof p.bindir, "%s/erts-%s/bin", p.root, erts),
                sizeof p.bindir, "erts bin directory");
    require_fit(snprintf(p.erlexec, sizeof p.erlexec, "%s/erlexec", p.bindir),
                sizeof p.erlexec, "erlexec");
    require_fit(snprintf(p.start_boot, sizeof p.start_boot, "%s/releases/%s/start", p.root, p.vsn),
                sizeof p.start_boot, "start boot script");
    require_fit(snprintf(p.clean_boot, sizeof p.clean_boot, "%s/releases/%s/start_clean", p.root, p.vsn),
                sizeof p.clean_boot, "start_clean boot script");
    require_fit(snprintf(p.cfg, sizeof p.cfg, "%s/releases/%s/sys", p.root, p.vsn),
                sizeof p.cfg, "sys config");
    require_fit(snprintf(p.args, sizeof p.args, "%s/releases/%s/vm.args", p.root, p.vsn),
                sizeof p.args, "vm.args");
    require_fit(snprintf(p.lib, sizeof p.lib, "%s/lib", p.root),
                sizeof p.lib, "lib directory");
    require_fit(snprintf(p.tmp, sizeof p.tmp, "%s/tmp", p.root),
                sizeof p.tmp, "tmp directory");

    set_common_env(&p);

    /*
     * Command resolution preserves the ORIGINAL interface. This is a multi-call
     * binary (busybox-style): the same binary is installed at bin/server and
     * at bin/<verb> for every task verb in the whitelist (e.g. as hardlinks to
     * one inode), so the deploy repo keeps invoking the exact same paths it
     * always has — no manifest changes needed.
     *
     * Resolution order:
     *   1. basename(argv[0]) — when it is a whitelisted name, i.e. invoked as
     *      <root>/bin/<name> (the real, unchanged interface). "start"/"server"
     *      both mean the server.
     *   2. else argv[1] — the subcommand form <launcher> <verb>, handy for a
     *      single CMD and for local use. This form is only reachable when the
     *      launcher's own file name is NOT a whitelisted name; a binary named
     *      "start" or "server" is always treated as the server entry point.
     *      With no argv[1] the command defaults to "server".
     *
     * Either way the command must be in the whitelist (server | TASKS) and
     * takes ZERO arguments; no caller-supplied expression is ever accepted.
     */
    const char *base = argv[0] ? argv[0] : "start";
    const char *slash = strrchr(base, '/');
    if (slash) base = slash + 1;

    const char *cmd;
    int extra_args;
    if (is_known_command(base)) {
        /* invoked as <root>/bin/<name> */
        cmd = base;
        extra_args = (argc > 1);
    } else {
        /* invoked as a launcher with a subcommand (default: server) */
        cmd = (argc > 1) ? argv[1] : "server";
        extra_args = (argc > 2);
    }

    if (strcmp(cmd, "server") == 0 || strcmp(cmd, "start") == 0) {
        if (extra_args) { fprintf(stderr, "start: '%s' takes no arguments\n", cmd); return 1; }
        return run_start(&p);
    }
    for (int i = 0; i < N_TASKS; i++) {
        if (strcmp(cmd, TASKS[i].verb) == 0) {
            if (extra_args) { fprintf(stderr, "start: '%s' takes no arguments\n", cmd); return 1; }
            return run_task(&p, TASKS[i].expr);
        }
    }

    /* Unknown command: reject and print the whitelist. */
    fprintf(stderr, "start: unknown command '%s'\n", cmd);
    fprintf(stderr, "allowed: server (default)");
    for (int i = 0; i < N_TASKS; i++) fprintf(stderr, " | %s", TASKS[i].verb);
    fprintf(stderr, "\n");
    return 1;
}