/*
* start.c — shell-free launcher for a `mix release` on distroless (no /bin/sh).
*
* Part of the `shellless_release` library. The consuming app's mix release step
* compiles this with -DRELEASE_NAME / -DDIST_PORT and a generated
* shellless_tasks.h (the verb whitelist), then installs the binary over the
* release's bin/<name> entry points.
*
* Replaces the chain bin/<name> -> bin/<release> -> releases/<v>/elixir
* -> erts-<e>/bin/erl (all #!/bin/sh scripts) with a single native binary that
* does what they collectively do for `start`, then execve()s the BEAM. It also
* pre-starts epmd so that erlexec never reaches its
* system("<bindir>/epmd -daemon")
* call (system(3) needs /bin/sh, which distroless :nonroot doesn't have).
*
* Commands are a fixed WHITELIST — `server` (default) plus the app's audited
* task verbs (TASKS[], from the generated header). The generic `eval EXPR` /
* `rpc` / `remote` RCE surface of the stock release CLI is deliberately NOT
* implemented: no command accepts a caller-supplied Elixir expression, so
* controlling the container's args cannot run arbitrary code.
*
* Multi-call binary (busybox-style): the same binary is installed at every
* bin/<name> entry point and dispatches on basename(argv[0]). This preserves
* the original mix-release interface exactly, so existing deploy manifests keep
* invoking the same /app/bin/<name> paths unchanged. It also accepts the
* subcommand form (/app/<launcher> <verb>) when invoked under any other name.
*
* Distribution is hardened: a pinned dist port (DIST_PORT, no ephemeral
* fallback), mandatory mutual-TLS distribution (refuses to boot without the
* mounted cert bundle), and a fail-closed cookie strength check.
*
* Version-independent: the ERTS and release versions are read from
* releases/start_erl.data, and the release root is resolved from argv[0], so an
* OTP/app/version bump needs no rebuild of this binary.
*
* Dynamic glibc link is fine: distroless/cc already ships it, same as beam.smp.
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
/* Release root inside the image. Matches WORKDIR /app + COPY target.
 * Overridable at build time (-DRELEASE_ROOT=...). resolve_root() uses it only
 * as the fallback when the root cannot be derived from argv[0]. */
#ifndef RELEASE_ROOT
#define RELEASE_ROOT "/app"
#endif
/* Set at build time via -DRELEASE_NAME=... by the mix release step (from the
 * Mix.Release struct, so it can't drift from the release name). The default is
 * only a placeholder for standalone syntax checks. */
#ifndef RELEASE_NAME
#define RELEASE_NAME "release"
#endif
/*
 * Pinned Erlang distribution port. Build-time constant (override per build with
 * -DDIST_PORT=<n>) because it is a deploy-wide fact, identical across every pod
 * — not a per-instance runtime setting. Pinning a single port (vs the BEAM's
 * default ephemeral port) is what makes the dist channel firewallable with a
 * NetworkPolicy. The launcher always emits min==max so there is NO ephemeral
 * fallback path.
 *
 * 24369: below the Linux ephemeral range (32768+, avoids kernel port-reuse
 * races), unregistered in IANA/known exporters (steers clear of 9090/9100/9200
 * Prometheus & friends), and self-documenting — "4369-flavoured" reads as
 * Erlang distribution to anyone debugging (epmd is 4369).
 */
#ifndef DIST_PORT
#define DIST_PORT 24369
#endif
/* Stringify helpers so DIST_PORT (an int macro) can be passed as an argv
 * string. Two levels are required: `#` stringifies its operand without
 * expanding it, so STR(x) first expands x through the extra call and STR2 then
 * turns the expanded value into a string literal (STR(DIST_PORT) -> "24369"). */
#define STR2(x) #x
#define STR(x) STR2(x)
/* Fatal-error exit: print "start: <what>: <strerror(errno)>" to stderr and
 * terminate the process with status 1. Call it immediately after the failing
 * call so errno still describes that failure. Never returns. */
static void die(const char *what)
{
    const char *reason = strerror(errno);

    fprintf(stderr, "start: %s: %s\n", what, reason);
    exit(1);
}
/*
 * Resolve the release root. The binary lives at <root>/bin/<name>, so the root
 * is two directories up from the resolved executable path. This makes the
 * launcher location-independent (works whether the release sits at /app or in
 * _build/... locally).
 *
 * argv0:  argv[0] as received by main(); may be NULL.
 * root:   output buffer, always left NUL-terminated (when root_n > 0).
 * root_n: size of `root` in bytes.
 *
 * Falls back to the compile-time RELEASE_ROOT default if argv[0] can't be
 * resolved (e.g. invoked via a bare name with no path info), if the resolved
 * path has fewer than two components, or if the resolved root does not fit in
 * `root` (a silently truncated root would point at the wrong directory).
 */
static void resolve_root(const char *argv0, char *root, size_t root_n) {
    /* Let realpath() allocate the result: a caller-supplied buffer must be at
     * least PATH_MAX bytes, so a smaller fixed array can be overrun. */
    char *resolved = argv0 ? realpath(argv0, NULL) : NULL;
    if (resolved) {
        /* strip /<name> */
        char *slash = strrchr(resolved, '/');
        if (slash) *slash = '\0';
        /* strip /bin */
        slash = strrchr(resolved, '/');
        if (slash) {
            *slash = '\0';
            int len = snprintf(root, root_n, "%s", resolved);
            if (len >= 0 && (size_t)len < root_n) {
                free(resolved);
                return;
            }
            fprintf(stderr, "start: warning: release root %s is too long, "
                    "falling back to %s\n", resolved, RELEASE_ROOT);
        }
        free(resolved);
    }
    snprintf(root, root_n, "%s", RELEASE_ROOT);
}
/*
 * Load the two whitespace-separated tokens "<erts_vsn> <release_vsn>" from
 * <root>/releases/start_erl.data.
 *
 * erts / erts_n: output buffer (and its size) for the ERTS version.
 * vsn  / vsn_n:  output buffer (and its size) for the release version.
 *
 * Each token is read with a 127-character limit. Exits the process with
 * status 1 if the file cannot be opened (via die) or does not contain two
 * tokens.
 */
static void read_versions(const char *root,
                          char *erts, size_t erts_n, char *vsn, size_t vsn_n)
{
    char data_file[2048];
    char erts_tok[128] = {0};
    char vsn_tok[128] = {0};
    FILE *in;
    int fields;

    snprintf(data_file, sizeof data_file, "%s/releases/start_erl.data", root);

    in = fopen(data_file, "r");
    if (in == NULL)
        die("open start_erl.data");

    fields = fscanf(in, "%127s %127s", erts_tok, vsn_tok);
    if (fields != 2) {
        fprintf(stderr, "start: could not parse %s\n", data_file);
        exit(1);
    }
    fclose(in);

    snprintf(erts, erts_n, "%s", erts_tok);
    snprintf(vsn, vsn_n, "%s", vsn_tok);
}
/* Export k=v, replacing any existing value. A setenv() failure is fatal:
 * the process exits through die(). */
static void set(const char *k, const char *v)
{
    int rc = setenv(k, v, 1);

    if (rc != 0)
        die("setenv");
}
/* Export k=v only when k is absent from the environment, so a value already
 * provided by the pod (K8s overrides) wins. A variable that is present but
 * empty counts as provided. A setenv() failure is fatal (die). */
static void set_default(const char *k, const char *v)
{
    if (getenv(k) != NULL)
        return;

    if (setenv(k, v, 1) != 0)
        die("setenv");
}
#ifndef EPMDLESS
/*
 * Pre-start epmd as a detached daemon via a native fork+exec (NO shell).
 * epmd self-daemonises with -daemon and exits the foreground process once the
 * listener is up, so we wait for that child to reap it. After this returns,
 * epmd is listening on ERL_EPMD_PORT (default 4369); the caller passes
 * -start_epmd false so erlexec never tries to launch it via system().
 *
 * If epmd is already running (e.g. restarted container, shared netns), the
 * second daemon just exits 0 — harmless.
 *
 * bindir: the ERTS bin directory that contains the epmd executable.
 *
 * Best-effort: a missing epmd binary, an over-long path, or a non-zero /
 * signalled daemoniser only produces a warning. fork() and waitpid() failures
 * are fatal (die).
 *
 * Compiled out entirely in EPMD-less mode (there is no epmd).
 */
static void start_epmd(const char *bindir) {
    /* Sized above the longest bindir the launcher builds (struct paths.bindir
     * is 1100 bytes) plus "/epmd"; truncation is still checked explicitly so
     * we never stat/exec a clipped path. */
    char epmd[1200];
    int len = snprintf(epmd, sizeof epmd, "%s/epmd", bindir);
    if (len < 0 || (size_t)len >= sizeof epmd) {
        fprintf(stderr, "start: warning: epmd path under %s is too long, "
                "skipping pre-start\n", bindir);
        return;
    }
    struct stat st;
    if (stat(epmd, &st) != 0) {
        /* No epmd in the bundled ERTS (unusual) — let erlexec deal with it. */
        fprintf(stderr, "start: warning: %s not found, skipping pre-start\n", epmd);
        return;
    }
    pid_t pid = fork();
    if (pid < 0) die("fork epmd");
    if (pid == 0) {
        /* child: become `epmd -daemon` (no other epmd flags are passed). */
        char *argv[] = { epmd, "-daemon", NULL };
        execv(epmd, argv);
        /* only reached on failure */
        fprintf(stderr, "start: execv epmd: %s\n", strerror(errno));
        _exit(127);
    }
    /* parent: reap the daemoniser (it forks the real daemon then exits).
     * Retry when a signal interrupts the wait, so the child is not left
     * unreaped and a benign EINTR does not abort the boot. */
    int status = 0;
    while (waitpid(pid, &status, 0) < 0) {
        if (errno != EINTR) die("waitpid epmd");
    }
    if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
        fprintf(stderr, "start: epmd -daemon exited %d (continuing; "
                "may already be running)\n", WEXITSTATUS(status));
    } else if (WIFSIGNALED(status)) {
        fprintf(stderr, "start: epmd -daemon killed by signal %d "
                "(continuing)\n", WTERMSIG(status));
    }
}
#endif /* !EPMDLESS */
/*
 * Per-boot path set shared by both subcommands. main() fills every field once
 * and hands the struct to run_start()/run_task() read-only.
 */
struct paths {
    /* release root (resolved from argv[0]) */
    char root[1024];
    /* <root>/erts-<erts_vsn>/bin */
    char bindir[1100];
    /* <bindir>/erlexec */
    char erlexec[1200];
    /* releases/<v>/start (server) */
    char start_boot[1200];
    /* releases/<v>/start_clean (eval) */
    char clean_boot[1200];
    /* releases/<v>/sys */
    char cfg[1200];
    /* releases/<v>/vm.args */
    char args[1200];
    /* <root>/lib */
    char lib[1100];
    /* <root>/tmp */
    char tmp[1100];
    /* release version read from start_erl.data */
    char vsn[128];
};
/*
 * Read the COOKIE (RELEASE_COOKIE env, else the first line of
 * <root>/releases/COOKIE). Required only for the distributed `start`; eval
 * runs non-distributed and ignores it.
 *
 * p:      resolved paths; only p->root is used.
 * cookie: output buffer; always NUL-terminated on return when n > 0.
 * n:      size of `cookie` in bytes. Longer cookies are truncated to n-1.
 *
 * Returns 1 if a non-empty cookie was stored in `cookie`, 0 otherwise.
 */
static int read_cookie(const struct paths *p, char *cookie, size_t n) {
    if (n == 0) return 0;
    /* Start from a defined empty string so the result never depends on the
     * caller having zero-initialised the buffer. */
    cookie[0] = '\0';
    const char *env_cookie = getenv("RELEASE_COOKIE");
    if (env_cookie && *env_cookie) {
        snprintf(cookie, n, "%s", env_cookie);
        return cookie[0] != '\0';
    }
    char cpath[1200];
    snprintf(cpath, sizeof cpath, "%s/releases/COOKIE", p->root);
    FILE *cf = fopen(cpath, "r");
    if (cf) {
        /* n is a small fixed buffer (caller passes sizeof cookie); cap to INT_MAX
         * defensively so the size_t->int narrowing fgets() requires is explicit. */
        int cap = n > (size_t)INT_MAX ? INT_MAX : (int)n;
        if (fgets(cookie, cap, cf)) {
            /* drop the line terminator, if any */
            cookie[strcspn(cookie, "\r\n")] = '\0';
        } else {
            /* EOF or read error: the buffer contents are not reliable. */
            cookie[0] = '\0';
        }
        fclose(cf);
    }
    return cookie[0] != '\0';
}
/*
 * Export the environment every invocation needs: what the erts `erl` script
 * exports for erlexec, plus what the config provider reads. Distribution and
 * PHX_SERVER are set per subcommand by the caller.
 *
 * The first group is always overwritten; RELEASE_MODE and RELEASE_TMP are
 * defaults that an already-set environment variable overrides.
 */
static void set_common_env(const struct paths *p)
{
    const struct {
        const char *key;
        const char *val;
    } forced[] = {
        { "EMU",                "beam"       },
        { "ROOTDIR",            p->root      },
        { "BINDIR",             p->bindir    },
        { "PROGNAME",           "erl"        },
        { "RELEASE_ROOT",       p->root      },
        { "RELEASE_NAME",       RELEASE_NAME },
        { "RELEASE_VSN",        p->vsn       },
        { "RELEASE_SYS_CONFIG", p->cfg       },
    };
    size_t i;

    for (i = 0; i < sizeof forced / sizeof forced[0]; i++)
        set(forced[i].key, forced[i].val);

    set_default("RELEASE_MODE", "embedded");
    set_default("RELEASE_TMP", p->tmp);
}
/*
 * Validate the distribution cookie. The cookie is the shared secret guarding
 * distributed Erlang — a remote-code-execution channel. We fail CLOSED on a
 * missing or obviously-weak cookie rather than silently booting an insecure
 * node. (TLS distribution, when enabled, is the primary defence; this check is
 * a second, independent layer.)
 *
 * "Weak" heuristics: empty, shorter than the minimum length (32 by default),
 * or equal to the literal release name.
 * A real cookie should be a high-entropy 32+ char secret from a K8s Secret.
 *
 * Environment:
 *   DIST_COOKIE_MIN_LEN      overrides the minimum length; must be a positive
 *                            decimal integer, anything else is ignored with a
 *                            warning and the default is kept.
 *   DIST_ALLOW_WEAK_COOKIE   bypasses every check ONLY when set to exactly
 *                            "1" (intended for local/CI, never production).
 *                            Values such as "0", "false" or the empty string
 *                            do NOT disable the check.
 *
 * Returns 1 if the cookie is acceptable, 0 (after printing the reason to
 * stderr) if the launcher must refuse to boot.
 */
static int cookie_is_acceptable(const char *cookie) {
    /* Exact match only: treating any set value as "allow" would make
     * DIST_ALLOW_WEAK_COOKIE=0 silently turn the protection off. */
    const char *allow_weak = getenv("DIST_ALLOW_WEAK_COOKIE");
    if (allow_weak && strcmp(allow_weak, "1") == 0) return 1;
    size_t len = strlen(cookie);
    size_t min = 32;
    const char *env_min = getenv("DIST_COOKIE_MIN_LEN");
    if (env_min && *env_min) {
        char *end = NULL;
        errno = 0;
        long v = strtol(env_min, &end, 10);
        if (errno == 0 && end != env_min && *end == '\0' && v > 0) {
            min = (size_t)v;
        } else {
            fprintf(stderr, "start: warning: ignoring invalid "
                    "DIST_COOKIE_MIN_LEN '%s' (keeping minimum %zu)\n",
                    env_min, min);
        }
    }
    if (len == 0) {
        fprintf(stderr, "start: refusing to boot: distribution cookie is empty\n");
        return 0;
    }
    if (len < min) {
        fprintf(stderr, "start: refusing to boot: distribution cookie too short "
                "(%zu < %zu). Use a high-entropy RELEASE_COOKIE from a "
                "Secret, or set DIST_ALLOW_WEAK_COOKIE=1 for local/CI.\n",
                len, min);
        return 0;
    }
    if (strcmp(cookie, RELEASE_NAME) == 0) {
        fprintf(stderr, "start: refusing to boot: cookie equals the release "
                "name (predictable)\n");
        return 0;
    }
    return 1;
}
/*
 * `start`: the long-running, distributed Phoenix server. Mirrors
 * rel/overlays/bin/server + rel/env.sh.eex + the frozen `start` argv.
 *
 * Steps: export the per-boot environment, read and validate the cookie, check
 * the TLS option file and cert bundle (REQUIRE_TLS builds only), pre-start
 * epmd (unless EPMDLESS), then execv() erlexec.
 *
 * Returns 1 when a pre-flight check refuses to boot. On success execv() does
 * not return; if execv() itself fails the process exits through die().
 */
static int run_start(const struct paths *p) {
    set("RELEASE_BOOT_SCRIPT", "start");
    /* Distribution (rel/env.sh.eex). POD_IP from the K8s downward API. */
    set("RELEASE_DISTRIBUTION", "name");
    const char *pod_ip = getenv("POD_IP");
    if (!pod_ip || !*pod_ip) {
        fprintf(stderr, "start: warning: POD_IP unset, using 127.0.0.1\n");
        pod_ip = "127.0.0.1";
    }
    /* RELEASE_NAME is passed as an argument, never as part of the format
     * string, and a node name that does not fit is rejected instead of being
     * silently clipped to a different identity. */
    char node[256];
    int node_len = snprintf(node, sizeof node, "%s@%s", RELEASE_NAME, pod_ip);
    if (node_len < 0 || (size_t)node_len >= sizeof node) {
        fprintf(stderr, "start: refusing to boot: node name %s@%s is too long\n",
                RELEASE_NAME, pod_ip);
        return 1;
    }
    set("RELEASE_NODE", node);
    /* Export the pinned dist port so the EpmdLess module (and anything else)
     * resolves to the SAME port the VM listens on. This is a forced set(), not
     * a default: the listener below always uses the compile-time DIST_PORT, so
     * letting an inherited DIST_PORT survive would make resolver and listener
     * disagree. */
    set("DIST_PORT", STR(DIST_PORT));
    /* Phoenix: start the HTTP endpoint (runtime.exs gates on PHX_SERVER). */
    set("PHX_SERVER", "true");
    char cookie[256] = {0};
    if (!read_cookie(p, cookie, sizeof cookie)) {
        fprintf(stderr, "start: no RELEASE_COOKIE and no releases/COOKIE\n");
        return 1;
    }
    if (!cookie_is_acceptable(cookie)) {
        return 1;
    }
    /*
     * TLS distribution. When REQUIRE_TLS is set (the default; the mix step
     * passes -DREQUIRE_TLS=1 unless require_tls: false), the launcher ALWAYS
     * runs mutual-TLS distribution and refuses to boot if the cert bundle is
     * missing — there is no runtime switch to forget. This makes a stolen
     * cookie insufficient to join the cluster: a peer must also present a cert
     * signed by our dist CA. Certs are mounted read-only at <root>/dist-certs
     * (a K8s Secret); the SSL options live in <root>/dist/inet_tls.config.
     *
     * With require_tls: false (non-clustered apps), distribution is cookie-only.
     */
    char optfile[1300];
    snprintf(optfile, sizeof optfile, "%s/dist/inet_tls.config", p->root);
#ifdef REQUIRE_TLS
    if (access(optfile, R_OK) != 0) {
        fprintf(stderr, "start: refusing to boot: TLS dist options file %s is "
                "unreadable: %s\n", optfile, strerror(errno));
        return 1;
    }
    static const char *const cert_files[] = {
        "dist-certs/dist-cert.pem",
        "dist-certs/dist-key.pem",
        "dist-certs/dist-ca.pem",
    };
    for (size_t i = 0; i < sizeof cert_files / sizeof cert_files[0]; i++) {
        char path[1300];
        snprintf(path, sizeof path, "%s/%s", p->root, cert_files[i]);
        if (access(path, R_OK) != 0) {
            fprintf(stderr, "start: refusing to boot: distribution cert %s is "
                    "unreadable: %s\n", path, strerror(errno));
            return 1;
        }
    }
#endif
#ifndef EPMDLESS
    /* Pre-start epmd natively so erlexec never shells out for it.
     * (Not needed in EPMD-less mode — there is no epmd at all.) */
    start_epmd(p->bindir);
#endif
    /*
     * Build the `start` argv:
     * - frozen launcher argv (mode/boot/config/args)
     * - cookie + -name (distribution identity)
     * - "-start_epmd false" (epmd already up; erlexec must NOT system())
     * - "-kernel inet_dist_listen_min/max DIST_PORT" (pin the dist port so it
     *   is firewallable; min==max => exactly one port, no ephemeral fallback)
     * - "-proto_dist inet_tls -ssl_dist_optfile <optfile>" (mTLS; REQUIRE_TLS)
     *
     * At most 36 slots are used (all optional groups enabled, including the
     * terminating NULL), so 40 leaves headroom.
     */
    char *argv[40];
    int n = 0;
    argv[n++] = (char *)p->erlexec;
    argv[n++] = "-noshell";
    argv[n++] = "-s"; argv[n++] = "elixir"; argv[n++] = "start_cli";
    argv[n++] = "-mode"; argv[n++] = "embedded";
    argv[n++] = "-setcookie"; argv[n++] = cookie;
    argv[n++] = "-name"; argv[n++] = node;
    argv[n++] = "-start_epmd"; argv[n++] = "false";
#ifdef EPMDLESS
    /* EPMD-less: resolve every node's dist port to a constant via our module,
     * so no epmd (and no port 4369) is needed at all. The module reads the same
     * DIST_PORT exported above, so listener and resolver always agree. */
    argv[n++] = "-epmd_module"; argv[n++] = "Elixir.ShelllessRelease.EpmdLess";
#endif
    argv[n++] = "-kernel";
    argv[n++] = "inet_dist_listen_min"; argv[n++] = STR(DIST_PORT);
    argv[n++] = "inet_dist_listen_max"; argv[n++] = STR(DIST_PORT);
#ifdef REQUIRE_TLS
    argv[n++] = "-proto_dist"; argv[n++] = "inet_tls";
    argv[n++] = "-ssl_dist_optfile"; argv[n++] = optfile;
#endif
    argv[n++] = "-config"; argv[n++] = (char *)p->cfg;
    argv[n++] = "-boot"; argv[n++] = (char *)p->start_boot;
    argv[n++] = "-boot_var"; argv[n++] = "RELEASE_LIB"; argv[n++] = (char *)p->lib;
    argv[n++] = "-args_file"; argv[n++] = (char *)p->args;
    argv[n++] = "-extra"; argv[n++] = "--no-halt";
    argv[n] = NULL;
    execv(p->erlexec, argv);
    /* only reached if execv() failed */
    die("execv erlexec (start)");
    return 127;
}
/*
 * run_task: evaluate ONE compiled-in expression on a fresh, non-distributed
 * node booted from start_clean, then halt. This is the locked-down
 * replacement for the generic `eval EXPR`.
 *
 * SECURITY: `expr` is NEVER taken from argv. Callers pass only entries of the
 * fixed TASKS table, so no code path feeds an attacker-controlled string to
 * the BEAM. The verbs are a capability whitelist: even with full control of
 * the container's command/args, the only reachable behaviours are `server`
 * and the audited task verbs. (The stock release `eval`/`rpc`/`remote` RCE
 * surface is simply not implemented.)
 *
 * Non-distributed => NO epmd and NO /bin/sh involved; there is no
 * -mode/-name/-setcookie, and the argv ends with `-extra --eval EXPR --`.
 *
 * On success execv() does not return; if it fails the process exits through
 * die().
 */
static int run_task(const struct paths *p, const char *expr)
{
    char *av[20];
    int k = 0;

    set("RELEASE_BOOT_SCRIPT", "start_clean");
    /* non-distributed; make sure runtime.exs doesn't start the server. */
    set("RELEASE_DISTRIBUTION", "none");
    unsetenv("PHX_SERVER");

    av[k++] = (char *)p->erlexec;
    av[k++] = "-noshell";
    av[k++] = "-s";
    av[k++] = "elixir";
    av[k++] = "start_cli";
    av[k++] = "-config";
    av[k++] = (char *)p->cfg;
    av[k++] = "-boot";
    av[k++] = (char *)p->clean_boot;
    av[k++] = "-boot_var";
    av[k++] = "RELEASE_LIB";
    av[k++] = (char *)p->lib;
    av[k++] = "-args_file";
    av[k++] = (char *)p->args;
    av[k++] = "-extra";
    av[k++] = "--eval";
    av[k++] = (char *)expr;
    av[k++] = "--";
    av[k] = NULL;

    execv(p->erlexec, av);
    die("execv erlexec (task)");
    return 127;
}
/*
* The whitelist. Each verb maps to exactly one compiled-in Elixir expression.
* To add an operation you change config and rebuild — it cannot be injected at
* runtime.
*
* The table is GENERATED from the consuming app's config by the mix release
* step (ShelllessRelease) into shellless_tasks.h, then #included here. This
* keeps the whitelist a compile-time constant (the security property) while
* letting each app declare its own verbs without forking this source.
*
* The generated header must define TASKS_INIT as a brace-enclosed initialiser
* list of {verb, expr} pairs, e.g.:
* #define TASKS_INIT \
* { "migrate", "MyApp.Release.migrate()" }, \
* { "seed", "MyApp.Release.seed()" },
* If no header is provided (e.g. compiling the source standalone for a syntax
* check), TASKS_INIT defaults to empty — server-only, no task verbs.
*
* Tasks that need caller-supplied args (e.g. rollback(repo, version)) are
* intentionally unsupported: a zero-arg fixed verb cannot take an argument, so
* there is no path to smuggle an expression in.
*/
#if defined(__has_include)
# if __has_include("shellless_tasks.h")
# include "shellless_tasks.h"
# endif
#endif
/* No whitelist supplied: default to an empty table (server only). */
#ifndef TASKS_INIT
#define TASKS_INIT
#endif

/* One whitelist entry: a command verb and the fixed expression it runs. */
struct task {
    const char *verb;
    const char *expr;
};

/* The compiled-in whitelist and its entry count. */
static const struct task TASKS[] = {
    TASKS_INIT
};
static const int N_TASKS = (int)(sizeof TASKS / sizeof *TASKS);
/*
 * Report whether `name` is one of the installed entry-point names, i.e.
 * "server", its alias "start", or a verb from the TASKS whitelist. main() uses
 * this to recognise argv[0] when invoked as /app/bin/<name>.
 *
 * Returns 1 for a known command, 0 otherwise.
 */
static int is_known_command(const char *name)
{
    int idx = 0;

    if (!strcmp(name, "server") || !strcmp(name, "start"))
        return 1;

    while (idx < N_TASKS) {
        if (!strcmp(TASKS[idx].verb, name))
            return 1;
        idx++;
    }
    return 0;
}
/* Exit with an error if an snprintf() result shows the output did not fit in
 * a buffer of `cap` bytes (or that formatting failed). `what` names the path
 * in the message. */
static void require_fit(int written, size_t cap, const char *what) {
    if (written < 0 || (size_t)written >= cap) {
        fprintf(stderr, "start: %s path is too long\n", what);
        exit(1);
    }
}
/*
 * Entry point: resolve the release layout, export the common environment,
 * pick the command from argv and dispatch to run_start() or run_task().
 *
 * Exit status: 1 for an unknown command, unexpected arguments, or any failed
 * pre-flight check; otherwise the process is replaced by erlexec.
 */
int main(int argc, char **argv) {
    char erts[128];
    struct paths p;
    /* Resolve the release root from our own location (<root>/bin/<name>), so
     * the launcher works wherever the release is unpacked, not just /app. */
    resolve_root(argv[0], p.root, sizeof p.root);
    read_versions(p.root, erts, sizeof erts, p.vsn, sizeof p.vsn);
    /* Every derived path is checked for truncation: a clipped path would make
     * the launcher exec or load the wrong file instead of failing cleanly. */
    require_fit(snprintf(p.bindir, sizeof p.bindir, "%s/erts-%s/bin", p.root, erts),
                sizeof p.bindir, "bindir");
    require_fit(snprintf(p.erlexec, sizeof p.erlexec, "%s/erlexec", p.bindir),
                sizeof p.erlexec, "erlexec");
    require_fit(snprintf(p.start_boot, sizeof p.start_boot, "%s/releases/%s/start", p.root, p.vsn),
                sizeof p.start_boot, "start boot script");
    require_fit(snprintf(p.clean_boot, sizeof p.clean_boot, "%s/releases/%s/start_clean", p.root, p.vsn),
                sizeof p.clean_boot, "start_clean boot script");
    require_fit(snprintf(p.cfg, sizeof p.cfg, "%s/releases/%s/sys", p.root, p.vsn),
                sizeof p.cfg, "sys config");
    require_fit(snprintf(p.args, sizeof p.args, "%s/releases/%s/vm.args", p.root, p.vsn),
                sizeof p.args, "vm.args");
    require_fit(snprintf(p.lib, sizeof p.lib, "%s/lib", p.root),
                sizeof p.lib, "lib");
    require_fit(snprintf(p.tmp, sizeof p.tmp, "%s/tmp", p.root),
                sizeof p.tmp, "tmp");
    set_common_env(&p);
    /*
     * Command resolution preserves the ORIGINAL interface. This is a multi-call
     * binary (busybox-style): the same binary is installed at every
     * <root>/bin/<name> entry point (server plus one per whitelisted task
     * verb), so deploy manifests keep invoking the exact same paths they
     * always have — no manifest changes needed.
     *
     * Resolution order:
     * 1. basename(argv[0]) — when invoked as /app/bin/<name> (the real,
     *    unchanged interface). "start"/"server" both mean the server.
     * 2. else argv[1] — the subcommand form /app/<launcher> <verb>, handy for
     *    a single CMD and for local use. With no argv[1] the command defaults
     *    to "server".
     *
     * Either way the command must be in the whitelist (server | TASKS) and
     * takes ZERO arguments; no caller-supplied expression is ever accepted.
     */
    const char *base = argv[0] ? argv[0] : "start";
    const char *slash = strrchr(base, '/');
    if (slash) base = slash + 1;
    const char *cmd;
    int extra_args;
    if (is_known_command(base)) {
        /* invoked as /app/bin/<name> */
        cmd = base;
        extra_args = (argc > 1);
    } else {
        /* invoked as a launcher with a subcommand (default: server) */
        cmd = (argc > 1) ? argv[1] : "server";
        extra_args = (argc > 2);
    }
    if (strcmp(cmd, "server") == 0 || strcmp(cmd, "start") == 0) {
        if (extra_args) { fprintf(stderr, "start: '%s' takes no arguments\n", cmd); return 1; }
        return run_start(&p);
    }
    for (int i = 0; i < N_TASKS; i++) {
        if (strcmp(cmd, TASKS[i].verb) == 0) {
            if (extra_args) { fprintf(stderr, "start: '%s' takes no arguments\n", cmd); return 1; }
            return run_task(&p, TASKS[i].expr);
        }
    }
    /* Unknown command: reject and print the whitelist. */
    fprintf(stderr, "start: unknown command '%s'\n", cmd);
    fprintf(stderr, "allowed: server (default)");
    for (int i = 0; i < N_TASKS; i++) fprintf(stderr, " | %s", TASKS[i].verb);
    fprintf(stderr, "\n");
    return 1;
}