/*
* linx_process -- the Port binary backing `Linx.Process`.
*
* Linx.Process performs operations that cannot live inside the multithreaded
* BEAM: clone(), setns(), fork() and execve(). Doing those in the BEAM
* corrupts the VM, so they run in
* this separate OS process, spawned via Port.open from Elixir.
*
* CONTROL CHANNEL
* ---------------
* The Erlang Port is opened with `:nouse_stdio` and `{:packet, 4}`. That
* leaves fd 0/1/2 free for the workload (P4) and gives us fd 3 (BEAM -> us)
* and fd 4 (us -> BEAM) for control traffic. Every message is a 4-byte
* big-endian length prefix followed by an Erlang External Term Format
* payload; ETF means the BEAM side needs no codec.
*
* TWO MODES
* ---------
* The BEAM sends one of two requests on fd 3:
*
* {:spawn, %{argv, namespaces?, env?}}
* -- clone() a child with the requested CLONE_NEW* flags. The child
* is born in those fresh namespaces.
*
* {:enter, %{target, argv, namespaces?, env?}}
* -- setns() the agent itself into the namespaces of host pid
* `target`, then fork(). The fork's child inherits those
* namespaces. `namespaces` chooses which of the target's
* namespaces to join; if absent, join all of them.
*
* Both modes share the rest of the protocol: the parent reports the host
* pid as {:status, :spawned, _}, the child reaches the checkpoint and
* the parent reports {:status, :ready, child_pid_inside_ns}, the BEAM
* does any host-side setup (optionally including K2 cap_* commands that
* the parent forwards to the child) and replies :proceed, the parent
* forwards that to the child over an internal pipe, the child execve()s
* and the parent reports {:status, :running}. On waitpid,
* {:status, :exited, code} or {:status, :signaled, signum} terminates
* the session. Pre-exec failures arrive as {:error, errno, stage}.
*
* THE RELAY
* ---------
* The agent process talks to the BEAM on fd 3/4. The cloned child does not
* touch the BEAM channel; instead, two internal pipes carry the checkpoint
* handshake:
*
* `c2p` (child writes, parent reads, O_CLOEXEC on the child end): a
* stream of {:packet, 4} ei frames, same encoding as p2c and the BEAM
* channel. Recognised frames:
* - {:ready, pidns_internal_child_pid}
* - {:error, errno, stage_atom} -- pre-exec failure; stage is
* :execve / :stdio / :chdir / :cap_drop_bounding /
* :cap_set_thread / :cap_set_ambient / :seccomp_install /
* :seccomp_no_new_privs (see enum stage / stage_name in this file)
* - EOF (the child execve'd successfully and CLOEXEC closed the
* pipe) -> :running
*
* `p2c` (parent writes, child reads): a stream of {:packet, 4} ei frames,
* same encoding as the BEAM channel. Recognised frames:
* - :proceed -- sentinel that ends the child's checkpoint loop;
* child falls through to apply_stdio + execve
* - {:cap_drop_bounding, mask},
* {:cap_set_thread, eff, prm, inh},
* {:cap_set_ambient, mask} -- K2 capability commands; child
* applies the corresponding prctl/capset syscall per-thread
* - {:seccomp_install, <<bpf>>} -- S2 seccomp install; child sets
* PR_SET_NO_NEW_PRIVS if not already on, then calls
* seccomp(SECCOMP_SET_MODE_FILTER) with the cBPF blob
* - EOF (parent closed without writing :proceed) -> :abort path;
* child _exits 102
*
* The CLOEXEC trick on the c2p pipe is how the parent learns the
* execve succeeded: nothing to write -- the kernel auto-closes the fd at
* exec time, the parent sees EOF, and emits :running. If execve fails,
* the child writes the {:error, errno, stage} frame BEFORE the close-
* on-exec would trigger, so the parent sees the failure with detail.
*
* The parent also polls c2p alongside the BEAM channel during the
* checkpoint window (see await_proceed) so that a K2 cap-command
* failure in the child surfaces as {:error, errno, stage} immediately,
* rather than getting stranded until :proceed is sent.
*
* EXIT CODES (of this agent, not the workload)
* --------------------------------------------
* 0 success (workload was reported on, agent terminating normally)
* 1 I/O failure on the BEAM channel (no emit possible)
* 2 malformed request -- emits {:error, EINVAL, :malformed_request}
* 3 clone()/fork() failed -- emits {:error, errno, :clone | :fork}
* 4 internal infrastructure failure -- emits {:error, errno, stage}
* where stage is :sigprocmask | :pipe2 | :signalfd | :posix_openpt
* | :ptsetup | :ptsname | :pts_open | :ready_frame |
* :malformed_ready | :exec_outcome
*
* Every non-zero exit except code 1 emits a structured error on fd 4
* before bailing -- so the BEAM-side GenServer always sees a clean
* terminal even when the agent dies before sending a :status frame.
* Code 1 paths are silent because the BEAM channel is the failure
* cause; a write would just EPIPE.
*
* The BEAM falls back to a synthesised {:linx_process, :error, exit_code,
* :agent_died} owner message if the agent exits without having emitted
* anything (truly catastrophic -- segfault, OOM-kill, …).
*
* The workload's own exit code is reported as {:status, :exited, code}.
*
* KERNEL FLOOR
* ------------
* This file uses syscalls and constants that need Linux >= 5.8:
*
* - clone(2) namespace flags: CLONE_NEWUSER (3.8+), CLONE_NEWCGROUP
* (4.6+), CLONE_NEWTIME (5.6+), and the older NEW* flags.
* - prctl(PR_CAPBSET_DROP) -- 2.6.25+
* - prctl(PR_CAP_AMBIENT_*) -- 4.3+
* - prctl(PR_SET_NO_NEW_PRIVS) -- 3.5+
* - capset(2) v3 layout -- 2.6.26+
* - CAP_LAST_CAP = 40 (cap_checkpoint_restore) -- 5.8+
* - seccomp(2) SECCOMP_SET_MODE_FILTER -- 3.17+
* - signalfd(2) -- 2.6.22+
* - pipe2(2) -- 2.6.27+
*
* 5.8 is the effective floor (driven by the cap table in
* Linx.Capabilities.Constants). Older kernels may compile, but the
* cap-table forward-compat path will treat any post-2.6.25 missing
* caps as :unknown rather than refusing to run.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <ei.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/capability.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <poll.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/signalfd.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>
/* Control-channel file descriptors. The port is opened with :nouse_stdio,
 * so the BEAM talks to us on fd 3 (BEAM -> agent) and listens on fd 4
 * (agent -> BEAM); fd 0/1/2 stay free for the workload. */
#define CTL_IN 3
#define CTL_OUT 4
/* The clone child gets a private 1 MiB stack; it does not recurse, so this is
* ample. clone() needs the *top* of the stack since it grows down. */
#define CHILD_STACK_SIZE (1024 * 1024)
/* Stage tags identifying which step of the pre-exec setup failed.
 * Encoded as the third element of a {:error, errno, stage_atom} ei
 * frame the child sends to the parent over c2p; the parent forwards
 * the atom to the BEAM via emit_error. The integer values are an
 * internal enum (used by stage_name); the strings are what the BEAM
 * sees.
 *
 * Every enumerator needs a matching case in stage_name(); a value
 * without one is reported with the string "unknown". */
enum stage {
STAGE_EXECVE = 1, /* execve(2) itself failed in the child */
STAGE_STDIO = 2, /* per-fd plumbing in child: /dev/null, AF_UNIX connect, PTY ioctl */
STAGE_CAP_DROP_BOUNDING = 3, /* prctl(PR_CAPBSET_DROP) failed in the child */
STAGE_CAP_SET_THREAD = 4, /* capset(2) failed in the child */
STAGE_CAP_SET_AMBIENT = 5, /* prctl(PR_CAP_AMBIENT_*) failed in the child */
STAGE_SECCOMP_NO_NEW_PRIVS = 6, /* prctl(PR_SET_NO_NEW_PRIVS) failed in the child */
STAGE_SECCOMP_INSTALL = 7, /* seccomp(SECCOMP_SET_MODE_FILTER) failed in the child */
STAGE_CHDIR = 8, /* chdir(:cwd) failed in the child before execve */
};
static const char *stage_name(enum stage s)
{
switch (s) {
case STAGE_EXECVE: return "execve";
case STAGE_STDIO: return "stdio";
case STAGE_CAP_DROP_BOUNDING: return "cap_drop_bounding";
case STAGE_CAP_SET_THREAD: return "cap_set_thread";
case STAGE_CAP_SET_AMBIENT: return "cap_set_ambient";
case STAGE_SECCOMP_NO_NEW_PRIVS: return "seccomp_no_new_privs";
case STAGE_SECCOMP_INSTALL: return "seccomp_install";
case STAGE_CHDIR: return "chdir";
}
return "unknown";
}
/* Namespace types the agent knows about. `atom` is the name on the Elixir
* side and on the wire; `proc` is the filename under /proc/<pid>/ns/
* (note that the mount namespace is `mnt` in procfs, not `mount`); `flag`
* is the CLONE_NEW* bit for `clone(2)` in create mode.
*
* The list is in setns-safe order for enter mode: user first (so any
* later setns calls have capabilities in the new user namespace), pid
* last (it only takes effect on future fork()s, so must happen before
* the fork). The order is irrelevant for create mode, where the flags
* are OR'd into a single clone() call. */
/* One row of the namespace table: how a namespace type is spelled on the
 * wire, under /proc, and as a clone(2) flag. */
struct ns_info {
const char *atom; /* atom name on the wire; compared with strcmp in decode_ns_list */
const char *proc; /* file name under /proc/<pid>/ns/ */
int flag; /* CLONE_NEW* bit for this namespace type */
};
/* The known namespace types, listed user-first and pid-last. The final
 * all-NULL/0 row is a sentinel: every loop over this table runs until
 * `atom` is NULL. */
static const struct ns_info NS_INFO[] = {
{ "user", "user", CLONE_NEWUSER },
{ "mount", "mnt", CLONE_NEWNS },
{ "uts", "uts", CLONE_NEWUTS },
{ "ipc", "ipc", CLONE_NEWIPC },
{ "cgroup", "cgroup", CLONE_NEWCGROUP },
{ "net", "net", CLONE_NEWNET },
{ "time", "time", CLONE_NEWTIME },
{ "pid", "pid", CLONE_NEWPID },
{ NULL, NULL, 0 }, /* sentinel */
};
/* --- low-level I/O on fd 3/4 -------------------------------------------- */
/* Read exactly `count` bytes from `fd` into `buf`, retrying short reads
 * and EINTR.
 *
 * Returns 0 once all bytes have arrived. Returns -1 on a read error
 * (errno from read(2)) or on EOF before `count` bytes were seen; EOF is
 * distinguished by errno being set to 0. */
static int read_exact(int fd, void *buf, size_t count)
{
    uint8_t *cursor = buf;
    size_t remaining = count;

    while (remaining != 0) {
        ssize_t got = read(fd, cursor, remaining);

        if (got > 0) {
            cursor += got;
            remaining -= (size_t)got;
            continue;
        }
        if (got == 0) {
            /* Peer closed before the full count arrived. */
            errno = 0;
            return -1;
        }
        if (errno != EINTR)
            return -1;
        /* Interrupted by a signal: just try again. */
    }
    return 0;
}
/* Write all `count` bytes of `buf` to `fd`, retrying short writes and
 * EINTR. Returns 0 when everything was written, -1 on any other
 * write(2) error (errno preserved). */
static int write_exact(int fd, const void *buf, size_t count)
{
    const uint8_t *cursor = buf;
    size_t remaining = count;

    while (remaining != 0) {
        ssize_t put = write(fd, cursor, remaining);

        if (put >= 0) {
            cursor += put;
            remaining -= (size_t)put;
            continue;
        }
        if (errno != EINTR)
            return -1;
        /* Interrupted by a signal: retry the same span. */
    }
    return 0;
}
/* Send one {:packet, 4} frame on `fd`: a 4-byte big-endian length
 * followed by `len` bytes of `buf`. Shared by every channel that carries
 * ei frames:
 *   - CTL_OUT (agent -> BEAM): status/error events via emit_*
 *   - p2c (parent -> child): :proceed sentinel + K2 cap commands
 *   - c2p (child -> parent): {:ready, _} and {:error, _, _}
 * Returns 0 on success, -1 if either write fails. */
static int write_frame_fd(int fd, const void *buf, uint32_t len)
{
    uint8_t prefix[4];

    /* Most significant byte first. */
    for (int i = 0; i < 4; i++)
        prefix[i] = (uint8_t)(len >> (8 * (3 - i)));

    if (write_exact(fd, prefix, sizeof prefix) < 0)
        return -1;
    return write_exact(fd, buf, len);
}
/* Write one {:packet, 4} frame to the BEAM on CTL_OUT. Thin wrapper
 * around write_frame_fd; returns its result (0 on success, -1 on a
 * write failure). */
static int write_frame(const void *buf, uint32_t len)
{
return write_frame_fd(CTL_OUT, buf, len);
}
/* Receive one {:packet, 4} frame from `fd` into `buf` (capacity `cap`).
 *
 * Returns the payload length on success. Returns -1 on error or EOF
 * (errno == 0 on EOF, per read_exact), and -1 with errno == EMSGSIZE
 * when the announced length exceeds `cap`; in that case the payload is
 * left unread on the fd. Shared by every channel that carries ei frames:
 *   - CTL_IN (BEAM -> agent): request and post-:running commands
 *   - p2c (parent -> child, read in the child): :proceed + cap commands
 *   - c2p (child -> parent, read in the agent): :ready / :error frames */
static ssize_t read_frame_fd(int fd, uint8_t *buf, size_t cap)
{
    uint8_t prefix[4];

    if (read_exact(fd, prefix, sizeof prefix) < 0)
        return -1;

    /* Fold the big-endian prefix into a host-order length. */
    uint32_t len = 0;
    for (size_t i = 0; i < sizeof prefix; i++)
        len = (len << 8) | (uint32_t)prefix[i];

    if (len > cap) {
        errno = EMSGSIZE;
        return -1;
    }
    if (read_exact(fd, buf, len) < 0)
        return -1;
    return (ssize_t)len;
}
/* Read one {:packet, 4} frame from the BEAM on CTL_IN. Thin wrapper
 * around read_frame_fd; returns the payload length, or -1 on
 * error/EOF/oversized frame. */
static ssize_t read_frame(uint8_t *buf, size_t cap)
{
return read_frame_fd(CTL_IN, buf, cap);
}
/* --- emitting outbound events on fd 4 ----------------------------------- */
/* Send the encoded term in `x` to the BEAM as one frame, then release
 * the buffer with ei_x_free. The buffer is always consumed, whether or
 * not the write succeeded. */
static void emit_buff(ei_x_buff *x)
{
    int rc = write_frame(x->buff, (uint32_t)x->index);

    /* EPIPE is the ordinary "BEAM closed the port under us" case; the
     * surrounding loops deal with the dropped channel, so stay quiet.
     * Anything else is reported so real bugs remain visible. */
    if (rc < 0) {
        if (errno != EPIPE)
            fprintf(stderr, "linx_process: write to BEAM: %s\n",
                    strerror(errno));
    }
    ei_x_free(x);
}
/* Emit {:status, kind, value} to the BEAM, where `kind` becomes an atom
 * and `value` an integer. */
static void emit_status_int(const char *kind, long value)
{
    ei_x_buff term;

    ei_x_new_with_version(&term);
    ei_x_encode_tuple_header(&term, 3);
    ei_x_encode_atom(&term, "status");
    ei_x_encode_atom(&term, kind);
    ei_x_encode_long(&term, value);

    /* emit_buff writes the frame and frees the buffer. */
    emit_buff(&term);
}
/* Emit the two-element tuple {:status, :running} to the BEAM; unlike
 * the other status events it carries no payload. */
static void emit_status_running(void)
{
    ei_x_buff term;

    ei_x_new_with_version(&term);
    ei_x_encode_tuple_header(&term, 2);
    ei_x_encode_atom(&term, "status");
    ei_x_encode_atom(&term, "running");

    /* emit_buff writes the frame and frees the buffer. */
    emit_buff(&term);
}
/* {:error, errno, stage_atom}. */
static void emit_error(int err, const char *stage)
{
ei_x_buff x;
ei_x_new_with_version(&x);
ei_x_encode_tuple_header(&x, 3);
ei_x_encode_atom(&x, "error");
ei_x_encode_long(&x, err);
ei_x_encode_atom(&x, stage);
emit_buff(&x);
}
/* --- the request: parse {:spawn, _} or {:enter, _} -------------------- */
/* Which request tag decode_request saw: MODE_SPAWN for {:spawn, _},
 * MODE_ENTER for {:enter, _}. */
enum req_mode { MODE_SPAWN, MODE_ENTER };
/* Per-fd stdio directive. INHERIT leaves the child's fd untouched; DEVNULL
* dup2's /dev/null on; CONNECT_UNIX connects an AF_UNIX stream to `path`
* and dup2's it on. (The whole-stdio PTY mode is handled separately --
* see `pty` below -- since it shares one slave fd across 0/1/2 plus
* setsid + TIOCSCTTY.) */
struct stdio_dir {
enum stdio_kind {
STDIO_INHERIT = 0, /* leave the fd untouched; also the value of a zeroed struct */
STDIO_DEVNULL, /* open /dev/null and dup2 it onto the fd */
STDIO_CONNECT_UNIX, /* connect an AF_UNIX stream socket to `path`, dup2 it on */
} kind;
char *path; /* CONNECT_UNIX only; malloc'd (by decode_string), freed in free_request */
};
/* The parsed shape of an inbound request.
 * mode -- which kind of request.
 * target -- enter mode: the host pid of the process whose namespaces
 * we should join. Unused in spawn mode.
 * argv/env -- NULL-terminated arrays of malloc'd C strings (suitable
 * for execve directly).
 * ns_flags -- OR of CLONE_NEW* flags. In spawn mode: which namespaces
 * to create fresh; defaults to 0 (none) if :namespaces is
 * omitted. In enter mode: which of the target's namespaces
 * to join, when explicitly listed.
 * all_ns -- enter mode only: 1 if :namespaces was *not* listed in
 * the request, meaning "join every namespace the target
 * has". 0 if :namespaces was listed (use ns_flags).
 * stdio[] -- per-fd directive for fd 0/1/2. Defaults: INHERIT.
 * pty -- 1 if :stdio was the atom :pty; all three fds then point
 * at a single PTY slave with the child as session leader.
 * Mutually exclusive with `stdio[]`.
 * no_new_privs -- 1 if the request carried `no_new_privs: true`.
 * cwd -- malloc'd directory for the child to chdir() into before
 * execve, or NULL when the request had no :cwd.
 *
 * Heap members (argv, env, cwd, stdio[].path) are released by
 * free_request(). decode_request() only sets all_ns itself, so the
 * struct is presumably zero-initialised by the caller first -- TODO
 * confirm at the call site. */
struct request {
enum req_mode mode;
pid_t target;
char **argv;
char **env;
int ns_flags;
int all_ns;
struct stdio_dir stdio[3];
int pty;
int no_new_privs; /* set PR_SET_NO_NEW_PRIVS in child before checkpoint */
char *cwd; /* chdir() target in the child before execve; NULL = inherit */
};
/* Free a NULL-terminated array of malloc'd strings: each element first,
 * then the array itself. A NULL `arr` is accepted and ignored. */
static void free_str_array(char **arr)
{
    if (arr == NULL)
        return;

    size_t i = 0;
    while (arr[i] != NULL) {
        free(arr[i]);
        i++;
    }
    free(arr);
}
/* Release every heap allocation hanging off `r`: the per-fd socket
 * paths, cwd, env and argv. The struct itself is not freed. Members
 * that were never set must be NULL. */
static void free_request(struct request *r)
{
    const size_t nfds = sizeof r->stdio / sizeof r->stdio[0];

    for (size_t fd = 0; fd < nfds; fd++)
        free(r->stdio[fd].path);
    free(r->cwd);
    free_str_array(r->env);
    free_str_array(r->argv);
}
/* Decode the term at `*idx` -- a binary, or a string as understood by
 * ei_decode_string -- into a freshly malloc'd, NUL-terminated C string
 * stored in `*out`.
 *
 * Returns 0 on success (caller owns `*out`). Returns -1 on failure; if
 * the failure happened after the allocation attempt, `*out` is NULL. */
static int decode_string(const char *buf, int *idx, char **out)
{
    int type, sz;

    if (ei_get_type(buf, idx, &type, &sz) < 0)
        return -1;

    /* `sz` is the payload size reported by ei; +1 for the terminator. */
    char *str = malloc((size_t)sz + 1);
    *out = str;
    if (str == NULL)
        return -1;

    int rc;
    if (type == ERL_BINARY_EXT) {
        long got;
        rc = ei_decode_binary(buf, idx, str, &got);
        if (rc >= 0)
            str[got] = '\0';
    } else {
        /* A string of all-ASCII bytes can arrive as STRING_EXT (a
         * list of small ints in disguise); ei_decode_string
         * NUL-terminates for us. */
        rc = ei_decode_string(buf, idx, str);
    }

    if (rc < 0) {
        free(str);
        *out = NULL;
        return -1;
    }
    return 0;
}
/* Decode a list of binaries/strings into a NULL-terminated argv-style
 * array of malloc'd C strings.
 *
 * On success stores the array in `*out` (caller frees it with
 * free_str_array) and returns 0. Returns -1 on malformed input or
 * allocation failure, leaving `*out` untouched.
 *
 * A non-empty list must end in NIL_EXT. An improper tail is rejected:
 * leaving it unconsumed would make the caller read the tail as the next
 * term. */
static int decode_string_list(const char *buf, int *idx, char ***out)
{
    int arity;

    if (ei_decode_list_header(buf, idx, &arity) < 0 || arity < 0)
        return -1;

    char **arr = calloc((size_t)arity + 1, sizeof *arr);
    if (!arr)
        return -1;

    for (int i = 0; i < arity; i++) {
        /* A failed slot stays NULL, so the array remains
         * NULL-terminated for free_str_array. */
        if (decode_string(buf, idx, &arr[i]) < 0)
            goto fail;
    }

    /* List tail: ei_decode_list_header reports arity 0 for NIL_EXT and
     * fails on anything that is not a list. An arity-0 list has no tail
     * term at all. */
    if (arity > 0) {
        int tail_arity;
        if (ei_decode_list_header(buf, idx, &tail_arity) < 0 ||
            tail_arity != 0)
            goto fail;
    }

    *out = arr;
    return 0;

fail:
    free_str_array(arr);
    return -1;
}
/* Decode a list of namespace atoms into a CLONE_NEW* bitmask.
 *
 * Each atom must match an `atom` entry in NS_INFO. Returns 0 and stores
 * the OR of the matching flags in `*flags_out`; returns -1 (leaving
 * `*flags_out` untouched) on a non-list, a non-atom element, an unknown
 * namespace name, or an improper list tail. */
static int decode_ns_list(const char *buf, int *idx, int *flags_out)
{
    int arity;

    if (ei_decode_list_header(buf, idx, &arity) < 0)
        return -1;

    int flags = 0;
    for (int i = 0; i < arity; i++) {
        char atom[MAXATOMLEN];
        if (ei_decode_atom(buf, idx, atom) < 0)
            return -1;

        const struct ns_info *info = NS_INFO;
        while (info->atom && strcmp(atom, info->atom) != 0)
            info++;
        if (!info->atom)
            return -1;      /* not a namespace we know */
        flags |= info->flag;
    }

    /* A non-empty list must be closed by NIL_EXT. Reject an improper
     * tail rather than leave it unconsumed, which would desynchronise
     * the caller's decode index. */
    if (arity > 0) {
        int tail_arity;
        if (ei_decode_list_header(buf, idx, &tail_arity) < 0 ||
            tail_arity != 0)
            return -1;
    }

    *flags_out = flags;
    return 0;
}
/* Decode a per-fd stdio directive, one of:
 *   :inherit
 *   :devnull
 *   {:connect_unix, "path"}  -- a 2-tuple
 * Stores the result in `out`. Returns 0 on success, -1 on bad shape.
 *
 * `out->path` must be NULL or a malloc'd string on entry (free_request
 * already relies on that). When the same fd is given {:connect_unix, _}
 * more than once, the earlier path is freed instead of leaked; on
 * failure `out` is left unchanged. */
static int decode_stdio_directive(const char *buf, int *idx, struct stdio_dir *out)
{
    int type, sz;

    if (ei_get_type(buf, idx, &type, &sz) < 0)
        return -1;

    if (type == ERL_SMALL_ATOM_UTF8_EXT || type == ERL_ATOM_UTF8_EXT ||
        type == ERL_ATOM_EXT || type == ERL_SMALL_ATOM_EXT) {
        char atom[MAXATOMLEN];
        if (ei_decode_atom(buf, idx, atom) < 0)
            return -1;
        if (strcmp(atom, "inherit") == 0) {
            out->kind = STDIO_INHERIT;
            return 0;
        }
        if (strcmp(atom, "devnull") == 0) {
            out->kind = STDIO_DEVNULL;
            return 0;
        }
        return -1;
    }

    if (type == ERL_SMALL_TUPLE_EXT || type == ERL_LARGE_TUPLE_EXT) {
        int arity;
        if (ei_decode_tuple_header(buf, idx, &arity) < 0 || arity != 2)
            return -1;
        char tag[MAXATOMLEN];
        if (ei_decode_atom(buf, idx, &tag[0]) < 0)
            return -1;
        if (strcmp(tag, "connect_unix") != 0)
            return -1;

        /* Decode into a temporary so a failure cannot clobber, and a
         * success cannot leak, a path stored by an earlier directive
         * for this fd. */
        char *path = NULL;
        if (decode_string(buf, idx, &path) < 0)
            return -1;
        free(out->path);
        out->path = path;
        out->kind = STDIO_CONNECT_UNIX;
        return 0;
    }

    return -1;
}
/* Decode the :stdio value, which is either an atom shorthand
 * (:inherit | :devnull | :pty) or a keyword list of `[stdin: dir,
 * stdout: dir, stderr: dir]`. Stores results into req->stdio[] and
 * req->pty.
 *
 * Returns 0 on success, -1 on an unknown atom, an unknown key, a
 * malformed directive, or an improper list tail. */
static int decode_stdio(const char *buf, int *idx, struct request *req)
{
    int type, sz;

    if (ei_get_type(buf, idx, &type, &sz) < 0)
        return -1;

    if (type == ERL_SMALL_ATOM_UTF8_EXT || type == ERL_ATOM_UTF8_EXT ||
        type == ERL_ATOM_EXT || type == ERL_SMALL_ATOM_EXT) {
        char atom[MAXATOMLEN];
        if (ei_decode_atom(buf, idx, atom) < 0)
            return -1;
        if (strcmp(atom, "inherit") == 0) {
            /* default already; no change */
            return 0;
        }
        if (strcmp(atom, "devnull") == 0) {
            for (int i = 0; i < 3; i++)
                req->stdio[i].kind = STDIO_DEVNULL;
            return 0;
        }
        if (strcmp(atom, "pty") == 0) {
            req->pty = 1;
            return 0;
        }
        return -1;
    }

    /* A keyword list arrives as LIST_EXT of 2-tuples, ending in NIL_EXT. */
    int arity;
    if (ei_decode_list_header(buf, idx, &arity) < 0)
        return -1;

    for (int i = 0; i < arity; i++) {
        int tarity;
        if (ei_decode_tuple_header(buf, idx, &tarity) < 0 || tarity != 2)
            return -1;

        char key[MAXATOMLEN];
        if (ei_decode_atom(buf, idx, key) < 0)
            return -1;

        int fd;
        if (strcmp(key, "stdin") == 0)
            fd = 0;
        else if (strcmp(key, "stdout") == 0)
            fd = 1;
        else if (strcmp(key, "stderr") == 0)
            fd = 2;
        else
            return -1;

        if (decode_stdio_directive(buf, idx, &req->stdio[fd]) < 0)
            return -1;
    }

    /* Consume the closing NIL_EXT of a non-empty list. Reject an
     * improper tail: leaving it unconsumed would make the caller read
     * the tail as the next map key. */
    if (arity > 0) {
        int tail_arity;
        if (ei_decode_list_header(buf, idx, &tail_arity) < 0 ||
            tail_arity != 0)
            return -1;
    }
    return 0;
}
/* Decode the inbound request, either:
 *   {:spawn, %{argv, namespaces?, env?, stdio?, cwd?, no_new_privs?}}
 *   {:enter, %{target, argv, namespaces?, env?, stdio?, cwd?,
 *              no_new_privs?}}
 *
 * buf -- the ETF payload of one frame (version byte included).
 * len -- payload length; unused here.
 * req -- filled in field by field. Only `all_ns` gets a default in this
 *        function, so every other field must already be zeroed.
 *
 * Unknown map keys are skipped for forward compatibility. Returns 0 on
 * success, -1 on malformed input: bad tag, non-atom key, bad value,
 * missing or empty argv, or enter mode without a valid target. On
 * failure `req` may hold partially decoded heap data; free_request()
 * releases it. */
static int decode_request(const uint8_t *buf, int len, struct request *req)
{
    (void)len;
    const char *term = (const char *)buf;

    /* all_ns is meaningful only in enter mode and is lowered the moment
     * the caller mentions :namespaces explicitly. */
    req->all_ns = 1;

    int idx = 0, version;
    if (ei_decode_version(term, &idx, &version) < 0)
        return -1;

    int arity;
    if (ei_decode_tuple_header(term, &idx, &arity) < 0 || arity != 2)
        return -1;

    char tag[MAXATOMLEN];
    if (ei_decode_atom(term, &idx, tag) < 0)
        return -1;
    if (strcmp(tag, "spawn") == 0) {
        req->mode = MODE_SPAWN;
    } else if (strcmp(tag, "enter") == 0) {
        req->mode = MODE_ENTER;
    } else {
        return -1;
    }

    if (ei_decode_map_header(term, &idx, &arity) < 0)
        return -1;

    for (int i = 0; i < arity; i++) {
        char key[MAXATOMLEN];
        if (ei_decode_atom(term, &idx, key) < 0)
            return -1;

        if (strcmp(key, "argv") == 0) {
            if (decode_string_list(term, &idx, &req->argv) < 0)
                return -1;
        } else if (strcmp(key, "env") == 0) {
            if (decode_string_list(term, &idx, &req->env) < 0)
                return -1;
        } else if (strcmp(key, "namespaces") == 0) {
            req->all_ns = 0;
            if (decode_ns_list(term, &idx, &req->ns_flags) < 0)
                return -1;
        } else if (strcmp(key, "target") == 0) {
            long t;
            if (ei_decode_long(term, &idx, &t) < 0)
                return -1;
            /* Reject non-positive pids and values that do not survive
             * the narrowing to pid_t; a truncated value would name a
             * different process. */
            pid_t pid = (pid_t)t;
            if (t <= 0 || (long)pid != t)
                return -1;
            req->target = pid;
        } else if (strcmp(key, "stdio") == 0) {
            if (decode_stdio(term, &idx, req) < 0)
                return -1;
        } else if (strcmp(key, "cwd") == 0) {
            if (decode_string(term, &idx, &req->cwd) < 0)
                return -1;
        } else if (strcmp(key, "no_new_privs") == 0) {
            /* Boolean. ei_decode_boolean wants `int *`. */
            int b;
            if (ei_decode_boolean(term, &idx, &b) < 0)
                return -1;
            req->no_new_privs = b ? 1 : 0;
        } else {
            /* Skip unknown keys -- the BEAM may carry extras we don't
             * yet understand. If the value cannot be skipped the index
             * is no longer trustworthy, so stop instead of decoding
             * from the wrong offset. */
            if (ei_skip_term(term, &idx) < 0)
                return -1;
        }
    }

    if (!req->argv || !req->argv[0])
        return -1;
    if (req->mode == MODE_ENTER && req->target <= 0)
        return -1;
    return 0;
}
/* --- entering an existing target's namespaces (P3) --------------------- */
/* Walk the canonical NS_INFO list and join the target's namespaces. The
* order in NS_INFO is setns-safe: user first (so later calls have the
* capabilities a fresh user namespace grants), pid last (it only takes
* effect on future fork()s, so must precede the fork below).
*
* Two modes:
* req->all_ns == 1 -- :namespaces was *not* in the request. Join every
* namespace the target has; silently skip a type whose
* /proc/<pid>/ns file is missing (e.g. CLONE_NEWTIME on an old
* kernel).
* req->all_ns == 0 -- :namespaces was listed. Join exactly the ones
* whose flag is set in req->ns_flags; any failure surfaces as
* {:error, errno, :open_ns_<type> | :setns_<type>} on fd 4.
*
* On a real failure (in either mode), emits :error and returns -1. */
/* Report whether this process and `target` are in the same namespace of
 * the type named `proc_name` (the file name under /proc/<pid>/ns/).
 *
 * Two processes share a namespace iff their /proc/<pid>/ns/<type> files
 * resolve to the same device and inode. Used to skip no-op setns calls:
 * entering the namespace you're already in returns EINVAL on some
 * kernels (notably the user namespace) and is wasteful elsewhere.
 *
 * Returns 1 when they match, 0 when they differ or either stat fails. */
static int same_namespace(pid_t target, const char *proc_name)
{
    char mine[64], theirs[64];
    struct stat mine_st, theirs_st;

    snprintf(mine, sizeof mine, "/proc/self/ns/%s", proc_name);
    snprintf(theirs, sizeof theirs, "/proc/%d/ns/%s",
             (int)target, proc_name);

    if (stat(mine, &mine_st) < 0)
        return 0;
    if (stat(theirs, &theirs_st) < 0)
        return 0;

    if (mine_st.st_dev != theirs_st.st_dev)
        return 0;
    return mine_st.st_ino == theirs_st.st_ino;
}
/* setns() this process into the namespaces of req->target, walking
 * NS_INFO in table order.
 *
 * With req->all_ns set, every namespace type is attempted and a missing
 * /proc/<pid>/ns/<type> file (ENOENT) is skipped silently. Otherwise
 * only the types whose flag is in req->ns_flags are joined. Types the
 * agent already shares with the target are skipped.
 *
 * Returns 0 on success. On failure emits {:error, errno,
 * :open_ns_<type>} or {:error, errno, :setns_<type>} and returns -1. */
static int enter_target_namespaces(const struct request *req)
{
    for (const struct ns_info *info = NS_INFO; info->atom; info++) {
        if (!req->all_ns && !(req->ns_flags & info->flag))
            continue;

        /* Already in the target's namespace of this type -- no setns
         * needed; some kernels return EINVAL for setns-to-self. */
        if (same_namespace(req->target, info->proc))
            continue;

        char path[64];
        snprintf(path, sizeof path, "/proc/%d/ns/%s",
                 (int)req->target, info->proc);

        int fd = open(path, O_RDONLY | O_CLOEXEC);
        if (fd < 0) {
            /* Capture errno before any further library call can
             * disturb it, as the setns path below does. */
            int err = errno;
            if (req->all_ns && err == ENOENT)
                continue;
            /* The stage names the namespace so the BEAM-side error
             * can pinpoint which type failed -- e.g. :open_ns_time,
             * :setns_user. */
            char stage[32];
            snprintf(stage, sizeof stage, "open_ns_%s", info->atom);
            emit_error(err, stage);
            return -1;
        }

        if (setns(fd, 0) < 0) {
            int err = errno;
            close(fd);
            char stage[32];
            snprintf(stage, sizeof stage, "setns_%s", info->atom);
            emit_error(err, stage);
            return -1;
        }
        close(fd);
    }
    return 0;
}
/* --- the cloned child --------------------------------------------------- */
/* Arguments handed to child_fn via clone's `arg` pointer.
 *
 * c2p_w/p2c_r -- the child's ends of the two handshake pipes.
 * c2p_r/p2c_w -- the parent's ends, which the child closes on entry.
 * argv/env -- NULL-terminated string arrays for execve.
 * cwd -- directory to chdir() into before execve, or NULL.
 * stdio -- per-fd directives. The child applies them after :proceed
 * but before execve.
 * pty_slave -- if >= 0, the child closes pty_master, sets up a new
 * session (setsid), makes pty_slave its controlling TTY
 * (TIOCSCTTY), and dups it onto fd 0/1/2. The per-fd
 * stdio[] is ignored in PTY mode.
 * pty_master -- the parent's end of the PTY pair. The child closes it
 * before execve.
 * no_new_privs -- non-zero asks the child to set PR_SET_NO_NEW_PRIVS. */
struct child_args {
int c2p_w; /* child writes events here (CLOEXEC) */
int p2c_r; /* child reads commands here */
int c2p_r; /* parent's read end -- child closes on entry */
int p2c_w; /* parent's write end -- child closes on entry */
char **argv;
char **env;
const char *cwd; /* chdir() here before execve; NULL = inherit the agent's cwd */
struct stdio_dir stdio[3];
int pty_master;
int pty_slave;
int no_new_privs; /* call apply_no_new_privs() early in child_fn */
};
/* Report an in-child pre-exec failure as a `{:error, errno, stage_atom}`
* ei frame on the c2p pipe and exit. Called when something between the
* checkpoint and execve fails -- e.g. opening /dev/null, connecting to
* the AF_UNIX path, ioctl on the PTY slave, capset/prctl in the K2 cap
* commands. The parent reads the frame in await_exec_outcome (post-
* proceed) or in await_proceed's c2p poll branch (checkpoint window),
* and forwards the {:error, _, _} to the BEAM. */
__attribute__((noreturn))
static void child_fail(int c2p_w, int err, enum stage stage)
{
ei_x_buff x;
ei_x_new_with_version(&x);
ei_x_encode_tuple_header(&x, 3);
ei_x_encode_atom(&x, "error");
ei_x_encode_long(&x, err);
ei_x_encode_atom(&x, stage_name(stage));
(void)write_frame_fd(c2p_w, x.buff, (uint32_t)x.index);
ei_x_free(&x);
_exit(127);
}
/* Make the freshly opened descriptor `src` become stdio fd `target`,
 * consuming `src`. Returns 0 on success, -1 with errno from the failing
 * call (preserved across the cleanup close).
 *
 * If the kernel handed out `src` == `target` (possible when that stdio
 * slot was closed), dup2() would be a no-op and a following close()
 * would destroy the descriptor just installed. In that case keep the fd
 * and only clear FD_CLOEXEC so it survives execve. */
static int install_stdio_fd(int src, int target)
{
    if (src == target) {
        int flags = fcntl(src, F_GETFD);
        if (flags >= 0 && fcntl(src, F_SETFD, flags & ~FD_CLOEXEC) == 0)
            return 0;
    } else if (dup2(src, target) >= 0) {
        /* The duplicate does not inherit close-on-exec. */
        close(src);
        return 0;
    }

    int saved = errno;
    close(src);
    errno = saved;
    return -1;
}

/* Apply stdio plumbing in the child before execve. Returns 0 on success,
 * -1 on failure with errno describing the failing call (caller should
 * report and exit).
 *
 * PTY mode (ca->pty_slave >= 0): close the inherited master, become a
 * session leader, take the slave as controlling terminal and dup it
 * onto fd 0/1/2; ca->stdio[] is ignored.
 * Otherwise each of fd 0/1/2 follows its own directive: INHERIT leaves
 * it alone, DEVNULL points it at /dev/null, CONNECT_UNIX points it at
 * an AF_UNIX stream socket connected to the directive's path. */
static int apply_stdio(struct child_args *ca)
{
    if (ca->pty_slave >= 0) {
        /* Whole-stdio PTY mode. Drop the master copy the child
         * inherited from the fork, become session leader, take the
         * PTY slave as the controlling terminal, then dup it onto
         * fd 0/1/2. */
        if (ca->pty_master >= 0)
            close(ca->pty_master);
        if (setsid() < 0)
            return -1;
        if (ioctl(ca->pty_slave, TIOCSCTTY, 0) < 0)
            return -1;
        for (int fd = 0; fd < 3; fd++) {
            if (dup2(ca->pty_slave, fd) < 0)
                return -1;
        }
        if (ca->pty_slave > 2)
            close(ca->pty_slave);
        return 0;
    }

    /* Per-fd directives. */
    for (int fd = 0; fd < 3; fd++) {
        switch (ca->stdio[fd].kind) {
        case STDIO_INHERIT:
            break;
        case STDIO_DEVNULL: {
            int n = open("/dev/null", O_RDWR | O_CLOEXEC);
            if (n < 0)
                return -1;
            if (install_stdio_fd(n, fd) < 0)
                return -1;
            break;
        }
        case STDIO_CONNECT_UNIX: {
            int s = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
            if (s < 0)
                return -1;
            struct sockaddr_un addr = { .sun_family = AF_UNIX };
            size_t len = strlen(ca->stdio[fd].path);
            if (len >= sizeof addr.sun_path) {
                close(s);
                errno = ENAMETOOLONG;
                return -1;
            }
            memcpy(addr.sun_path, ca->stdio[fd].path, len + 1);
            if (connect(s, (struct sockaddr *)&addr, sizeof addr) < 0) {
                int e = errno;
                close(s);
                errno = e;
                return -1;
            }
            if (install_stdio_fd(s, fd) < 0)
                return -1;
            break;
        }
        }
    }
    return 0;
}
/* --- K2 capability syscalls (per-thread, called from the child) -------- */
/* Remove each capability whose bit is set in `mask` from the calling
 * thread's bounding set, lowest bit first, via prctl(PR_CAPBSET_DROP).
 *
 * Returns 0 when every requested drop succeeded (trivially so for an
 * empty mask). Stops at the first failing prctl and returns -1 with its
 * errno; the caller treats that as a pre-exec failure and exits, so no
 * attempt is made to continue past a denied drop. */
static int apply_cap_drop_bounding(uint64_t mask)
{
    unsigned long cap = 0;

    for (uint64_t rest = mask; rest != 0; rest >>= 1, cap++) {
        if ((rest & 1) == 0)
            continue;
        if (prctl(PR_CAPBSET_DROP, cap, 0UL, 0UL, 0UL) < 0)
            return -1;
    }
    return 0;
}
/* Install the effective (`e`), permitted (`p`) and inheritable (`i`)
 * capability sets on the calling thread with capset(2).
 *
 * Uses the kernel's v3 64-bit layout: two cap_data_struct entries, the
 * low 32 bits of each mask in [0] and the high 32 bits in [1]. Calls
 * syscall(SYS_capset) directly rather than linking libcap. Returns the
 * syscall result: 0 on success, -1 with errno on failure. */
static int apply_cap_set_thread(uint64_t e, uint64_t p, uint64_t i)
{
    struct __user_cap_header_struct header = {
        .version = _LINUX_CAPABILITY_VERSION_3,
        .pid = 0, /* current thread */
    };
    struct __user_cap_data_struct words[2] = {
        {
            .effective   = (uint32_t)(e & 0xFFFFFFFFu),
            .permitted   = (uint32_t)(p & 0xFFFFFFFFu),
            .inheritable = (uint32_t)(i & 0xFFFFFFFFu),
        },
        {
            .effective   = (uint32_t)(e >> 32),
            .permitted   = (uint32_t)(p >> 32),
            .inheritable = (uint32_t)(i >> 32),
        },
    };

    long rc = syscall(SYS_capset, &header, words);
    return (int)rc;
}
/* Make the calling thread's ambient capability set exactly `mask`.
 *
 * The kernel only offers per-capability RAISE/LOWER plus a global
 * CLEAR_ALL, so this clears everything and then raises each requested
 * capability, lowest bit first. Requires Linux 4.3+. Returns 0 on
 * success, or -1 with errno from the first failing prctl. */
static int apply_cap_set_ambient(uint64_t mask)
{
    if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0UL, 0UL, 0UL) < 0)
        return -1;

    unsigned long cap = 0;
    for (uint64_t rest = mask; rest != 0; rest >>= 1, cap++) {
        if ((rest & 1) == 0)
            continue;
        if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0UL, 0UL) < 0)
            return -1;
    }
    return 0;
}
/* --- S2 seccomp syscalls (per-thread, called from the child) ----------- */
/* Turn on PR_SET_NO_NEW_PRIVS for the calling thread, forbidding it and
 * its descendants from ever gaining privileges via setuid/file-caps on
 * execve.
 *
 * The kernel demands this before an unprivileged caller (one without
 * CAP_SYS_ADMIN) can install a seccomp filter; it is also exposed as an
 * option on `Linx.Process.spawn/1` for callers who want the posture
 * without seccomp itself. One-way: once set, it stays on across execve
 * and clone. Linux 3.5+.
 *
 * Returns the prctl result: 0 on success, -1 on failure (errno
 * preserved). */
static int apply_no_new_privs(void)
{
    int rc = prctl(PR_SET_NO_NEW_PRIVS, 1UL, 0UL, 0UL, 0UL);

    return rc;
}
/* Query the calling thread's no_new_privs state: 0 when off, 1 when on.
 * A failing prctl (unlikely) is reported as "off", leaving the
 * subsequent set attempt to surface the real error. */
static int get_no_new_privs(void)
{
    int state = prctl(PR_GET_NO_NEW_PRIVS, 0UL, 0UL, 0UL, 0UL);

    if (state < 0)
        return 0;
    return state;
}
/* Install the cBPF program at `bpf` as a seccomp filter on the calling
 * thread. `len` is the program size in bytes and must be a non-zero
 * multiple of sizeof(struct sock_filter) (8 bytes per instruction).
 *
 * Goes through syscall(SYS_seccomp, ...) directly so there is no
 * dependency on the libc wrapper (added in glibc 2.27); the
 * linx-process binary should run on older systems too. Linux 3.17+ for
 * the seccomp(2) entry point.
 *
 * Returns 0 on success, -1 with errno on failure:
 *   EINVAL -- bad length here, or the kernel's usual "malformed BPF"
 *   E2BIG  -- more than 65535 instructions (sock_fprog.len is a u16) */
static int apply_seccomp(const void *bpf, size_t len)
{
    const size_t insn_size = sizeof(struct sock_filter);

    if (len == 0 || len % insn_size != 0) {
        errno = EINVAL;
        return -1;
    }

    size_t insn_count = len / insn_size;
    if (insn_count > 0xFFFF) {
        /* Defensive: real filters are 5..a few hundred instructions,
         * but the count has to fit the u16 length field. */
        errno = E2BIG;
        return -1;
    }

    struct sock_fprog prog = {
        .len = (unsigned short)insn_count,
        .filter = (struct sock_filter *)bpf,
    };
    long rc = syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0UL, &prog);
    return (int)rc;
}
/* Read + dispatch one checkpoint-window command frame from `p2c_r`.
 *
 * `p2c_r` is the read end of the parent->child pipe the frames arrive
 * on; `c2p_w` is the child->parent pipe used to report apply failures.
 *
 * Returns:
 *   0 -- a cap_* or seccomp_install command was applied successfully;
 *        caller should loop and read the next frame.
 *   1 -- :proceed received; caller should fall through to apply_stdio
 *        + execve.
 *  -1 -- read error or EOF (abort path); caller should _exit(102).
 *  -2 -- protocol error (unknown command or malformed frame, including
 *        a binary whose declared size overruns the received frame);
 *        caller should _exit(103).
 *
 * On a cap/seccomp apply failure, this function child_fail's directly
 * with the appropriate stage; it does not return. */
static int child_read_command(int p2c_r, int c2p_w)
{
	/* ERL_BINARY_EXT on the wire: 1 tag byte + 4-byte big-endian size,
	 * followed by the payload. */
	enum { ETF_BINARY_HDR = 5 };

	/* Buffer needs to accommodate the largest checkpoint command. K2 cap
	 * commands are tiny (a few u64s); seccomp_install carries a binary
	 * cBPF blob -- 8 bytes per instruction, hundreds of instructions for
	 * realistic filters. 8 KiB fits ~1000 instructions including ei
	 * encoding overhead, well over any practical filter (the hand-
	 * curated syscall tables have < 250 entries). The matching forward-
	 * side buffer in await_proceed is the same size. */
	uint8_t buf[8192];
	ssize_t len = read_frame_fd(p2c_r, buf, sizeof buf);
	if (len < 0) {
		/* EOF (errno == 0) means the parent closed p2c without
		 * sending :proceed -- the abort path. Real read errors
		 * land here too; both should _exit(102). */
		return -1;
	}

	const char *frame = (const char *)buf;
	int idx = 0, version;
	if (ei_decode_version(frame, &idx, &version) < 0)
		return -2;
	int type, size;
	if (ei_get_type(frame, &idx, &type, &size) < 0)
		return -2;

	/* Bare :proceed atom -- the sentinel that ends the loop. */
	if (type == ERL_SMALL_ATOM_UTF8_EXT || type == ERL_ATOM_UTF8_EXT ||
	    type == ERL_ATOM_EXT || type == ERL_SMALL_ATOM_EXT) {
		char atom[MAXATOMLEN];
		if (ei_decode_atom(frame, &idx, atom) < 0)
			return -2;
		if (strcmp(atom, "proceed") == 0)
			return 1;
		return -2;
	}

	/* Tuple commands: {:cap_drop_bounding, mask},
	 * {:cap_set_thread, e, p, i}, {:cap_set_ambient, mask},
	 * {:seccomp_install, <<bpf>>}. */
	if (type != ERL_SMALL_TUPLE_EXT && type != ERL_LARGE_TUPLE_EXT)
		return -2;
	int arity;
	if (ei_decode_tuple_header(frame, &idx, &arity) < 0)
		return -2;
	char tag[MAXATOMLEN];
	if (ei_decode_atom(frame, &idx, tag) < 0)
		return -2;

	if (strcmp(tag, "cap_drop_bounding") == 0 && arity == 2) {
		unsigned long long mask;
		if (ei_decode_ulonglong(frame, &idx, &mask) < 0)
			return -2;
		if (apply_cap_drop_bounding((uint64_t)mask) < 0)
			child_fail(c2p_w, errno, STAGE_CAP_DROP_BOUNDING);
		return 0;
	}
	if (strcmp(tag, "cap_set_thread") == 0 && arity == 4) {
		unsigned long long e, p, i;
		if (ei_decode_ulonglong(frame, &idx, &e) < 0 ||
		    ei_decode_ulonglong(frame, &idx, &p) < 0 ||
		    ei_decode_ulonglong(frame, &idx, &i) < 0)
			return -2;
		if (apply_cap_set_thread((uint64_t)e, (uint64_t)p, (uint64_t)i) < 0)
			child_fail(c2p_w, errno, STAGE_CAP_SET_THREAD);
		return 0;
	}
	if (strcmp(tag, "cap_set_ambient") == 0 && arity == 2) {
		unsigned long long mask;
		if (ei_decode_ulonglong(frame, &idx, &mask) < 0)
			return -2;
		if (apply_cap_set_ambient((uint64_t)mask) < 0)
			child_fail(c2p_w, errno, STAGE_CAP_SET_AMBIENT);
		return 0;
	}

	/* {:seccomp_install, <<bpf>>} -- the S2 seccomp install command.
	 * seccomp(SECCOMP_SET_MODE_FILTER) requires either CAP_SYS_ADMIN
	 * or PR_SET_NO_NEW_PRIVS to be on. If NNP isn't on we set it now
	 * ("be helpful" per PLAN.md D2 -- callers who forgot the spawn
	 * opt shouldn't get a confusing EPERM from the install). NNP is
	 * a one-way bit and harmless when set redundantly. */
	if (strcmp(tag, "seccomp_install") == 0 && arity == 2) {
		/* The binary's header must lie inside the bytes we actually
		 * received before we let ei parse a size out of it. */
		if ((size_t)idx + ETF_BINARY_HDR > (size_t)len)
			return -2;
		int btype, bsize;
		if (ei_get_type(frame, &idx, &btype, &bsize) < 0)
			return -2;
		if (btype != ERL_BINARY_EXT)
			return -2;
		if (bsize <= 0)
			child_fail(c2p_w, EINVAL, STAGE_SECCOMP_INSTALL);
		/* ei_decode_binary copies `bsize` bytes starting right after
		 * the header and has no notion of the frame length. Reject a
		 * declared size that runs past the frame so a malformed
		 * command cannot make it read beyond `buf`. */
		if ((size_t)bsize > (size_t)len - (size_t)idx - ETF_BINARY_HDR)
			return -2;
		/* The BPF binary lives inline in `buf`; ei_decode_binary
		 * copies it out, we hand the copy to apply_seccomp and free
		 * it afterwards. */
		void *bpf = malloc((size_t)bsize);
		if (!bpf)
			child_fail(c2p_w, ENOMEM, STAGE_SECCOMP_INSTALL);
		long got;
		if (ei_decode_binary(frame, &idx, bpf, &got) < 0) {
			free(bpf);
			return -2;
		}
		if (!get_no_new_privs()) {
			if (apply_no_new_privs() < 0) {
				/* Save errno before free() can disturb it. */
				int err = errno;
				free(bpf);
				child_fail(c2p_w, err, STAGE_SECCOMP_NO_NEW_PRIVS);
			}
		}
		if (apply_seccomp(bpf, (size_t)got) < 0) {
			int err = errno;
			free(bpf);
			child_fail(c2p_w, err, STAGE_SECCOMP_INSTALL);
		}
		free(bpf);
		return 0;
	}
	return -2;
}
/* Body of the cloned (or forked) child. Sequence: drop the parent's pipe
 * ends, optionally set no_new_privs, send {:ready, pid}, apply
 * checkpoint commands until :proceed, plumb stdio, restore the SIGCHLD
 * mask, chdir, execve. Pre-exec failures go out through child_fail as
 * an {:error, errno, stage} frame on c2p; the handshake itself bails
 * with _exit(101) (could not send :ready), _exit(102) (EOF / abort) or
 * _exit(103) (protocol error). */
static int child_fn(void *arg)
{
	struct child_args *ca = arg;

	/* We inherited the parent's ends of both internal pipes (c2p read
	 * end, p2c write end). Drop them: while we hold a p2c writer
	 * ourselves, our own read on p2c_r can never see EOF, and EOF is
	 * how the parent signals :abort. */
	if (ca->c2p_r >= 0)
		close(ca->c2p_r);
	if (ca->p2c_w >= 0)
		close(ca->p2c_w);

	/* Spawn-time NNP: when the caller asked for it explicitly, set it
	 * before the checkpoint so the pre-checkpoint state is predictable.
	 * (The seccomp_install handler in child_read_command also sets it
	 * on demand if it is still off.) */
	if (ca->no_new_privs && apply_no_new_privs() < 0)
		child_fail(ca->c2p_w, errno, STAGE_SECCOMP_NO_NEW_PRIVS);

	/* Announce {:ready, pid}, where pid is what getpid() reports from
	 * inside our pid namespace. */
	ei_x_buff ready;
	ei_x_new_with_version(&ready);
	ei_x_encode_tuple_header(&ready, 2);
	ei_x_encode_atom(&ready, "ready");
	ei_x_encode_long(&ready, (long)getpid());
	int sent = write_frame_fd(ca->c2p_w, ready.buff, (uint32_t)ready.index);
	ei_x_free(&ready);
	if (sent < 0)
		_exit(101);

	/* Checkpoint window: keep applying commands while each one reports
	 * success (0). Anything else ends the loop -- 1 is :proceed, -1 is
	 * EOF/abort, every other value is a protocol error. */
	int verdict;
	do {
		verdict = child_read_command(ca->p2c_r, ca->c2p_w);
	} while (verdict == 0);
	if (verdict == -1)
		_exit(102);
	if (verdict != 1)
		_exit(103);

	/* Stdio plumbing (P4): /dev/null, an AF_UNIX socket, or the PTY
	 * slave end up on fds 0/1/2. */
	if (apply_stdio(ca) < 0)
		child_fail(ca->c2p_w, errno, STAGE_STDIO);

	/* The agent keeps SIGCHLD blocked for its signalfd, and a signal
	 * mask survives execve. Unblock it so the workload starts with
	 * default semantics. */
	sigset_t chld;
	sigemptyset(&chld);
	sigaddset(&chld, SIGCHLD);
	sigprocmask(SIG_UNBLOCK, &chld, NULL);

	/* Working directory goes last: after a rootfs pivot the inherited
	 * cwd may no longer exist, so :cwd gives the workload a valid one
	 * inside its own root. */
	if (ca->cwd != NULL && chdir(ca->cwd) < 0)
		child_fail(ca->c2p_w, errno, STAGE_CHDIR);

	execve(ca->argv[0], ca->argv, ca->env);
	/* Only reached when execve failed. */
	child_fail(ca->c2p_w, errno, STAGE_EXECVE);
}
/* --- the relay (parent of clone) ---------------------------------------- */
/* Wait on `c2p_r` for the result of the child's execve attempt.
 *
 * A successful execve closes the child's CLOEXEC write end, which we
 * observe as EOF. A pre-exec failure shows up as one
 * `{:error, errno, stage_atom}` ei frame instead.
 *
 * Returns:
 *   0 -- EOF: the workload is running.
 *   1 -- pre-exec error frame received and already relayed via
 *        emit_error.
 *  -1 -- relay failure: read error, or a frame that is not a
 *        well-formed {:error, errno, stage} triple. */
static int await_exec_outcome(int c2p_r)
{
	uint8_t frame[256];

	/* read_frame_fd reports EOF as a failure with errno == 0; that is
	 * the success path here. */
	if (read_frame_fd(c2p_r, frame, sizeof frame) < 0)
		return errno == 0 ? 0 : -1;

	const char *etf = (const char *)frame;
	int pos = 0;
	int version;
	int arity;

	if (ei_decode_version(etf, &pos, &version) < 0)
		return -1;
	if (ei_decode_tuple_header(etf, &pos, &arity) < 0 || arity != 3)
		return -1;

	char tag[MAXATOMLEN];
	char stage[MAXATOMLEN];
	long child_errno;

	if (ei_decode_atom(etf, &pos, tag) < 0)
		return -1;
	if (strcmp(tag, "error") != 0)
		return -1;
	if (ei_decode_long(etf, &pos, &child_errno) < 0)
		return -1;
	if (ei_decode_atom(etf, &pos, stage) < 0)
		return -1;

	emit_error((int)child_errno, stage);
	return 1;
}
/* Helper for await_proceed: read one frame from `c2p_r` and, if it is a
 * well-formed {:error, errno, stage_atom} tuple, relay it to the BEAM
 * via emit_error. EOF, read errors and malformed frames are dropped. */
static void await_proceed_relay_child_error(int c2p_r)
{
	uint8_t buf[256];
	if (read_frame_fd(c2p_r, buf, sizeof buf) < 0)
		return;
	int idx = 0, version, arity;
	if (ei_decode_version((const char *)buf, &idx, &version) < 0 ||
	    ei_decode_tuple_header((const char *)buf, &idx, &arity) < 0 ||
	    arity != 3)
		return;
	char tag[MAXATOMLEN];
	long err;
	char stage[MAXATOMLEN];
	if (ei_decode_atom((const char *)buf, &idx, tag) < 0 ||
	    strcmp(tag, "error") != 0 ||
	    ei_decode_long((const char *)buf, &idx, &err) < 0 ||
	    ei_decode_atom((const char *)buf, &idx, stage) < 0)
		return;
	emit_error((int)err, stage);
}

/* Helper for await_proceed: forward one frame verbatim to the child on
 * `p2c_w`. Returns 0 on success, -1 on failure.
 *
 * EPIPE means nobody holds the read end of p2c any more, i.e. the child
 * is already gone -- typically because an earlier command failed and it
 * child_fail'ed between our poll() and this write. Its error frame may
 * still be sitting in c2p; relay it before giving up so the BEAM sees
 * the reason rather than a bare port close. The read cannot block on a
 * live child: the child was the only remaining writer on c2p.
 * (Assumes write_frame_fd leaves write(2)'s errno intact on failure;
 * if it does not, this degrades to the plain -1 return.) */
static int await_proceed_forward(int p2c_w, int c2p_r,
				 uint8_t *frame, uint32_t frame_len)
{
	if (write_frame_fd(p2c_w, frame, frame_len) >= 0)
		return 0;
	if (errno == EPIPE)
		await_proceed_relay_child_error(c2p_r);
	return -1;
}

/* Block on the BEAM control channel until :proceed (or :abort) arrives,
 * handling pre-proceed commands valid during the checkpoint window:
 *
 * * {:pty_winsize, _} -- applied in-agent on `pty_master` via TIOCSWINSZ.
 *   The child doesn't need to know.
 *
 * * {:cap_drop_bounding, _}, {:cap_set_thread, _, _, _},
 *   {:cap_set_ambient, _} -- K2 capability commands. The agent can't
 *   apply these on the child's behalf (capset/prctl are per-thread),
 *   so we forward the frame verbatim to `p2c_w` and the child applies
 *   it before execve.
 *
 * * {:seccomp_install, <<bpf>>} -- S2 seccomp install. Same per-thread
 *   constraint as the cap commands -- the agent forwards verbatim and
 *   the child does the seccomp(2) syscall before execve.
 *
 * * :proceed -- forwarded as a frame to `p2c_w` (the sentinel that
 *   ends the child's checkpoint-command loop). Returns 0.
 *
 * * :abort -- caller closes `p2c_w` so the child sees EOF and _exits.
 *   Returns 1.
 *
 * `pty_master` is the agent's master fd in PTY mode (or -1 otherwise).
 * `p2c_w` is the write end of the agent->child unblock pipe.
 * `c2p_r` is the read end of the child->agent status pipe; we poll it
 * here so a command failure in the child surfaces as an error on the
 * BEAM even though the checkpoint hasn't proceeded yet.
 *
 * {:signal, _} and {:pty_in, _} are post-running-only and treated as
 * protocol errors if they show up here.
 *
 * Returns:
 *   0 -- :proceed forwarded to child; caller closes p2c_w and waits
 *        for execve outcome on c2p.
 *   1 -- :abort received; caller closes p2c_w to deliver EOF to the
 *        child, reaps, and emits {:status, :aborted, child_pid}.
 *  -1 -- read/parse error, unknown command, forward failure, or child
 *        failure during a checkpoint command (emit_error has already
 *        been called when the child left an error frame behind). */
static int await_proceed(int pty_master, int p2c_w, int c2p_r)
{
	for (;;) {
		/* Multiplex BEAM commands on CTL_IN with child failure
		 * notifications on c2p_r. A command the child can't apply
		 * (EPERM on capset, etc.) arrives as an
		 * {:error, errno, stage} ei frame, ahead of any :proceed. */
		struct pollfd pfds[2] = {
			{ .fd = CTL_IN, .events = POLLIN },
			{ .fd = c2p_r, .events = POLLIN },
		};
		int rc = poll(pfds, 2, -1);
		if (rc < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}

		/* Child wrote an error frame, or closed unexpectedly.
		 * POLLERR/POLLNVAL take the same exit: with nothing to
		 * read they would otherwise make this loop spin. */
		if (pfds[1].revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)) {
			await_proceed_relay_child_error(c2p_r);
			return -1;
		}

		if (!(pfds[0].revents & POLLIN)) {
			/* CTL_IN closed or errored -- the BEAM port is
			 * gone; treat as -1 (main cleans up). */
			if (pfds[0].revents & (POLLHUP | POLLERR | POLLNVAL))
				return -1;
			continue;
		}

		/* Same 8 KiB ceiling as child_read_command -- this buffer
		 * has to accommodate {:seccomp_install, <<bpf>>} before
		 * we forward it verbatim to p2c. */
		uint8_t buf[8192];
		ssize_t len = read_frame(buf, sizeof buf);
		if (len < 0)
			return -1;
		int idx = 0, version;
		if (ei_decode_version((const char *)buf, &idx, &version) < 0)
			return -1;
		int type, size;
		if (ei_get_type((const char *)buf, &idx, &type, &size) < 0)
			return -1;

		if (type == ERL_SMALL_ATOM_UTF8_EXT || type == ERL_ATOM_UTF8_EXT ||
		    type == ERL_ATOM_EXT || type == ERL_SMALL_ATOM_EXT) {
			char atom[MAXATOMLEN];
			if (ei_decode_atom((const char *)buf, &idx, atom) < 0)
				return -1;
			/* :proceed is forwarded as the sentinel that ends
			 * the child's command loop. */
			if (strcmp(atom, "proceed") == 0)
				return await_proceed_forward(p2c_w, c2p_r, buf,
							     (uint32_t)len);
			if (strcmp(atom, "abort") == 0)
				return 1;
			return -1;
		}

		/* Tuple commands valid at the checkpoint:
		 *   {:pty_winsize, _}            -- applied in-agent
		 *   {:cap_drop_bounding, _}      -- forwarded to child
		 *   {:cap_set_thread, _, _, _}   -- forwarded to child
		 *   {:cap_set_ambient, _}        -- forwarded to child
		 *   {:seccomp_install, <<bpf>>}  -- forwarded to child */
		if (type != ERL_SMALL_TUPLE_EXT && type != ERL_LARGE_TUPLE_EXT)
			return -1;
		int arity;
		if (ei_decode_tuple_header((const char *)buf, &idx, &arity) < 0)
			return -1;
		char tag[MAXATOMLEN];
		if (ei_decode_atom((const char *)buf, &idx, tag) < 0)
			return -1;

		/* Cap and seccomp commands -- forward the frame verbatim.
		 * The child decodes and applies; failures come back on c2p
		 * as an {:error, errno, stage} ei frame. */
		if ((strcmp(tag, "cap_drop_bounding") == 0 && arity == 2) ||
		    (strcmp(tag, "cap_set_thread") == 0 && arity == 4) ||
		    (strcmp(tag, "cap_set_ambient") == 0 && arity == 2) ||
		    (strcmp(tag, "seccomp_install") == 0 && arity == 2)) {
			if (await_proceed_forward(p2c_w, c2p_r, buf,
						  (uint32_t)len) < 0)
				return -1;
			continue;
		}

		if (strcmp(tag, "pty_winsize") != 0 || arity != 2)
			return -1;
		int tarity;
		if (ei_decode_tuple_header((const char *)buf, &idx, &tarity) < 0 ||
		    tarity != 4)
			return -1;
		long rows, cols, xpix, ypix;
		if (ei_decode_long((const char *)buf, &idx, &rows) < 0 ||
		    ei_decode_long((const char *)buf, &idx, &cols) < 0 ||
		    ei_decode_long((const char *)buf, &idx, &xpix) < 0 ||
		    ei_decode_long((const char *)buf, &idx, &ypix) < 0)
			return -1;
		/* Out-of-range values (winsize fields are unsigned short)
		 * and non-PTY sessions are ignored rather than rejected;
		 * the ioctl itself is best-effort. */
		if (pty_master >= 0 &&
		    rows >= 0 && cols >= 0 && xpix >= 0 && ypix >= 0 &&
		    rows <= 0xFFFF && cols <= 0xFFFF &&
		    xpix <= 0xFFFF && ypix <= 0xFFFF) {
			struct winsize ws = {
				.ws_row = (unsigned short)rows,
				.ws_col = (unsigned short)cols,
				.ws_xpixel = (unsigned short)xpix,
				.ws_ypixel = (unsigned short)ypix,
			};
			(void)ioctl(pty_master, TIOCSWINSZ, &ws);
		}
		/* Loop for the next command. */
	}
}
/* Tag for struct post_running_cmd: which post-:running BEAM command
 * read_post_running_command() decoded. */
enum post_running_cmd_kind {
CMD_NONE = 0, /* nothing decoded; the value a zero-initialized command carries */
CMD_SIGNAL, /* {:signal, n} */
CMD_PTY_IN, /* {:pty_in, binary} */
CMD_PTY_WINSIZE, /* {:pty_winsize, {rows, cols, xpix, ypix}} */
};
/* One decoded post-:running command. `kind` says which of the remaining
 * fields were filled in by read_post_running_command(); the others are
 * left as the caller initialized them. */
struct post_running_cmd {
enum post_running_cmd_kind kind;
int signum; /* CMD_SIGNAL */
uint8_t *bytes; /* CMD_PTY_IN -- malloc'd; caller frees */
size_t bytes_len; /* CMD_PTY_IN -- number of bytes stored in `bytes` */
/* CMD_PTY_WINSIZE -- struct winsize is unsigned short per field;
* we store as unsigned so decode bounds-checks are clear. */
unsigned ws_rows, ws_cols, ws_xpix, ws_ypix;
};
/* Decode one {:packet, 4} ETF frame from the BEAM (post-:running):
 *   {:signal, n}                     -- forward to the workload
 *   {:pty_in, binary}                -- write to the PTY master (PTY mode)
 *   {:pty_winsize, {r, c, xp, yp}}   -- TIOCSWINSZ on the PTY master
 *
 * On success the fields of `cmd` that belong to the decoded kind are
 * filled in; for CMD_PTY_IN, `cmd->bytes` is malloc'd and the caller
 * frees it.
 *
 * Returns:
 *    0 on success (cmd filled)
 *   -1 on parse failure or allocation failure (cmd untouched;
 *      recoverable). A binary whose declared size runs past the end
 *      of the received frame counts as a parse failure.
 *   -2 on EOF/IO error -- "BEAM went away"
 *   -3 on oversized frame (EMSGSIZE) -- the wire is now desynced
 *      because we consumed the 4-byte header but skipped the body;
 *      the caller must surface an error and tear down. */
static int read_post_running_command(struct post_running_cmd *cmd)
{
	/* ERL_BINARY_EXT on the wire: 1 tag byte + 4-byte big-endian size,
	 * followed by the payload. */
	enum { ETF_BINARY_HDR = 5 };

	uint8_t buf[32768];
	ssize_t len = read_frame(buf, sizeof buf);
	if (len < 0)
		return errno == EMSGSIZE ? -3 : -2;

	const char *frame = (const char *)buf;
	int idx = 0, version;
	if (ei_decode_version(frame, &idx, &version) < 0)
		return -1;
	int arity;
	if (ei_decode_tuple_header(frame, &idx, &arity) < 0 ||
	    arity != 2)
		return -1;
	char tag[MAXATOMLEN];
	if (ei_decode_atom(frame, &idx, tag) < 0)
		return -1;

	if (strcmp(tag, "signal") == 0) {
		long signum;
		if (ei_decode_long(frame, &idx, &signum) < 0)
			return -1;
		if (signum <= 0 || signum > 64)
			return -1;
		cmd->kind = CMD_SIGNAL;
		cmd->signum = (int)signum;
		return 0;
	}

	if (strcmp(tag, "pty_in") == 0) {
		/* The binary's header must lie inside the bytes we actually
		 * received before we let ei parse a size out of it. */
		if ((size_t)idx + ETF_BINARY_HDR > (size_t)len)
			return -1;
		int type, sz;
		if (ei_get_type(frame, &idx, &type, &sz) < 0)
			return -1;
		if (type != ERL_BINARY_EXT)
			return -1;
		/* ei_decode_binary copies `sz` bytes starting right after
		 * the header and has no notion of the frame length. Reject
		 * a declared size that runs past the frame so a malformed
		 * command cannot make it read beyond `buf`. */
		if (sz < 0 ||
		    (size_t)sz > (size_t)len - (size_t)idx - ETF_BINARY_HDR)
			return -1;
		/* malloc(0) may legally return NULL; ask for at least one
		 * byte so an empty binary isn't mistaken for OOM. */
		uint8_t *bytes = malloc(sz > 0 ? (size_t)sz : 1);
		if (!bytes)
			return -1;
		long got;
		if (ei_decode_binary(frame, &idx, bytes, &got) < 0) {
			free(bytes);
			return -1;
		}
		/* Publish only once decoding succeeded, so a failure leaves
		 * `cmd` exactly as the caller passed it in. */
		cmd->bytes = bytes;
		cmd->bytes_len = (size_t)got;
		cmd->kind = CMD_PTY_IN;
		return 0;
	}

	if (strcmp(tag, "pty_winsize") == 0) {
		int tarity;
		if (ei_decode_tuple_header(frame, &idx, &tarity) < 0 ||
		    tarity != 4)
			return -1;
		long rows, cols, xpix, ypix;
		if (ei_decode_long(frame, &idx, &rows) < 0 ||
		    ei_decode_long(frame, &idx, &cols) < 0 ||
		    ei_decode_long(frame, &idx, &xpix) < 0 ||
		    ei_decode_long(frame, &idx, &ypix) < 0)
			return -1;
		/* struct winsize fields are unsigned short. */
		if (rows < 0 || cols < 0 || xpix < 0 || ypix < 0 ||
		    rows > 0xFFFF || cols > 0xFFFF ||
		    xpix > 0xFFFF || ypix > 0xFFFF)
			return -1;
		cmd->kind = CMD_PTY_WINSIZE;
		cmd->ws_rows = (unsigned)rows;
		cmd->ws_cols = (unsigned)cols;
		cmd->ws_xpix = (unsigned)xpix;
		cmd->ws_ypix = (unsigned)ypix;
		return 0;
	}
	return -1;
}
/* Report `n` bytes of workload terminal output to the BEAM: encode
 * them as a {:pty_out, binary} term and hand the buffer to emit_buff
 * for delivery on fd 4. */
static void emit_pty_out(const uint8_t *bytes, size_t n)
{
	ei_x_buff out;

	ei_x_new_with_version(&out);
	ei_x_encode_tuple_header(&out, 2);	/* {tag, payload} */
	ei_x_encode_atom(&out, "pty_out");
	ei_x_encode_binary(&out, bytes, (long)n);
	emit_buff(&out);
}
/* The post-exec supervise loop. Dispatches three kinds of BEAM commands
 * on CTL_IN -- {:signal, n} (forward to the workload), {:pty_in, binary}
 * (write to the PTY master, PTY mode only), and {:pty_winsize, {r, c,
 * xp, yp}} (TIOCSWINSZ on the master) -- reaps the workload via
 * SIGCHLD-on-signalfd, and (in PTY mode) forwards bytes the workload
 * writes to its terminal as {:pty_out, binary} on fd 4. Emits the
 * terminal event ({:status, :exited, _} or {:status, :signaled, _})
 * and returns when the child is gone.
 *
 * SIGCHLD is captured via signalfd (set up in main, blocked from normal
 * delivery in the agent's signal mask); the child unblocks SIGCHLD again
 * before execve, so the workload sees default semantics.
 *
 * `child_pid` is the host pid to signal and reap. `sigfd` is the
 * non-blocking signalfd carrying SIGCHLD. `pty_master` is -1 when stdio
 * is not :pty; otherwise it's the parent's end of the PTY pair created
 * before clone/fork.
 *
 * Also returns (without a terminal event) if poll() fails with anything
 * other than EINTR. */
static void supervise(pid_t child_pid, int sigfd, int pty_master)
{
	struct pollfd pfds[3] = {
		{ .fd = CTL_IN, .events = POLLIN },
		{ .fd = sigfd, .events = POLLIN },
		{ .fd = pty_master, .events = POLLIN },	/* fd = -1 when no PTY */
	};

	for (;;) {
		int rc = poll(pfds, 3, -1);
		if (rc < 0) {
			if (errno == EINTR)
				continue;
			fprintf(stderr, "linx_process: poll: %s\n",
				strerror(errno));
			return;
		}

		/* BEAM command on fd 3: {:signal, n}, {:pty_in, bytes},
		 * or {:pty_winsize, {r, c, xp, yp}}. If the BEAM has
		 * disappeared we keep going so the child finishes
		 * naturally, but stop polling that side. */
		if (pfds[0].revents & POLLIN) {
			struct post_running_cmd cmd = { 0 };
			int r = read_post_running_command(&cmd);
			if (r == 0) {
				switch (cmd.kind) {
				case CMD_SIGNAL:
					kill(child_pid, cmd.signum);
					break;
				case CMD_PTY_IN:
					if (pty_master >= 0)
						(void)write_exact(pty_master,
								  cmd.bytes,
								  cmd.bytes_len);
					free(cmd.bytes);
					break;
				case CMD_PTY_WINSIZE:
					if (pty_master >= 0) {
						struct winsize ws = {
							.ws_row = (unsigned short)cmd.ws_rows,
							.ws_col = (unsigned short)cmd.ws_cols,
							.ws_xpixel = (unsigned short)cmd.ws_xpix,
							.ws_ypixel = (unsigned short)cmd.ws_ypix,
						};
						/* Best-effort: a stale fd or
						 * a kernel that rejects the
						 * value just gets ignored. */
						(void)ioctl(pty_master,
							    TIOCSWINSZ, &ws);
					}
					break;
				case CMD_NONE:
					break;
				}
			} else if (r == -3) {
				/* Oversized frame on fd 3 -- the wire is
				 * desynced (we ate the 4-byte header but
				 * skipped the body). Can't recover; surface
				 * a clean error, SIGKILL the workload so
				 * the session ends, and stop polling fd 3
				 * to avoid spinning on the desynced bytes.
				 * SIGCHLD will fire shortly, the supervise
				 * loop reaps, and main returns. */
				emit_error(EMSGSIZE, "command_too_big");
				kill(child_pid, SIGKILL);
				pfds[0].fd = -1;
			} else if (r == -2) {
				/* EOF or I/O error on fd 3: the BEAM went
				 * away. Stop polling it here rather than
				 * relying on poll() also reporting
				 * POLLHUP/POLLERR -- otherwise a readable-
				 * at-EOF fd would make this loop spin. The
				 * workload is left to finish naturally. */
				pfds[0].fd = -1;
			}
			/* r == -1: unparseable but in-sync frame; drop it. */
		}
		if (pfds[0].revents & (POLLHUP | POLLERR | POLLNVAL))
			pfds[0].fd = -1;	/* poll(2) ignores fd < 0 */

		/* SIGCHLD fired. Drain the signalfd (SFD_NONBLOCK; the loop
		 * returns EAGAIN when empty) and waitpid the workload. */
		if (pfds[1].revents & POLLIN) {
			struct signalfd_siginfo si;
			while (read(sigfd, &si, sizeof si) == sizeof si)
				;
			int status;
			pid_t r = waitpid(child_pid, &status, WNOHANG);
			if (r == child_pid) {
				/* Workload exited. In PTY mode the master may
				 * still have buffered output the workload
				 * wrote just before exit; drain it before the
				 * terminal event so callers don't lose the
				 * final bytes. The master is O_NONBLOCK so
				 * the drain terminates on EAGAIN. */
				if (pty_master >= 0) {
					uint8_t buf[8192];
					while (1) {
						ssize_t n = read(pty_master,
								 buf, sizeof buf);
						if (n > 0) {
							emit_pty_out(buf, (size_t)n);
							continue;
						}
						break;
					}
				}
				if (WIFEXITED(status))
					emit_status_int("exited",
							WEXITSTATUS(status));
				else if (WIFSIGNALED(status))
					emit_status_int("signaled",
							WTERMSIG(status));
				return;
			}
			/* Spurious SIGCHLD (not our child, or already
			 * reaped). Ignore and keep polling. */
		}

		/* PTY master has bytes to read -- the workload wrote
		 * something. Forward as {:pty_out, binary}. EIO on a
		 * PTY master means the slave was closed (workload
		 * exited); waitpid picks the exit up via SIGCHLD, so
		 * we just stop polling the master. POLLHUP can arrive
		 * with buffered data still pending, so drain it too
		 * (the master is O_NONBLOCK, EAGAIN ends the drain). */
		if (pty_master >= 0 &&
		    (pfds[2].revents & (POLLIN | POLLHUP))) {
			uint8_t buf[8192];
			while (1) {
				ssize_t n = read(pty_master, buf, sizeof buf);
				if (n > 0) {
					emit_pty_out(buf, (size_t)n);
					continue;
				}
				if (n < 0 && errno == EAGAIN)
					break;
				/* n == 0, or n < 0 with EIO / EBADF /
				 * EINTR-already-handled: peer closed. */
				pfds[2].fd = -1;
				break;
			}
		}
		if (pty_master >= 0 &&
		    pfds[2].revents & (POLLERR | POLLNVAL))
			pfds[2].fd = -1;
	}
}
/* --- main -------------------------------------------------------------- */
/* Agent entry point: read one request from the BEAM, create the child,
 * run the checkpoint handshake, then supervise the workload until it is
 * gone.
 *
 * Order of operations, as coded below:
 *   1. Ignore SIGPIPE and block SIGCHLD.
 *   2. read_frame + decode_request.
 *   3. Create the c2p/p2c pipes and, when req.pty is set, the PTY pair.
 *   4. clone() (MODE_SPAWN), or enter_target_namespaces() followed by
 *      fork() (MODE_ENTER); the child runs child_fn.
 *   5. Emit "spawned", read the child's {:ready, pid} frame, emit
 *      "ready".
 *   6. await_proceed; on :abort reap the child and emit "aborted".
 *   7. await_exec_outcome; on success emit the running status, open
 *      the signalfd and hand over to supervise().
 *
 * Exit codes returned here:
 *   0 -- supervise() returned, the session was aborted, or a pre-exec
 *        error was already reported by await_exec_outcome
 *   1 -- the request frame could not be read, or await_proceed failed
 *   2 -- the request could not be decoded
 *   3 -- clone, fork, or enter_target_namespaces failed
 *   4 -- any other setup failure (sigprocmask, pipe2, PTY setup, the
 *        :ready frame, the exec-outcome relay, signalfd) */
int main(void)
{
/* Don't let a vanished BEAM kill us with SIGPIPE on a stale fd 4 --
* we'd rather see EPIPE from write() and drop out cleanly. */
signal(SIGPIPE, SIG_IGN);
/* Block SIGCHLD so signalfd can capture it (signals delivered the
* normal way bypass signalfd). The child unblocks SIGCHLD again
* before execve so the workload sees default semantics. */
sigset_t chld_mask;
sigemptyset(&chld_mask);
sigaddset(&chld_mask, SIGCHLD);
if (sigprocmask(SIG_BLOCK, &chld_mask, NULL) < 0) {
emit_error(errno, "sigprocmask");
return 4;
}
/* Read the spawn request. */
uint8_t req_buf[32768];
ssize_t req_len = read_frame(req_buf, sizeof req_buf);
if (req_len < 0) {
if (errno == EMSGSIZE) {
/* Frame exceeded our buffer cap (envs and argvs
* are the usual culprits at scale). Surface a
* clean structured error to the BEAM-side
* GenServer before bailing -- without this, the
* caller just sees the port close with no
* detail. */
emit_error(EMSGSIZE, "request_too_big");
} else {
fprintf(stderr, "linx_process: read spawn request: %s\n",
errno ? strerror(errno) : "eof");
}
return 1;
}
struct request req = { 0 };
if (decode_request(req_buf, (int)req_len, &req) < 0) {
/* The BEAM sent a {:spawn, _} / {:enter, _} we couldn't
* parse -- shape mismatch, missing required keys, invalid
* field types. Emit a structured error so the GenServer
* doesn't hang on the bare port close. */
emit_error(EINVAL, "malformed_request");
free_request(&req);
return 2;
}
/* If the request carried no env (req.env is NULL), fall back to the
* agent's own environ so the workload inherits it, rather than
* handing execve a NULL envp. */
extern char **environ;
char **child_env = req.env ? req.env : environ;
/* Two internal pipes for the checkpoint handshake. c2p uses CLOEXEC
* on the child end so a successful execve auto-closes it (the
* parent sees EOF and emits :running). */
int c2p[2], p2c[2];
if (pipe2(c2p, O_CLOEXEC) < 0 || pipe2(p2c, 0) < 0) {
emit_error(errno, "pipe2");
free_request(&req);
return 4;
}
/* Everything the child needs, including the parent's pipe ends
* (c2p_r / p2c_w) so child_fn can close its inherited copies. */
struct child_args ca = {
.c2p_w = c2p[1],
.p2c_r = p2c[0],
.c2p_r = c2p[0],
.p2c_w = p2c[1],
.argv = req.argv,
.env = child_env,
.cwd = req.cwd,
.pty_master = -1,
.pty_slave = -1,
.no_new_privs = req.no_new_privs,
};
for (int i = 0; i < 3; i++)
ca.stdio[i] = req.stdio[i];
int pty_master = -1, pty_slave = -1;
if (req.pty) {
/* Create the PTY pair in the agent (parent) so it's inherited
* across clone/fork. The child closes the master and dups the
* slave onto 0/1/2; the parent closes the slave and shuttles
* bytes between the master and fd 4 in the supervise loop. */
pty_master = posix_openpt(O_RDWR | O_NOCTTY | O_CLOEXEC | O_NONBLOCK);
if (pty_master < 0) {
emit_error(errno, "posix_openpt");
free_request(&req);
return 4;
}
if (grantpt(pty_master) < 0 || unlockpt(pty_master) < 0) {
emit_error(errno, "ptsetup");
close(pty_master);
free_request(&req);
return 4;
}
/* ptsname(3) returns a pointer into a static buffer; copy out
* before any other call could clobber it. */
char slave_path[64];
const char *p = ptsname(pty_master);
if (!p) {
emit_error(errno, "ptsname");
close(pty_master);
free_request(&req);
return 4;
}
size_t plen = strlen(p);
if (plen >= sizeof slave_path) {
emit_error(ENAMETOOLONG, "ptsname");
close(pty_master);
free_request(&req);
return 4;
}
memcpy(slave_path, p, plen + 1);
pty_slave = open(slave_path, O_RDWR | O_NOCTTY);
if (pty_slave < 0) {
emit_error(errno, "pts_open");
close(pty_master);
free_request(&req);
return 4;
}
ca.pty_master = pty_master;
ca.pty_slave = pty_slave;
}
/* NOTE(review): the switch below has no default case, so `pid` stays
* uninitialized if req.mode is neither MODE_SPAWN nor MODE_ENTER --
* presumably decode_request rejects any other mode. TODO confirm. */
pid_t pid;
switch (req.mode) {
case MODE_SPAWN: {
/* CLONE_NEW* flags chosen by the request, OR'd with SIGCHLD so
* waitpid sees the child the way it does for fork(2). The
* child runs on its own private stack -- 1 MiB is ample for
* the work it does (no recursion, no large frames). Static and
* load-bearing: the agent clones once per process lifetime, so
* this buffer is never reused; a second spawn would clobber it.
* Keep the agent single-shot. */
static char child_stack[CHILD_STACK_SIZE];
int flags = req.ns_flags | SIGCHLD;
/* The top of the buffer is passed as the child's stack pointer. */
pid = clone(child_fn, child_stack + CHILD_STACK_SIZE, flags, &ca);
if (pid < 0) {
emit_error(errno, "clone");
free_request(&req);
return 3;
}
break;
}
case MODE_ENTER: {
/* Join the target's namespaces *in the agent* before forking
* -- so the fork's child is born inside them. setns is per
* thread, the agent is single-threaded, and PID-namespace
* setns only takes effect on subsequent forks. */
if (enter_target_namespaces(&req) < 0) {
free_request(&req);
return 3;
}
pid = fork();
if (pid < 0) {
emit_error(errno, "fork");
free_request(&req);
return 3;
}
if (pid == 0) {
/* Child: reuse the same checkpoint+execve logic
* the cloned-child path runs. child_fn does its
* own fd hygiene (closing the parent ends of our
* internal pipes) on entry, so we don't need to
* touch them here. */
child_fn(&ca);
_exit(127); /* unreachable */
}
break;
}
}
/* Close the child's ends of the internal pipes in the parent. The
* child end of c2p is already CLOEXEC-closed at exec time too -- the
* close here is the parent's copy. */
close(c2p[1]);
close(p2c[0]);
/* In PTY mode, the slave is for the child only; the parent keeps the
* master. Closing the slave here removes the agent's extra reference;
* once the child also closes it (via dup2 onto 0/1/2 + close), the
* slave end goes away when the workload exits, triggering EIO on
* the master so the supervise loop notices. */
if (pty_slave >= 0)
close(pty_slave);
/* `pid` is the child's pid as seen by the agent. */
emit_status_int("spawned", (long)pid);
/* Read the child's first message -- expected: a {:ready, pid}
* ei frame on c2p. */
long child_pid;
{
uint8_t buf[256];
ssize_t len = read_frame_fd(c2p[0], buf, sizeof buf);
if (len < 0) {
/* Child died before sending :ready, or the c2p
* pipe broke. Surface errno (or EIO on EOF). */
emit_error(errno ? errno : EIO, "ready_frame");
free_request(&req);
return 4;
}
int idx = 0, version, arity;
char tag[MAXATOMLEN];
if (ei_decode_version((const char *)buf, &idx, &version) < 0 ||
ei_decode_tuple_header((const char *)buf, &idx, &arity) < 0 ||
arity != 2 ||
ei_decode_atom((const char *)buf, &idx, tag) < 0 ||
strcmp(tag, "ready") != 0 ||
ei_decode_long((const char *)buf, &idx, &child_pid) < 0) {
emit_error(EPROTO, "malformed_ready");
free_request(&req);
return 4;
}
}
/* `child_pid` is the pid the child reported about itself. */
emit_status_int("ready", child_pid);
/* Wait for :proceed (or :abort) from the BEAM. await_proceed
* forwards both :proceed and any K2 cap_* commands to the child
* via p2c[1] as ei frames. On :abort we close p2c[1] so the
* child sees EOF and _exits without execve'ing, then we reap and
* emit {:status, :aborted, ...}.
*
* A negative return here is most commonly EOF on fd 3 -- the BEAM
* port closing because its owning GenServer died, which is a routine
* cleanup path (not an error worth a stderr line). */
int decision = await_proceed(pty_master, p2c[1], c2p[0]);
if (decision < 0) {
free_request(&req);
return 1;
}
if (decision == 1) {
/* :abort -- the child is parked reading p2c[0]. Closing our
* write end without sending a :proceed frame gives it EOF;
* child _exits 102 (see child_fn / child_read_command). */
close(p2c[1]);
close(c2p[0]);
int status;
if (waitpid(pid, &status, 0) < 0) {
fprintf(stderr,
"linx_process: waitpid after abort: %s\n",
strerror(errno));
}
/* Emit the same shape as :ready -- {:status, :aborted, pid}.
* `child_pid` is the pidns-internal pid the child sent us
* earlier (matches what we delivered with :ready). */
emit_status_int("aborted", child_pid);
if (pty_master >= 0)
close(pty_master);
free_request(&req);
return 0;
}
/* :proceed was forwarded inside await_proceed. Close our write end
* so the child won't block on a subsequent read if anything else
* shows up on fd 3 -- it will execve from where it is. */
close(p2c[1]);
/* The child either execve's successfully (the c2p pipe closes on
* exec via CLOEXEC, we see EOF) or fails before exec (an
* {:error, errno, stage} frame). */
int outcome = await_exec_outcome(c2p[0]);
close(c2p[0]);
if (outcome == 1) {
/* :error already emitted. Reap the child to avoid a
* zombie, then exit. */
int status;
waitpid(pid, &status, 0);
free_request(&req);
return 0;
}
if (outcome < 0) {
/* await_exec_outcome failed to read either an :error
* frame or a clean EOF from c2p. Rare -- usually means
* the child died in a way that left the pipe broken. */
emit_error(EIO, "exec_outcome");
free_request(&req);
return 4;
}
emit_status_running();
/* Capture SIGCHLD via signalfd so the supervise loop can multiplex
* it against fd 3 in a single poll(). The mask was blocked early
* in main. Non-blocking so the drain loop in supervise() terminates
* rather than hanging on a quiet signalfd. */
int sigfd = signalfd(-1, &chld_mask, SFD_CLOEXEC | SFD_NONBLOCK);
if (sigfd < 0) {
emit_error(errno, "signalfd");
free_request(&req);
return 4;
}
/* supervise() returns once the workload has been reaped (or its
* poll() failed); what remains is cleanup. */
supervise(pid, sigfd, pty_master);
close(sigfd);
if (pty_master >= 0)
close(pty_master);
free_request(&req);
return 0;
}