/*
* linx_sysctl -- the NIF backing `Linx.Sysctl`'s cross-namespace
* verbs.
*
* The host-side (caller's namespace) read/write/list path stays in
* pure Elixir using File.read/1 + File.write/2 + File.ls/1 over
* /proc/sys/. This NIF is only invoked when the caller passed an
* `:in` option naming a different process's namespaces.
*
* Three operations, all sharing one shape:
*
* - read_in_ns/2 -- read the file at PATH, return its bytes
* (untrimmed; the Elixir layer trims).
* - write_in_ns/3 -- write DATA to the file at PATH.
* - list_in_ns/2 -- recursively walk the directory tree at ROOT,
* returning a list of {path_binary,
* value_binary} tuples for every readable
* regular file. Unreadable files are silently
* skipped (matches the pure-Elixir list/0
* behaviour).
*
* NAMESPACE TARGETING
* -------------------
* Every operation takes an `ns_paths` argument: a list of binaries,
* each naming a `/proc/<pid>/ns/<kind>` file (typically the full
* stack: user, mount, UTS, IPC, net). The NIF spawns a throwaway
* thread that opens every fd FIRST (while it is still in the BEAM's
* own namespaces, so the paths resolve correctly) and then:
*
* 1. unshare(CLONE_FS) -- detach this thread's fs_struct
* from the BEAM's. Required before
* setns(CLONE_NEWNS); see the long
* comment in c_src/linx_mount.c.
* 2. setns(fd, 0) for each ns fd, in order -- 0 lets the kernel
* autodetect the namespace type from the file.
* 3. performs the I/O.
* 4. exits the thread.
*
* setns(2) operates per-thread, so the BEAM's own scheduler threads
* never enter the target namespace -- the throwaway thread's
* namespace membership is destroyed when it exits.
*
* The Elixir layer always passes the namespaces in the canonical
* order user -> mount -> UTS -> IPC -> net so a future unprivileged
* caller works the same way as today's privileged BEAM (user ns
* first means CAP_SYS_ADMIN is granted in that ns before we try to
* enter mount).
*
* ERROR SHAPE
* -----------
* Every operation returns `:ok` (write), `{:ok, binary}` (read), or
* `{:ok, [tuple]}` (list) on success, or
* `{:error, {stage::atom, errno_atom | errno_int}}` on failure.
* Stages:
*
* - :open_ns -- couldn't open one of the ns paths.
* - :unshare -- unshare(CLONE_FS) failed (vanishingly rare).
* - :setns -- couldn't enter one of the namespaces (typically
* EPERM in the rootless case).
* - :thread -- couldn't create the worker thread.
* - :read -- the open() or read() inside the target ns failed.
* - :write -- the open() or write() inside the target ns failed.
* - :list -- the opendir() of the root failed inside the
* target ns. Per-file failures during the walk are
* skipped silently, not returned as :list errors.
*
* Common Linux errnos are mapped to POSIX-style atoms; any errno
* not in the table falls back to the raw integer.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* setns(2), unshare(2) and CLONE_FS in <sched.h> */
#endif
#include <erl_nif.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sched.h> /* setns, CLONE_FS */
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#define LINX_SYSCTL_VERSION "linx_sysctl"
/* Per-file read cap. Sysctl files are bounded by the kernel; the
* largest knobs we know of (lists of registered congestion-control
* algorithms, available kernel symbols, etc.) stay well under 16K. */
#define LINX_SYSCTL_READ_MAX 65536
/* --- errno -> atom ------------------------------------------------------- */
/* Translate an errno value into the spelling of its POSIX-style atom.
 * Returns a pointer to a static string, or NULL when `err` is not one
 * of the values listed in the table below (the caller then falls back
 * to the raw integer). */
static const char *errno_atom(int err)
{
    static const struct {
        int code;
        const char *name;
    } known[] = {
        { EACCES,       "eacces" },
        { EAGAIN,       "eagain" },
        { EBADF,        "ebadf" },
        { EBUSY,        "ebusy" },
        { EEXIST,       "eexist" },
        { EFAULT,       "efault" },
        { EFBIG,        "efbig" },
        { EINVAL,       "einval" },
        { EIO,          "eio" },
        { EISDIR,       "eisdir" },
        { ELOOP,        "eloop" },
        { EMFILE,       "emfile" },
        { ENAMETOOLONG, "enametoolong" },
        { ENODEV,       "enodev" },
        { ENOENT,       "enoent" },
        { ENOMEM,       "enomem" },
        { ENOSPC,       "enospc" },
        { ENOSYS,       "enosys" },
        { ENOTDIR,      "enotdir" },
        { EOPNOTSUPP,   "eopnotsupp" },
        { EPERM,        "eperm" },
        { ERANGE,       "erange" },
        { EROFS,        "erofs" },
        { ESRCH,        "esrch" },
    };

    for (size_t i = 0; i < sizeof known / sizeof known[0]; i++) {
        if (known[i].code == err)
            return known[i].name;
    }
    return NULL;
}
/* Construct the term {error, {Stage, Reason}}.
 *
 * Stage is the atom spelled by `stage`. Reason is the atom returned by
 * errno_atom() for `err`, or the integer `err` itself when errno_atom()
 * has no name for it. */
static ERL_NIF_TERM make_error(ErlNifEnv *env, const char *stage, int err)
{
    ERL_NIF_TERM reason;
    const char *atom_name = errno_atom(err);

    if (atom_name != NULL)
        reason = enif_make_atom(env, atom_name);
    else
        reason = enif_make_int(env, err);

    ERL_NIF_TERM detail =
        enif_make_tuple2(env, enif_make_atom(env, stage), reason);
    return enif_make_tuple2(env, enif_make_atom(env, "error"), detail);
}
/* Shorthand for the bare `ok` atom used when building success results. */
static ERL_NIF_TERM ok_atom(ErlNifEnv *env)
{
    ERL_NIF_TERM ok = enif_make_atom(env, "ok");
    return ok;
}
/* --- input parsing ------------------------------------------------------- */
/* Copy an Elixir binary into a freshly-allocated NUL-terminated C
 * string. Caller frees with `enif_free`.
 *
 * Returns NULL when:
 *   - the term isn't a binary,
 *   - the binary contains an embedded NUL byte, or
 *   - the allocation fails.
 *
 * Embedded NULs are rejected because every string produced here is
 * handed to a path-taking syscall: the kernel would stop reading at the
 * first NUL and silently operate on a different (shorter) path than the
 * one the caller supplied. */
static char *binary_to_cstr(ErlNifEnv *env, ERL_NIF_TERM term)
{
    ErlNifBinary bin;
    if (!enif_inspect_binary(env, term, &bin))
        return NULL;

    if (bin.size > 0 && memchr(bin.data, '\0', bin.size) != NULL)
        return NULL;

    char *s = enif_alloc(bin.size + 1);
    if (!s)
        return NULL;

    /* Skip the copy for an empty binary so memcpy never sees a data
     * pointer that may not be dereferenceable. */
    if (bin.size > 0)
        memcpy(s, bin.data, bin.size);
    s[bin.size] = '\0';
    return s;
}
/* Turn an Elixir list of binaries into an enif_alloc'd array of
 * NUL-terminated strings, one per list element, in list order.
 *
 * On success stores the element count in *out_n and returns the array
 * (release it with free_cstr_array). Returns NULL -- with nothing left
 * allocated -- when the term is not a proper list, has more than 4096
 * elements, an element cannot be converted by binary_to_cstr(), or an
 * allocation fails. */
static char **list_to_cstr_array(ErlNifEnv *env, ERL_NIF_TERM list, int *out_n)
{
    unsigned count;
    /* The 4096 bound also keeps the size multiplication below from
     * overflowing. */
    if (!enif_get_list_length(env, list, &count) || count > 4096)
        return NULL;

    /* Always request at least one slot: enif_alloc(0) may return NULL,
     * which callers would take for a failure even though an empty list
     * is valid input. */
    char **strs = enif_alloc((count ? count : 1) * sizeof(char *));
    if (strs == NULL)
        return NULL;

    unsigned filled = 0;
    ERL_NIF_TERM rest = list;
    ERL_NIF_TERM item;
    while (enif_get_list_cell(env, rest, &item, &rest)) {
        char *copy = binary_to_cstr(env, item);
        if (copy == NULL) {
            /* Unwind everything converted so far. */
            while (filled > 0)
                enif_free(strs[--filled]);
            enif_free(strs);
            return NULL;
        }
        strs[filled++] = copy;
    }

    *out_n = (int)count;
    return strs;
}
/* Release an array of `n` enif_alloc'd strings and then the array
 * itself. */
static void free_cstr_array(char **arr, int n)
{
    int i = 0;
    while (i < n)
        enif_free(arr[i++]);
    enif_free(arr);
}
/* --- the setns dance ----------------------------------------------------- */
/* Result-channel struct shared by every worker. It is embedded as the
 * first member (`r`) of each *_job struct; the worker thread fills it
 * in and the NIF entry point reads it after joining the thread. A zero
 * `err` means the job succeeded. */
struct ns_job_result {
int err; /* errno from the failing step, or 0 */
/* Name of the failing step; becomes the stage atom in the error tuple. */
const char *stage; /* "open_ns" | "unshare" | "setns" | op-specific */
};
/* Open each of the `n` paths in ns_paths read-only (close-on-exec) and
 * store the descriptors, in order, in out_fds, which must have room for
 * `n` entries.
 *
 * Returns 0 when every path opened. On the first failure records the
 * errno and the "open_ns" stage in *r, closes whatever was already
 * opened, and returns -1. */
static int open_ns_fds(struct ns_job_result *r, char **ns_paths, int n, int *out_fds)
{
    int opened = 0;

    while (opened < n) {
        int fd = open(ns_paths[opened], O_RDONLY | O_CLOEXEC);
        if (fd < 0)
            break;
        out_fds[opened++] = fd;
    }
    if (opened >= n)
        return 0;

    /* Capture errno before close() gets a chance to overwrite it. */
    r->err = errno;
    r->stage = "open_ns";
    while (opened > 0)
        close(out_fds[--opened]);
    return -1;
}
/* Move the calling thread into the namespaces referred to by fds[0..n).
 *
 * First calls unshare(CLONE_FS): per the long comment in linx_mount.c,
 * setns(CLONE_NEWNS) refuses while the caller's fs_struct is shared, so
 * the thread needs its own. The thread is about to exit, so the unshare
 * is discarded with it.
 *
 * Then calls setns(fd, 0) on each fd in order; the 0 asks the kernel to
 * infer the namespace type from the file, which works for every
 * /proc/<pid>/ns/<kind> file.
 *
 * Returns 0 on success. On failure stores errno plus the stage
 * ("unshare" or "setns") in *r and returns -1. */
static int enter_ns_stack(struct ns_job_result *r, int *fds, int n)
{
    const char *failed_stage = NULL;

    if (unshare(CLONE_FS) < 0) {
        failed_stage = "unshare";
    } else {
        for (int i = 0; i < n; i++) {
            if (setns(fds[i], 0) < 0) {
                failed_stage = "setns";
                break;
            }
        }
    }

    if (failed_stage == NULL)
        return 0;

    /* Nothing ran between the failing call and here, so errno is still
     * the one that call set. */
    r->err = errno;
    r->stage = failed_stage;
    return -1;
}
/* --- file I/O (inside the target ns) ------------------------------------- */
/* Read the file at `path` into a freshly enif_alloc'd buffer.
 *
 * On success returns 0 and hands the buffer and its byte count back via
 * *out_buf / *out_len; the caller releases it with enif_free. On
 * failure returns the errno (ENOMEM for allocation failures) and leaves
 * *out_buf NULL and *out_len 0.
 *
 * The buffer starts at 4096 bytes and doubles up to
 * LINX_SYSCTL_READ_MAX; content beyond that cap is not read. */
static int read_proc_file(const char *path, char **out_buf, size_t *out_len)
{
    *out_buf = NULL;
    *out_len = 0;

    int fd = open(path, O_RDONLY | O_CLOEXEC);
    if (fd < 0)
        return errno;

    int rc = 0;
    size_t capacity = 4096;
    size_t used = 0;
    char *data = enif_alloc(capacity);
    if (data == NULL) {
        rc = ENOMEM;
        goto out_close;
    }

    for (;;) {
        if (used == capacity) {
            /* Buffer is full: grow it, unless the cap is reached. */
            if (capacity >= LINX_SYSCTL_READ_MAX)
                break;
            size_t bigger = capacity * 2;
            if (bigger > LINX_SYSCTL_READ_MAX)
                bigger = LINX_SYSCTL_READ_MAX;
            char *moved = enif_realloc(data, bigger);
            if (moved == NULL) {
                rc = ENOMEM;
                goto out_free;
            }
            data = moved;
            capacity = bigger;
        }

        ssize_t got = read(fd, data + used, capacity - used);
        if (got > 0) {
            used += (size_t)got;
            continue;
        }
        if (got == 0)
            break;              /* end of file */
        if (errno == EINTR)
            continue;           /* interrupted: retry the read */
        rc = errno;
        goto out_free;
    }

    close(fd);
    *out_buf = data;
    *out_len = used;
    return 0;

out_free:
    enif_free(data);
out_close:
    close(fd);
    return rc;
}
/* Write `len` bytes of `data` to the existing file at `path`.
 *
 * The file is opened write-only (no O_CREAT, no O_TRUNC). Short writes
 * are continued from where they stopped and EINTR is retried. A zero
 * `len` opens and closes the file without writing.
 *
 * Returns 0 on success, or the errno of the failing open()/write().
 * A write() that reports 0 bytes for a non-empty request is returned as
 * EIO: it makes no progress, so retrying it would loop forever. */
static int write_proc_file(const char *path, const char *data, size_t len)
{
    int fd = open(path, O_WRONLY | O_CLOEXEC);
    if (fd < 0)
        return errno;

    int rc = 0;
    size_t off = 0;
    while (off < len) {
        ssize_t n = write(fd, data + off, len - off);
        if (n < 0) {
            if (errno == EINTR)
                continue;
            rc = errno;
            break;
        }
        if (n == 0) {
            rc = EIO;
            break;
        }
        off += (size_t)n;
    }

    close(fd);
    return rc;
}
/* --- read_in_ns/2 worker ------------------------------------------------- */
/* Job descriptor passed from nif_read to read_worker. It is declared
 * on nif_read's stack, and nif_read joins the worker thread before it
 * reads the outputs. */
struct read_job {
/* Outcome written by the worker; r.err == 0 means success. */
struct ns_job_result r;
/* in */
const char *path; /* file to read once inside the target namespaces */
char **ns_paths; /* namespace files to open and setns() into, in order */
int n_ns; /* number of entries in ns_paths */
/* out */
char *buf; /* file contents; nif_read releases it with enif_free when non-NULL */
size_t buf_len; /* number of valid bytes in buf */
};
/* Thread body for read_in_ns/2. `arg` is a struct read_job.
 *
 * Opens the namespace fds, enters the namespaces, then reads job->path
 * into job->buf / job->buf_len. Any failure is reported through job->r
 * (stage "open_ns", "unshare", "setns" or "read"). Always returns
 * NULL. */
static void *read_worker(void *arg)
{
    struct read_job *job = arg;
    const int count = job->n_ns;

    /* Allocate at least one slot so an empty namespace list does not
     * turn into a zero-byte allocation. */
    int *ns_fds = enif_alloc((count ? count : 1) * sizeof(int));
    if (ns_fds == NULL) {
        job->r.err = ENOMEM;
        job->r.stage = "open_ns";
        return NULL;
    }

    /* open_ns_fds closes its own fds on failure, so the close loop only
     * runs when it succeeded. */
    if (open_ns_fds(&job->r, job->ns_paths, count, ns_fds) >= 0) {
        if (enter_ns_stack(&job->r, ns_fds, count) >= 0) {
            int rc = read_proc_file(job->path, &job->buf, &job->buf_len);
            if (rc != 0) {
                job->r.err = rc;
                job->r.stage = "read";
            }
        }
        for (int i = 0; i < count; i++)
            close(ns_fds[i]);
    }

    enif_free(ns_fds);
    return NULL;
}
/* read_in_ns/2. Args: path (binary), ns_paths (list of binaries).
 *
 * Runs read_worker on a freshly created thread, waits for it, and
 * returns {ok, Binary} with the file's bytes or {error, {Stage, Reason}}.
 * Raises badarg when either argument cannot be converted. */
static ERL_NIF_TERM nif_read(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
    (void)argc;

    char *path = binary_to_cstr(env, argv[0]);
    if (path == NULL)
        return enif_make_badarg(env);

    int n_ns = 0;
    char **ns_paths = list_to_cstr_array(env, argv[1], &n_ns);
    if (ns_paths == NULL) {
        enif_free(path);
        return enif_make_badarg(env);
    }

    struct read_job job;
    job.r.err = 0;
    job.r.stage = NULL;
    job.path = path;
    job.ns_paths = ns_paths;
    job.n_ns = n_ns;
    job.buf = NULL;
    job.buf_len = 0;

    ERL_NIF_TERM result;
    ErlNifBinary out;
    ErlNifTid tid;
    int rc = enif_thread_create("linx_sysctl_read", &tid, read_worker, &job, NULL);
    if (rc != 0) {
        result = make_error(env, "thread", rc);
        goto done;
    }

    /* `job` lives on this stack frame, so wait for the worker before
     * touching its outputs. */
    enif_thread_join(tid, NULL);
    if (job.r.err != 0) {
        result = make_error(env, job.r.stage, job.r.err);
        goto done;
    }

    if (enif_alloc_binary(job.buf_len, &out)) {
        memcpy(out.data, job.buf, job.buf_len);
        result = enif_make_tuple2(env, ok_atom(env), enif_make_binary(env, &out));
    } else {
        result = make_error(env, "read", ENOMEM);
    }

done:
    if (job.buf)
        enif_free(job.buf);
    enif_free(path);
    free_cstr_array(ns_paths, n_ns);
    return result;
}
/* --- write_in_ns/3 worker ------------------------------------------------ */
/* Job descriptor passed from nif_write to write_worker. It is declared
 * on nif_write's stack, and nif_write joins the worker thread before
 * returning. */
struct write_job {
/* Outcome written by the worker; r.err == 0 means success. */
struct ns_job_result r;
/* in */
const char *path; /* file to write once inside the target namespaces */
const char *data; /* bytes to write; nif_write points this at the argument binary's data */
size_t data_len; /* number of bytes at data */
char **ns_paths; /* namespace files to open and setns() into, in order */
int n_ns; /* number of entries in ns_paths */
};
/* Thread body for write_in_ns/3. `arg` is a struct write_job.
 *
 * Opens the namespace fds, enters the namespaces, then writes
 * job->data to job->path. Any failure is reported through job->r
 * (stage "open_ns", "unshare", "setns" or "write"). Always returns
 * NULL. */
static void *write_worker(void *arg)
{
    struct write_job *job = arg;
    const int count = job->n_ns;

    /* Allocate at least one slot so an empty namespace list does not
     * turn into a zero-byte allocation. */
    int *ns_fds = enif_alloc((count ? count : 1) * sizeof(int));
    if (ns_fds == NULL) {
        job->r.err = ENOMEM;
        job->r.stage = "open_ns";
        return NULL;
    }

    /* open_ns_fds closes its own fds on failure, so the close loop only
     * runs when it succeeded. */
    if (open_ns_fds(&job->r, job->ns_paths, count, ns_fds) >= 0) {
        if (enter_ns_stack(&job->r, ns_fds, count) >= 0) {
            int rc = write_proc_file(job->path, job->data, job->data_len);
            if (rc != 0) {
                job->r.err = rc;
                job->r.stage = "write";
            }
        }
        for (int i = 0; i < count; i++)
            close(ns_fds[i]);
    }

    enif_free(ns_fds);
    return NULL;
}
/* write_in_ns/3. Args: path (binary), data (binary), ns_paths (list of
 * binaries).
 *
 * Runs write_worker on a freshly created thread, waits for it, and
 * returns ok or {error, {Stage, Reason}}. Raises badarg when an
 * argument cannot be converted. */
static ERL_NIF_TERM nif_write(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
    (void)argc;

    ErlNifBinary payload;
    if (!enif_inspect_binary(env, argv[1], &payload))
        return enif_make_badarg(env);

    char *path = binary_to_cstr(env, argv[0]);
    if (path == NULL)
        return enif_make_badarg(env);

    int n_ns = 0;
    char **ns_paths = list_to_cstr_array(env, argv[2], &n_ns);
    if (ns_paths == NULL) {
        enif_free(path);
        return enif_make_badarg(env);
    }

    /* The worker reads the payload straight out of the argument binary;
     * it is joined below, before this function returns. */
    struct write_job job;
    job.r.err = 0;
    job.r.stage = NULL;
    job.path = path;
    job.data = (const char *)payload.data;
    job.data_len = payload.size;
    job.ns_paths = ns_paths;
    job.n_ns = n_ns;

    ERL_NIF_TERM result;
    ErlNifTid tid;
    int rc = enif_thread_create("linx_sysctl_write", &tid, write_worker, &job, NULL);
    if (rc == 0) {
        enif_thread_join(tid, NULL);
        if (job.r.err != 0)
            result = make_error(env, job.r.stage, job.r.err);
        else
            result = ok_atom(env);
    } else {
        result = make_error(env, "thread", rc);
    }

    enif_free(path);
    free_cstr_array(ns_paths, n_ns);
    return result;
}
/* --- list_in_ns/2 worker ------------------------------------------------- */
/* Linked-list node for a discovered entry. The walker accumulates
 * these; the NIF caller converts them into an Elixir list and frees
 * the chain.
 *
 * walk_dir pushes each new node at the head of the chain, so the chain
 * holds entries in reverse discovery order. */
struct list_node {
char *path; /* enif_alloc'd, full /proc/sys/... */
char *value; /* enif_alloc'd, raw bytes (untrimmed) */
size_t value_len; /* number of bytes at value */
struct list_node *next; /* next node in the chain, or NULL at the end */
};
/* Release every node of the chain starting at `head`, together with
 * each node's path and value buffers. A NULL head is a no-op. */
static void free_list_nodes(struct list_node *head)
{
    struct list_node *after;

    for (struct list_node *cur = head; cur != NULL; cur = after) {
        /* Grab the link before the node itself is released. */
        after = cur->next;
        enif_free(cur->path);
        enif_free(cur->value);
        enif_free(cur);
    }
}
/* Recursive directory walker.
 *
 * `buf` is a writable scratch buffer of `cap` bytes holding the
 * directory path in buf[0..len-1], NUL-terminated at buf[len]. Each
 * child's name is appended in place and the terminator at buf[len] is
 * restored afterwards, so the buffer reads the same on return as on
 * entry.
 *
 * For every regular file whose contents can be read, a list_node is
 * pushed onto the front of *head. Directories that cannot be opened,
 * entries that cannot be stat'ed or read, names that would not fit in
 * `buf`, and entries whose node allocation fails are all skipped
 * without reporting an error. */
static void walk_dir(char *buf, size_t len, size_t cap, struct list_node **head,
                     unsigned depth)
{
    /* /proc/sys is shallow; bound the recursion so a pathological tree
     * cannot exhaust the worker thread's stack. */
    if (depth > 32)
        return;

    DIR *dir = opendir(buf);
    if (dir == NULL)
        return;

    for (struct dirent *ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
        const char *name = ent->d_name;
        if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
            continue;

        /* Needs room for: current path + '/' + name + NUL. */
        size_t name_len = strlen(name);
        if (len + 1 + name_len + 1 > cap)
            continue;

        size_t child_len = len + 1 + name_len;
        buf[len] = '/';
        memcpy(&buf[len + 1], name, name_len);
        buf[child_len] = '\0';

        struct stat sb;
        if (stat(buf, &sb) != 0) {
            buf[len] = '\0';
            continue;
        }

        if (S_ISDIR(sb.st_mode)) {
            walk_dir(buf, child_len, cap, head, depth + 1);
        } else if (S_ISREG(sb.st_mode)) {
            char *contents = NULL;
            size_t contents_len = 0;
            /* A failed read is a silent skip. */
            if (read_proc_file(buf, &contents, &contents_len) == 0) {
                struct list_node *node = enif_alloc(sizeof(*node));
                char *path_copy = enif_alloc(child_len + 1);
                if (node == NULL || path_copy == NULL) {
                    /* Out of memory: drop just this entry and keep
                     * walking -- the listing is best effort anyway. */
                    if (node)
                        enif_free(node);
                    if (path_copy)
                        enif_free(path_copy);
                    enif_free(contents);
                } else {
                    memcpy(path_copy, buf, child_len + 1);
                    node->path = path_copy;
                    node->value = contents;
                    node->value_len = contents_len;
                    node->next = *head;
                    *head = node;
                }
            }
        }

        /* Cut the child name back off for the next entry. */
        buf[len] = '\0';
    }

    closedir(dir);
}
/* Job descriptor passed from nif_list to list_worker. It is declared
 * on nif_list's stack, and nif_list joins the worker thread before it
 * reads the outputs. */
struct list_job {
/* Outcome written by the worker; r.err == 0 means success. */
struct ns_job_result r;
/* in */
const char *root; /* directory to walk once inside the target namespaces */
char **ns_paths; /* namespace files to open and setns() into, in order */
int n_ns; /* number of entries in ns_paths */
/* out */
struct list_node *entries; /* chain built by walk_dir; nif_list frees it with free_list_nodes */
};
/* Thread body for list_in_ns/2. `arg` is a struct list_job.
 *
 * Opens the namespace fds, enters the namespaces, checks that job->root
 * can be opened as a directory, then walks it with walk_dir, collecting
 * entries in job->entries. Failures are reported through job->r (stage
 * "open_ns", "unshare", "setns" or "list"). Always returns NULL. */
static void *list_worker(void *arg)
{
    struct list_job *job = arg;
    const int count = job->n_ns;

    /* Allocate at least one slot so an empty namespace list does not
     * turn into a zero-byte allocation. */
    int *ns_fds = enif_alloc((count ? count : 1) * sizeof(int));
    if (ns_fds == NULL) {
        job->r.err = ENOMEM;
        job->r.stage = "open_ns";
        return NULL;
    }

    /* open_ns_fds closes its own fds on failure. */
    if (open_ns_fds(&job->r, job->ns_paths, count, ns_fds) < 0) {
        enif_free(ns_fds);
        return NULL;
    }

    if (enter_ns_stack(&job->r, ns_fds, count) >= 0) {
        /* Probe the root first: walk_dir ignores opendir failures, so
         * without this a non-existent prefix would produce an empty
         * listing instead of an error such as :enoent. */
        DIR *probe = opendir(job->root);
        if (probe == NULL) {
            job->r.err = errno;
            job->r.stage = "list";
        } else {
            closedir(probe);

            char scratch[PATH_MAX];
            size_t root_len = strlen(job->root);
            if (root_len < sizeof(scratch)) {
                memcpy(scratch, job->root, root_len + 1);
                walk_dir(scratch, root_len, sizeof(scratch), &job->entries, 0);
            } else {
                job->r.err = ENAMETOOLONG;
                job->r.stage = "list";
            }
        }
    }

    for (int i = 0; i < count; i++)
        close(ns_fds[i]);
    enif_free(ns_fds);
    return NULL;
}
/* list_in_ns/2. Args: root_path (binary), ns_paths (list of binaries).
 *
 * Runs list_worker on a freshly created thread, waits for it, and
 * converts the collected entries into an Elixir list.
 *
 * Returns {ok, [{PathBinary, ValueBinary}]} on success, or
 * {error, {Stage, Reason}} -- including {error, {list, enomem}} when a
 * result binary cannot be allocated. Raises badarg when either argument
 * cannot be converted. */
static ERL_NIF_TERM nif_list(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
    (void)argc;
    char *root = binary_to_cstr(env, argv[0]);
    if (!root)
        return enif_make_badarg(env);

    int n_ns = 0;
    char **ns_paths = list_to_cstr_array(env, argv[1], &n_ns);
    if (!ns_paths) {
        enif_free(root);
        return enif_make_badarg(env);
    }

    struct list_job job = {
        .r = { .err = 0, .stage = NULL },
        .root = root,
        .ns_paths = ns_paths,
        .n_ns = n_ns,
        .entries = NULL,
    };

    ERL_NIF_TERM result;
    ErlNifTid tid;
    int rc = enif_thread_create("linx_sysctl_list", &tid, list_worker, &job, NULL);
    if (rc != 0) {
        result = make_error(env, "thread", rc);
    } else {
        /* `job` lives on this stack frame, so wait for the worker
         * before touching its outputs. */
        enif_thread_join(tid, NULL);
        if (job.r.err) {
            result = make_error(env, job.r.stage, job.r.err);
        } else {
            /* walk_dir pushed entries at the head of the chain, and
             * each cons below pushes at the head of the list, so the
             * final list comes out in discovery order. */
            ERL_NIF_TERM list = enif_make_list(env, 0);
            int out_of_memory = 0;

            for (struct list_node *n = job.entries; n; n = n->next) {
                ErlNifBinary path_bin;
                ErlNifBinary value_bin;
                size_t plen = strlen(n->path);

                if (!enif_alloc_binary(plen, &path_bin)) {
                    out_of_memory = 1;
                    break;
                }
                if (!enif_alloc_binary(n->value_len, &value_bin)) {
                    /* path_bin was never turned into a term, so it
                     * still has to be released by hand. */
                    enif_release_binary(&path_bin);
                    out_of_memory = 1;
                    break;
                }
                memcpy(path_bin.data, n->path, plen);
                memcpy(value_bin.data, n->value, n->value_len);

                ERL_NIF_TERM tuple = enif_make_tuple2(
                    env,
                    enif_make_binary(env, &path_bin),
                    enif_make_binary(env, &value_bin));
                list = enif_make_list_cell(env, tuple, list);
            }

            /* Only a fully built list is wrapped in {ok, List}; an
             * allocation failure is returned as a plain error tuple and
             * never nested inside an ok tuple. */
            if (out_of_memory)
                result = make_error(env, "list", ENOMEM);
            else
                result = enif_make_tuple2(env, ok_atom(env), list);
        }
    }

    free_list_nodes(job.entries);
    enif_free(root);
    free_cstr_array(ns_paths, n_ns);
    return result;
}
/* --- version/0 ----------------------------------------------------------- */
/* version/0: returns LINX_SYSCTL_VERSION as a Latin-1 string term (a
 * charlist on the Elixir side). Takes no arguments. */
static ERL_NIF_TERM version(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
    (void)argv;
    (void)argc;

    ERL_NIF_TERM vsn = enif_make_string(env, LINX_SYSCTL_VERSION, ERL_NIF_LATIN1);
    return vsn;
}
/* --- NIF init ------------------------------------------------------------ */
/* NIF export table: {name, arity, C function, flags}.
 *
 * All three operations spawn a thread + do file I/O against procfs,
 * so they're dirty-I/O. version/0 stays on a normal scheduler. */
static ErlNifFunc nif_funcs[] = {
{ "version", 0, version, 0 },
{ "read_in_ns", 2, nif_read, ERL_NIF_DIRTY_JOB_IO_BOUND },
{ "write_in_ns", 3, nif_write, ERL_NIF_DIRTY_JOB_IO_BOUND },
{ "list_in_ns", 2, nif_list, ERL_NIF_DIRTY_JOB_IO_BOUND },
};
/* Register the table for the Elixir.Linx.Sysctl.Native module. The four
 * NULL arguments are the optional callback slots of ERL_NIF_INIT; this
 * library installs none of them. */
ERL_NIF_INIT(Elixir.Linx.Sysctl.Native, nif_funcs, NULL, NULL, NULL, NULL)