/* c_src/linx_mount.c */

/*
 * linx_mount -- the NIF backing `Linx.Mount`.
 *
 * Wraps the three filesystem-mount syscalls:
 *
 *   - mount(2)        -- mount/8
 *   - umount2(2)      -- umount/3
 *   - pivot_root(2)   -- pivot_root/3
 *
 * NAMESPACE TARGETING
 * -------------------
 * Each fallible function takes an `ns_path` binary argument:
 *
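 *   - empty binary -- perform the syscall in the caller's namespace
 *     (the BEAM's mount namespace). No thread is spawned, except by
 *     pivot_root/3, which always runs on a worker thread so that its
 *     chdir never touches the BEAM's working directory.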
 *   - non-empty -- a path to a namespace file (typically
 *     `/proc/<pid>/ns/mnt`). The NIF spawns a throwaway pthread,
 *     opens the path with O_RDONLY|O_CLOEXEC, `setns(fd, CLONE_NEWNS)`s
 *     into it, performs the syscall there, and exits the thread.
 *     setns(2) operates per-thread, so the BEAM's own scheduler
 *     threads never enter the target namespace -- the throwaway
 *     thread's namespace membership is destroyed when it exits.
 *
 * Same throwaway-thread pattern as `c_src/netlink_socket.c`'s
 * `open_in_netns` -- both use the same trick to perform a
 * per-thread namespace switch without contaminating the BEAM.
 *
 * ERROR SHAPE
 * -----------
 * Every fallible function returns either `:ok` or
 * `{:error, {Stage::atom, ErrnoAtom | ErrnoInt}}`. Stages:
 *
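 *   - :mount / :umount / :pivot_root -- the target syscall failed.
 *   - :unshare -- the worker couldn't get a private fs_struct
 *     (unshare(CLONE_FS)).
 *   - :open_ns -- couldn't open the namespace file.
 *   - :setns   -- couldn't enter the target namespace.
 *   - :create  -- couldn't create the placeholder file at the target.
 *   - :open_pidns / :setns_pid / :pipe / :fork -- a step of the
 *     fork-into-PID-namespace mount path failed.
 *   - :chdir   -- pivot_root's chdir into new_root failed.
 *   - :thread  -- couldn't create the worker thread.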
 *
 * Common Linux errnos are mapped to POSIX-style atoms; any errno
 * not in the table falls back to the raw integer.
 */

#include <erl_nif.h>

#include <errno.h>
#include <fcntl.h>
#include <sched.h>      /* setns, CLONE_NEWNS */
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/syscall.h> /* SYS_pivot_root (no glibc wrapper) */
#include <sys/wait.h>    /* waitpid -- reap the pidns mount fork */
#include <unistd.h>

/* Identifier string that version/0 hands back via enif_make_string. */
#define LINX_MOUNT_VERSION "linx_mount"

/* --- errno -> atom ------------------------------------------------------- */

/* Translate an errno value into the name of the atom used to report
 * it. Returns a pointer to a static string for the errnos listed in
 * the table below, or NULL for any other value (the caller then falls
 * back to the raw integer). */
static const char *errno_atom(int err)
{
	static const struct {
		int code;
		const char *name;
	} known[] = {
		{ EACCES,       "eacces"       },
		{ EAGAIN,       "eagain"       },
		{ EBADF,        "ebadf"        },
		{ EBUSY,        "ebusy"        },
		{ EFAULT,       "efault"       },
		{ EINVAL,       "einval"       },
		{ ELOOP,        "eloop"        },
		{ EMFILE,       "emfile"       },
		{ ENAMETOOLONG, "enametoolong" },
		{ ENODEV,       "enodev"       },
		{ ENOENT,       "enoent"       },
		{ ENOMEM,       "enomem"       },
		{ ENOTBLK,      "enotblk"      },
		{ ENOTDIR,      "enotdir"      },
		{ ENXIO,        "enxio"        },
		{ EOPNOTSUPP,   "eopnotsupp"   },
		{ EPERM,        "eperm"        },
		{ EROFS,        "erofs"        },
		{ ESRCH,        "esrch"        },
	};

	for (size_t i = 0; i < sizeof known / sizeof known[0]; i++) {
		if (known[i].code == err)
			return known[i].name;
	}

	return NULL;
}

/* Build the failure term {error, {Stage, Reason}}.
 *
 * `stage` becomes the Stage atom. Reason is the atom named by
 * errno_atom(err) when the errno is in its table, and the plain
 * integer `err` otherwise. */
static ERL_NIF_TERM make_error(ErlNifEnv *env, const char *stage, int err)
{
	const char *errno_name = errno_atom(err);
	ERL_NIF_TERM reason;

	if (errno_name != NULL)
		reason = enif_make_atom(env, errno_name);
	else
		reason = enif_make_int(env, err);

	ERL_NIF_TERM detail =
		enif_make_tuple2(env, enif_make_atom(env, stage), reason);

	return enif_make_tuple2(env, enif_make_atom(env, "error"), detail);
}

/* The success term: the atom `ok`, created in `env`. */
static ERL_NIF_TERM ok_atom(ErlNifEnv *env)
{
	ERL_NIF_TERM ok = enif_make_atom(env, "ok");

	return ok;
}

/* --- string args -------------------------------------------------------- */

/* Copy an Elixir binary into a freshly-allocated, NUL-terminated C
 * string. The caller owns the result and frees it with `enif_free`.
 *
 * Returns NULL when:
 *   - `term` is not a binary,
 *   - the binary contains an embedded NUL byte, or
 *   - the allocation fails.
 *
 * Embedded NULs are rejected because every consumer in this file
 * treats the result as a C string (paths and option strings handed to
 * open/mount/umount2/chdir/pivot_root). A binary such as
 * <<"/safe", 0, "/elsewhere">> would otherwise be silently truncated
 * at the first NUL and name a different path than the caller checked. */
static char *binary_to_cstr(ErlNifEnv *env, ERL_NIF_TERM term)
{
	ErlNifBinary bin;
	if (!enif_inspect_binary(env, term, &bin))
		return NULL;

	/* Refuse anything that cannot round-trip through a C string. */
	if (bin.size > 0 && memchr(bin.data, '\0', bin.size) != NULL)
		return NULL;

	char *s = enif_alloc(bin.size + 1);
	if (!s)
		return NULL;

	/* Skip the copy for an empty binary rather than calling memcpy
	 * with a zero length. */
	if (bin.size > 0)
		memcpy(s, bin.data, bin.size);
	s[bin.size] = '\0';
	return s;
}

/* --- version/0 ---------------------------------------------------------- */

/* version/0 -- return LINX_MOUNT_VERSION as a Latin-1 string term.
 * Takes no arguments, so argc/argv are deliberately unused. */
static ERL_NIF_TERM version(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
	(void)argv;
	(void)argc;

	ERL_NIF_TERM id = enif_make_string(env, LINX_MOUNT_VERSION, ERL_NIF_LATIN1);

	return id;
}

/* --- worker job structs ------------------------------------------------- */

/* Both mount and umount workers share this header. Specific args follow
 * via the dedicated structs. */
struct ns_job_result {
	int err;            /* errno from the failing step, or 0 */
	const char *stage;  /* "open_ns" | "setns" | "mount" | "umount" */
};

/* Arguments and result for mount_worker. nif_mount builds one on its
 * own stack, passes its address to the worker thread and joins that
 * thread before reading `r`. The strings are owned (and later freed)
 * by nif_mount. */
struct mount_job {
	struct ns_job_result r; /* filled in by the worker on failure */
	const char *ns_path;    /* mount-namespace file the worker enters */
	const char *source;     /* NULL if not specified */
	const char *target;     /* mount point, resolved inside the target ns */
	const char *fstype;     /* NULL if not specified */
	unsigned long flags;    /* mountflags argument passed to mount(2) */
	const char *data;       /* NULL if not specified */
	const char *pidns_path; /* non-empty => fork into this pid ns to mount (procfs) */
	int create_target;      /* create an empty file at target (in-ns) before mounting */
};

/* Arguments and result for umount_worker. Built on nif_umount's stack
 * and read back after the worker thread has been joined. */
struct umount_job {
	struct ns_job_result r; /* filled in by the worker on failure */
	const char *ns_path;    /* mount-namespace file the worker enters */
	const char *target;     /* mount point to detach */
	int flags;              /* flags argument passed to umount2(2) */
};

/* Arguments and result for pivot_root_worker. Built on
 * nif_pivot_root's stack and read back after the worker thread has
 * been joined. */
struct pivot_root_job {
	struct ns_job_result r; /* filled in by the worker on failure */
	const char *ns_path;   /* empty == BEAM ns */
	const char *new_root;  /* the worker chdir()s here, then pivots onto it */
	const char *put_old;   /* second argument of the pivot_root syscall */
};

/* Move the calling (throwaway) thread into the mount namespace named
 * by `ns_path`. Shared by the mount, umount and pivot_root workers.
 *
 * Returns the open namespace fd on success -- the caller closes it
 * once its syscall is done -- or -1 on failure, in which case
 * r->err holds the errno and r->stage names the step that failed
 * ("unshare", "open_ns" or "setns").
 *
 * CRITICAL: the kernel's mntns_install() refuses setns(CLONE_NEWNS)
 * if the calling thread's fs_struct is shared with any other thread
 * -- specifically, it returns EINVAL when `fs->users != 1`. Every
 * scheduler thread in the BEAM shares one fs_struct, so a naked
 * setns from a throwaway thread fails. The fix is to call
 * unshare(CLONE_FS) first to give this thread its own fs_struct,
 * separating it from the rest of the BEAM. The unshare only affects
 * the calling thread; when the thread exits, its fs_struct is
 * discarded. Same trick `nsenter(1)` and similar userspace tools use
 * when they need to enter another mount namespace.
 *
 * The unshare therefore has to come before the setns; doing it first
 * of all also means the namespace file is opened through the
 * thread's private fs_struct rather than the BEAM's shared one. */
static int enter_target_ns(struct ns_job_result *r, const char *ns_path)
{
	const char *failed_stage;
	int saved_errno;
	int ns_fd;

	/* Step 1: private fs_struct, or the setns below gets EINVAL. */
	if (unshare(CLONE_FS) < 0) {
		saved_errno = errno;
		failed_stage = "unshare";
		goto fail;
	}

	/* Step 2: get a handle on the target namespace. */
	ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
	if (ns_fd < 0) {
		saved_errno = errno;
		failed_stage = "open_ns";
		goto fail;
	}

	/* Step 3: switch this thread over. */
	if (setns(ns_fd, CLONE_NEWNS) < 0) {
		saved_errno = errno;   /* captured before close() can touch errno */
		failed_stage = "setns";
		close(ns_fd);
		goto fail;
	}

	return ns_fd;

fail:
	r->err = saved_errno;
	r->stage = failed_stage;
	return -1;
}

/* Make sure something exists at `target`, creating an empty regular
 * file (mode 0644) when nothing does. The file is a placeholder for a
 * device-node bind mount onto a fresh tmpfs (e.g. /dev/null).
 *
 * On the cross-namespace path this runs on the worker thread after it
 * has entered the target mount namespace, so the file lands on the
 * in-container tmpfs (a host-side creat would land on the directory
 * underneath the tmpfs, invisible in the container).
 *
 * Returns 0 if the path already existed or was created, otherwise the
 * errno from the failed open(). */
static int ensure_target_file(const char *target)
{
	/* An existing path is never opened; only a missing one is created. */
	if (access(target, F_OK) != 0) {
		int placeholder = open(target, O_CREAT | O_WRONLY | O_CLOEXEC, 0644);

		if (placeholder < 0)
			return errno;

		/* Only the directory entry matters, not the descriptor. */
		close(placeholder);
	}

	return 0;
}

/* Mount from inside the target PID namespace. setns(CLONE_NEWPID)
 * only arms the namespace for *future children*, and procfs binds to
 * the mounting task's active pid namespace -- so to get a /proc that
 * reflects the container's pids we must fork() and let the child do
 * the mount. The worker thread has already entered the target mount
 * namespace, so the fork child inherits it.
 *
 * On failure j->r.err / j->r.stage are set; the stage is one of
 * "open_pidns", "setns_pid", "pipe", "fork" or "mount". On success
 * j->r is left untouched. Nothing is returned.
 *
 * ASYNC-SIGNAL-SAFE CONTRACT: fork() in the multithreaded BEAM
 * duplicates only this thread; a lock held by any other thread stays
 * locked forever in the child. The child therefore touches NOTHING
 * but direct syscalls (mount, write, close, _exit) -- no malloc, no
 * erl_nif, no pthread primitives -- and reports its errno over a pipe.
 * Every buffer it reads (j->target etc.) was allocated before fork.
 *
 * NOTE(review): the pipe ends are created without close-on-exec, so a
 * concurrent fork+exec elsewhere in the process could inherit them;
 * pipe2(O_CLOEXEC) would close that window. */
static void do_mount_in_pidns(struct mount_job *j)
{
	int pfd[2];
	pid_t pid;
	int child_err = 0;
	int status;
	ssize_t n;

	int pidns = open(j->pidns_path, O_RDONLY | O_CLOEXEC);
	if (pidns < 0) {
		j->r.err = errno;
		j->r.stage = "open_pidns";
		return;
	}

	/* Affects only where children of this thread are created. */
	if (setns(pidns, CLONE_NEWPID) < 0) {
		j->r.err = errno;
		j->r.stage = "setns_pid";
		goto out;
	}

	if (pipe(pfd) < 0) {
		j->r.err = errno;
		j->r.stage = "pipe";
		goto out;
	}

	pid = fork();
	if (pid < 0) {
		j->r.err = errno;
		j->r.stage = "fork";
		close(pfd[0]);
		close(pfd[1]);
		goto out;
	}

	if (pid == 0) {
		/* CHILD -- async-signal-safe only (see contract above). */
		close(pfd[0]);
		int e = 0;
		if (mount(j->source, j->target, j->fstype, j->flags, j->data) < 0)
			e = errno;
		ssize_t w = write(pfd[1], &e, sizeof e);
		(void)w;
		_exit(0);
	}

	/* PARENT (the worker thread). Close our copy of the write end so
	 * the read below sees EOF if the child dies without reporting. */
	close(pfd[1]);

	/* Retry on EINTR, like the waitpid below: an interrupted read must
	 * not be reported as a failed mount. */
	do {
		n = read(pfd[0], &child_err, sizeof child_err);
	} while (n < 0 && errno == EINTR);
	close(pfd[0]);

	/* Reap the child; any other waitpid failure ends the loop. */
	while (waitpid(pid, &status, 0) < 0 && errno == EINTR)
		;

	if (n != (ssize_t)sizeof child_err) {
		/* couldn't read the child's errno -- report a generic failure */
		j->r.err = EINVAL;
		j->r.stage = "mount";
	} else if (child_err != 0) {
		j->r.err = child_err;
		j->r.stage = "mount";
	}

out:
	close(pidns);
}

/* Thread entry point for a cross-namespace mount. `arg` is a
 * `struct mount_job *`; the outcome is reported through job->r and
 * the thread's return value is always NULL.
 *
 * Sequence: enter the target mount namespace, optionally create the
 * placeholder file at the target, then mount -- either directly or,
 * when a pid-namespace path was supplied, through a forked child. */
static void *mount_worker(void *arg)
{
	struct mount_job *job = arg;

	int ns_fd = enter_target_ns(&job->r, job->ns_path);
	if (ns_fd < 0)
		return NULL;   /* enter_target_ns already recorded the failure */

	if (job->create_target) {
		int create_err = ensure_target_file(job->target);
		if (create_err != 0) {
			job->r.err = create_err;
			job->r.stage = "create";
			goto out;
		}
	}

	if (job->pidns_path == NULL || job->pidns_path[0] == '\0') {
		/* Ordinary case: mount straight from this thread. */
		if (mount(job->source, job->target, job->fstype,
		          job->flags, job->data) < 0) {
			job->r.err = errno;
			job->r.stage = "mount";
		}
	} else {
		/* The mount must be issued from inside the pid namespace. */
		do_mount_in_pidns(job);
	}

out:
	close(ns_fd);
	return NULL;
}

/* Thread entry point for a cross-namespace unmount. `arg` is a
 * `struct umount_job *`; the outcome is reported through job->r and
 * the thread's return value is always NULL. */
static void *umount_worker(void *arg)
{
	struct umount_job *job = arg;
	int ns_fd = enter_target_ns(&job->r, job->ns_path);

	if (ns_fd >= 0) {
		int rc = umount2(job->target, job->flags);

		if (rc < 0) {
			job->r.err = errno;
			job->r.stage = "umount";
		}

		close(ns_fd);
	}

	/* On ns_fd < 0 the failure was recorded by enter_target_ns. */
	return NULL;
}

/* Thread entry point for pivot_root. `arg` is a
 * `struct pivot_root_job *`; the outcome is reported through job->r
 * and the thread's return value is always NULL.
 *
 * The kernel insists that the caller's CWD be inside new_root, so the
 * worker chdir()s there first. Doing that on a throwaway thread with
 * a private fs_struct keeps the BEAM's own CWD untouched.
 *
 * A non-empty ns_path goes through enter_target_ns (which unshares
 * CLONE_FS itself). An empty ns_path means the BEAM's own mount
 * namespace: the open+setns is skipped, but unshare(CLONE_FS) is
 * still needed so the chdir stays private to this thread. */
static void *pivot_root_worker(void *arg)
{
	struct pivot_root_job *job = arg;
	int ns_fd = -1;   /* stays -1 when no namespace file was opened */

	if (job->ns_path[0] != '\0') {
		ns_fd = enter_target_ns(&job->r, job->ns_path);
		if (ns_fd < 0)
			return NULL;
	} else if (unshare(CLONE_FS) < 0) {
		job->r.err = errno;
		job->r.stage = "unshare";
		return NULL;
	}

	if (chdir(job->new_root) < 0) {
		job->r.err = errno;
		job->r.stage = "chdir";
	} else if (syscall(SYS_pivot_root, job->new_root, job->put_old) < 0) {
		/* Raw syscall: glibc doesn't ship a pivot_root() wrapper. */
		job->r.err = errno;
		job->r.stage = "pivot_root";
	}

	if (ns_fd >= 0)
		close(ns_fd);
	return NULL;
}

/* --- mount/8 ------------------------------------------------------------ */

/* mount/8.
 *
 * Args, in order:
 *   0 source        (binary)
 *   1 target        (binary)
 *   2 fstype        (binary)
 *   3 flags         (uint64)
 *   4 data          (binary)
 *   5 ns_path       (binary)
 *   6 pidns_path    (binary)
 *   7 create_target (int)
 *
 * Returns `ok`, `{error, {Stage, Errno}}`, or raises badarg when an
 * argument has the wrong type.
 *
 * Empty `ns_path`: mount in the caller's namespace (no thread).
 * Non-empty `ns_path`: spawn a worker that enters that namespace.
 *
 * Non-empty `pidns_path` (a /proc/<pid>/ns/pid file): the worker also
 * enters that PID namespace and fork()s a child to perform the mount,
 * so procfs binds to the container's pid namespace (see
 * do_mount_in_pidns). Only meaningful alongside a non-empty ns_path;
 * it is ignored otherwise.
 *
 * `create_target` != 0: create an empty file at `target` (inside the
 * target mount ns) before mounting -- a placeholder for a device-node
 * bind onto a fresh tmpfs (see ensure_target_file).
 *
 * `source`, `fstype` and `data` may be empty binaries; the NIF passes
 * NULL to mount(2) for any that are empty (kernel-idiomatic for
 * propagation changes, MS_MOVE, MS_REMOUNT, etc.). */
static ERL_NIF_TERM nif_mount(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
	(void)argc;

	/* Decode the scalars first: nothing is allocated yet, so a bad
	 * value can return straight away. */
	ErlNifUInt64 flags;
	int create_target;

	if (!enif_get_uint64(env, argv[3], &flags))
		return enif_make_badarg(env);
	if (!enif_get_int(env, argv[7], &create_target))
		return enif_make_badarg(env);

	char *source     = binary_to_cstr(env, argv[0]);
	char *target     = binary_to_cstr(env, argv[1]);
	char *fstype     = binary_to_cstr(env, argv[2]);
	char *data       = binary_to_cstr(env, argv[4]);
	char *ns_path    = binary_to_cstr(env, argv[5]);
	char *pidns_path = binary_to_cstr(env, argv[6]);

	ERL_NIF_TERM result;

	if (source == NULL || target == NULL || fstype == NULL ||
	    data == NULL || ns_path == NULL || pidns_path == NULL) {
		result = enif_make_badarg(env);
	} else {
		/* Empty optional strings are handed to mount(2) as NULL. */
		const char *src_arg    = (source[0] != '\0') ? source : NULL;
		const char *fstype_arg = (fstype[0] != '\0') ? fstype : NULL;
		const char *data_arg   = (data[0] != '\0')   ? data   : NULL;
		unsigned long mnt_flags = (unsigned long)flags;

		if (ns_path[0] != '\0') {
			/* Cross-namespace -- hand the work to a worker thread and
			 * wait for it; the job lives on this stack frame. */
			struct mount_job job = {
				.r = { .err = 0, .stage = NULL },
				.ns_path = ns_path,
				.source  = src_arg,
				.target  = target,
				.fstype  = fstype_arg,
				.flags   = mnt_flags,
				.data    = data_arg,
				.pidns_path    = pidns_path,
				.create_target = create_target,
			};
			ErlNifTid tid;

			int rc = enif_thread_create("linx_mount", &tid, mount_worker, &job, NULL);
			if (rc == 0) {
				enif_thread_join(tid, NULL);
				if (job.r.err != 0)
					result = make_error(env, job.r.stage, job.r.err);
				else
					result = ok_atom(env);
			} else {
				result = make_error(env, "thread", rc);
			}
		} else {
			/* BEAM namespace -- direct syscall, no thread. (pidns_path
			 * is only meaningful with a target mount ns, so it's
			 * ignored here.) */
			int create_err = 0;

			if (create_target)
				create_err = ensure_target_file(target);

			if (create_err != 0)
				result = make_error(env, "create", create_err);
			else if (mount(src_arg, target, fstype_arg, mnt_flags, data_arg) < 0)
				result = make_error(env, "mount", errno);
			else
				result = ok_atom(env);
		}
	}

	/* Single release point for all six argument copies. */
	enif_free(source);
	enif_free(target);
	enif_free(fstype);
	enif_free(data);
	enif_free(ns_path);
	enif_free(pidns_path);

	return result;
}

/* --- umount/3 ----------------------------------------------------------- */

/* umount/3.
 *
 * Args: target (binary), flags (int), ns_path (binary).
 *
 * Returns `ok`, `{error, {Stage, Errno}}`, or raises badarg when an
 * argument has the wrong type.
 *
 * Empty `ns_path`: umount2(2) is called directly on the calling
 * thread. Non-empty `ns_path`: a worker thread enters that mount
 * namespace and unmounts there; this function waits for it. */
static ERL_NIF_TERM nif_umount(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
	(void)argc;

	int flags;
	if (!enif_get_int(env, argv[1], &flags))
		return enif_make_badarg(env);

	char *target  = binary_to_cstr(env, argv[0]);
	char *ns_path = binary_to_cstr(env, argv[2]);

	ERL_NIF_TERM result;

	if (target == NULL || ns_path == NULL) {
		result = enif_make_badarg(env);
	} else if (ns_path[0] != '\0') {
		/* Cross-namespace: the job lives on this stack frame until
		 * the worker has been joined. */
		struct umount_job job = {
			.r = { .err = 0, .stage = NULL },
			.ns_path = ns_path,
			.target  = target,
			.flags   = flags,
		};
		ErlNifTid tid;

		int rc = enif_thread_create("linx_umount", &tid, umount_worker, &job, NULL);
		if (rc == 0) {
			enif_thread_join(tid, NULL);
			if (job.r.err != 0)
				result = make_error(env, job.r.stage, job.r.err);
			else
				result = ok_atom(env);
		} else {
			result = make_error(env, "thread", rc);
		}
	} else {
		/* Caller's namespace: plain syscall. */
		int rc = umount2(target, flags);

		if (rc < 0)
			result = make_error(env, "umount", errno);
		else
			result = ok_atom(env);
	}

	enif_free(target);
	enif_free(ns_path);

	return result;
}

/* --- pivot_root/3 ------------------------------------------------------- */

/* pivot_root/3.
 *
 * Args: new_root (binary), put_old (binary), ns_path (binary).
 *
 * Returns `ok`, `{error, {Stage, Errno}}`, or raises badarg when an
 * argument is not a binary.
 *
 * Unlike mount and umount this always goes through a worker thread,
 * even for an empty `ns_path`: pivot_root needs the calling thread's
 * CWD to be inside new_root, and the BEAM's CWD must not change. The
 * worker unshare(CLONE_FS)'s first so its chdir is isolated. */
static ERL_NIF_TERM nif_pivot_root(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
{
	(void)argc;

	char *new_root = binary_to_cstr(env, argv[0]);
	char *put_old  = binary_to_cstr(env, argv[1]);
	char *ns_path  = binary_to_cstr(env, argv[2]);

	ERL_NIF_TERM result;

	if (new_root == NULL || put_old == NULL || ns_path == NULL) {
		result = enif_make_badarg(env);
	} else {
		/* The job lives on this stack frame until the worker has
		 * been joined. */
		struct pivot_root_job job = {
			.r = { .err = 0, .stage = NULL },
			.ns_path  = ns_path,
			.new_root = new_root,
			.put_old  = put_old,
		};
		ErlNifTid tid;

		int rc = enif_thread_create("linx_pivot_root", &tid, pivot_root_worker, &job, NULL);
		if (rc == 0) {
			enif_thread_join(tid, NULL);
			if (job.r.err != 0)
				result = make_error(env, job.r.stage, job.r.err);
			else
				result = ok_atom(env);
		} else {
			result = make_error(env, "thread", rc);
		}
	}

	enif_free(new_root);
	enif_free(put_old);
	enif_free(ns_path);

	return result;
}

/* --- NIF init ----------------------------------------------------------- */

/* NIF dispatch table: { name, arity, C function, scheduling flags }.
 *
 * mount/umount/pivot_root get the dirty-I/O-bound flag because
 * (a) the cross-namespace path spawns a thread + opens a file,
 * and (b) the underlying syscalls on real filesystems (NFS,
 * network mounts, large superblock reads) can take milliseconds.
 * version/0 stays on a normal scheduler -- it just returns a
 * string.
 *
 * Arities must match what each C function reads from argv:
 * nif_mount indexes argv[0..7], nif_umount and nif_pivot_root
 * index argv[0..2]. */
static ErlNifFunc nif_funcs[] = {
	{ "version",    0, version,        0                          },
	{ "mount",      8, nif_mount,      ERL_NIF_DIRTY_JOB_IO_BOUND },
	{ "umount",     3, nif_umount,     ERL_NIF_DIRTY_JOB_IO_BOUND },
	{ "pivot_root", 3, nif_pivot_root, ERL_NIF_DIRTY_JOB_IO_BOUND },
};

/* Register the table for the Elixir module Linx.Mount.Native. All
 * four callback slots are NULL: no callbacks are installed. */
ERL_NIF_INIT(Elixir.Linx.Mount.Native, nif_funcs, NULL, NULL, NULL, NULL)