/* c_src/netlink_socket.c */

/*
 * netlink_socket -- open an AF_NETLINK socket inside a given network
 * namespace, and hand the file descriptor back to the BEAM.
 *
 * This NIF exists because Linx.Netlink is otherwise pure Elixir: the BEAM can
 * open and drive a netlink socket on its own, but only in its own network
 * namespace. Reaching another netns needs setns(2), which acts per-thread --
 * unsafe to do on a BEAM scheduler, where it would strand unrelated work in
 * the wrong namespace.
 *
 * WHY A THREAD
 * ------------
 * setns(2) with CLONE_NEWNET changes the network namespace of the *calling
 * thread* only. A socket, once created, is permanently bound to whatever
 * network namespace was current on its creating thread at socket() time --
 * it carries that namespace for its whole life, no matter which thread later
 * uses the fd.
 *
 * So a throwaway thread does setns() into the target netns, opens the socket
 * there, and exits. The kernel destroys that thread's namespace membership
 * when it exits -- unconditionally, on every code path, with nothing to
 * restore -- while the socket fd survives, already pinned to the target netns
 * and usable from any BEAM thread.
 *
 * The alternative (setns in, socket(), setns back, all on a dirty scheduler
 * thread) is correct only if every error path restores the namespace; a
 * single missed branch leaves a *shared* scheduler thread in the wrong netns,
 * silently corrupting unrelated later NIF calls. The throwaway thread makes
 * that failure mode structurally impossible.
 *
 * CONTRACT
 * --------
 *   open_in_netns(NetnsPath::binary, Protocol::integer)
 *     -> {ok, Fd::integer}              the netns-pinned AF_NETLINK fd
 *     -> {error, {Stage::atom, Errno::integer}}
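 *   close_fd(Fd::integer) -> ok         close an open_in_netns fd that
 *                                       :socket.open/1 did not adopt
 *   bind_netlink(Fd::integer, Groups::integer)
 *     -> ok                             bound with nl_pid = 0 and the given
 *                                       multicast-group bitmask
 *     -> {error, Errno::integer}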
 *
 * NetnsPath names a network-namespace file -- typically /proc/<pid>/ns/net,
 * or /proc/self/ns/net for the BEAM's own netns. Protocol is the netlink
 * protocol number passed to socket(2) (NETLINK_ROUTE, NETLINK_GENERIC, ...).
 * Stage is one of: open, setns, socket, thread.
 *
 * The returned fd belongs to the caller: it is a process-wide descriptor in
 * the BEAM's OS process and is not closed here. The Elixir side adopts it
 * with :socket.open/1 -- the socket object then owns and closes the fd. If
 * that adopt fails, the fd is still the caller's; close_fd/1 disposes of it.
 */

#include <erl_nif.h>

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <linux/netlink.h> /* struct sockaddr_nl */
#include <sched.h>      /* setns, CLONE_NEWNET */
#include <string.h>
#include <sys/socket.h> /* socket, AF_NETLINK, SOCK_RAW, SOCK_CLOEXEC */
#include <unistd.h>

/* Request/response record shared between open_in_netns and its worker
 * thread: the caller fills in the inputs, the worker fills in the results. */
struct netns_job {
	/* Input: path of the network-namespace file passed to open()/setns(). */
	const char *path;
	/* Input: netlink protocol number handed to socket(2). */
	int protocol;
	/* Result: the AF_NETLINK descriptor, or a negative value on failure. */
	int sock;
	/* Result: errno captured at the step that failed, otherwise 0. */
	int err;
	/* Result: name of the step that failed ("open", "setns", "socket"). */
	const char *stage;
};

/* Thread body used by open_in_netns. `arg` is a struct netns_job: the worker
 * opens job->path, setns()es this thread into that network namespace and
 * creates the AF_NETLINK socket there. On failure it records errno and the
 * failing step in job->err / job->stage; on success job->sock holds the fd.
 *
 * Nothing is ever switched back: the namespace change applies to this thread
 * only, and the thread ends as soon as this function returns. Always
 * returns NULL -- results travel through the job record. */
static void *netns_worker(void *arg)
{
	struct netns_job *job = arg;
	int nsfd;

	nsfd = open(job->path, O_RDONLY | O_CLOEXEC);
	if (nsfd < 0) {
		job->err = errno;
		job->stage = "open";
		return NULL;
	}

	/* Only the calling thread moves into the target namespace; since this
	 * thread is thrown away afterwards, no BEAM scheduler ever executes
	 * inside it. */
	if (setns(nsfd, CLONE_NEWNET) < 0) {
		job->err = errno;
		job->stage = "setns";
		goto done;
	}

	/* A socket keeps the namespace it was created in, whichever thread
	 * uses the fd later. SOCK_CLOEXEC keeps the descriptor from leaking
	 * into any program the BEAM fork+execs afterwards, which would
	 * otherwise breach namespace confinement. */
	job->sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, job->protocol);
	if (job->sock < 0) {
		job->err = errno;
		job->stage = "socket";
	}

done:
	/* The namespace handle is no longer needed once setns() was tried. */
	close(nsfd);
	return NULL;
}

/* Construct the term {error, {Stage, Errno}}, where Stage is the atom named
 * by `stage` and Errno is `err` as an integer. */
static ERL_NIF_TERM make_error(ErlNifEnv *env, const char *stage, int err)
{
	ERL_NIF_TERM stage_atom = enif_make_atom(env, stage);
	ERL_NIF_TERM errno_term = enif_make_int(env, err);
	ERL_NIF_TERM reason = enif_make_tuple2(env, stage_atom, errno_term);

	return enif_make_tuple2(env, enif_make_atom(env, "error"), reason);
}

/* open_in_netns(NetnsPath::binary, Protocol::integer)
 *   -> {ok, Fd} | {error, {Stage, Errno}}.
 *
 * Copies NetnsPath into a NUL-terminated buffer, runs netns_worker on a
 * freshly created thread, joins it, and translates the job record into the
 * result term. Raises badarg if NetnsPath is not an iolist/binary or
 * Protocol is not an integer that fits in a C int.
 *
 * Path problems detected before any thread is started are reported under
 * the "open" stage: ENAMETOOLONG when the path does not fit in PATH_MAX
 * (including the terminator), EINVAL when it contains a NUL byte.
 *
 * Registered as ERL_NIF_DIRTY_JOB_IO_BOUND: it creates and joins a thread
 * and does filesystem opens, so it must not occupy a normal scheduler. */
static ERL_NIF_TERM open_in_netns(ErlNifEnv *env, int argc,
				  const ERL_NIF_TERM argv[])
{
	(void)argc;

	ErlNifBinary path;
	if (!enif_inspect_iolist_as_binary(env, argv[0], &path))
		return enif_make_badarg(env);

	int protocol;
	if (!enif_get_int(env, argv[1], &protocol))
		return enif_make_badarg(env);

	/* open() needs a NUL-terminated C string. */
	char cpath[PATH_MAX];
	if (path.size >= sizeof cpath)
		return make_error(env, "open", ENAMETOOLONG);
	if (path.size > 0) {
		/* A NUL inside the binary would make open() see only the
		 * prefix before it, i.e. silently open a different file than
		 * the one the caller named. Refuse instead of truncating. */
		if (memchr(path.data, '\0', path.size) != NULL)
			return make_error(env, "open", EINVAL);
		memcpy(cpath, path.data, path.size);
	}
	cpath[path.size] = '\0';

	struct netns_job job = {
		.path = cpath,
		.protocol = protocol,
		.sock = -1,
		.err = 0,
		.stage = NULL,
	};

	/* enif_thread_create returns an errno-like code (not via errno). */
	ErlNifTid tid;
	int rc = enif_thread_create("linx_netns", &tid, netns_worker, &job, NULL);
	if (rc != 0)
		return make_error(env, "thread", rc);

	/* The worker holds pointers into this stack frame (job, cpath), so it
	 * must have finished before the results are read or we return. */
	enif_thread_join(tid, NULL);

	/* Every failing step in the worker sets both stage and err, so a
	 * negative sock always comes with a valid stage name. */
	if (job.sock < 0)
		return make_error(env, job.stage, job.err);

	return enif_make_tuple2(env, enif_make_atom(env, "ok"),
				enif_make_int(env, job.sock));
}

/* close_fd(Fd::integer) -> ok
 *
 * Closes a descriptor obtained from open_in_netns. It is meant for the single
 * case where the Elixir side still owns the fd: open_in_netns succeeded but
 * :socket.open/1 did not adopt the descriptor. Raises badarg when Fd is not
 * an integer that fits in a C int; otherwise always returns ok -- the
 * result of close(2) is not reported.
 *
 * Runs as an ordinary (non-dirty) NIF because close(2) on a netlink socket
 * does not block. */
static ERL_NIF_TERM close_fd(ErlNifEnv *env, int argc,
			     const ERL_NIF_TERM argv[])
{
	int target;

	(void)argc;

	if (enif_get_int(env, argv[0], &target)) {
		close(target);
		return enif_make_atom(env, "ok");
	}

	return enif_make_badarg(env);
}

/* bind_netlink(Fd::integer, GroupsBitmask::integer) -> ok | {error, Errno}
 *
 * Binds the netlink fd with nl_pid = 0 (kernel auto-assigns) and the given
 * group-membership bitmask. Erlang's :socket.bind/2 doesn't accept netlink
 * sockaddrs, so we do this in C.
 *
 * For modern multi-group subscriptions, NETLINK_ADD_MEMBERSHIP via
 * setsockopt is preferred -- but the socket must still be bound first to
 * receive multicast (else the kernel never delivers events to it). Calling
 * this with groups=0 just binds for auto-port assignment.
 *
 * Raises badarg when Fd is not an integer that fits in a C int or
 * GroupsBitmask is not a non-negative integer that fits in an unsigned int.
 * On bind(2) failure the returned Errno is the value bind() left in errno. */
static ERL_NIF_TERM bind_netlink(ErlNifEnv *env, int argc,
				 const ERL_NIF_TERM argv[])
{
	(void)argc;

	int fd;
	if (!enif_get_int(env, argv[0], &fd))
		return enif_make_badarg(env);

	unsigned int groups;
	if (!enif_get_uint(env, argv[1], &groups))
		return enif_make_badarg(env);

	/* Fields not named here (e.g. padding) are zero-initialized. */
	struct sockaddr_nl addr = {
		.nl_family = AF_NETLINK,
		.nl_pid = 0, /* let the kernel auto-assign */
		.nl_groups = groups,
	};

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		/* Capture errno before making any further call: the order in
		 * which the arguments below are evaluated is unspecified, and
		 * a library call may change errno even when it succeeds. */
		int err = errno;

		return enif_make_tuple2(
			env, enif_make_atom(env, "error"),
			enif_make_int(env, err));
	}

	return enif_make_atom(env, "ok");
}

static ErlNifFunc nif_funcs[] = {
	{"open_in_netns", 2, open_in_netns, ERL_NIF_DIRTY_JOB_IO_BOUND},
	{"close_fd", 1, close_fd, 0},
	{"bind_netlink", 2, bind_netlink, 0},
};

/* Register nif_funcs as the NIF implementation of the Elixir module
 * Linx.Netlink.Socket.Native. All four callback slots are NULL: this library
 * supplies no lifecycle callbacks. */
ERL_NIF_INIT(Elixir.Linx.Netlink.Socket.Native, nif_funcs, NULL, NULL, NULL, NULL)