c_src/erllama_nif.c

// Copyright (c) 2026 Benoit Chesneau. Licensed under the MIT License.
// See the LICENSE file at the project root.

/*
 * erllama_nif: single NIF for erllama (cache + llama.cpp surface).
 *
 * v0.2 surface:
 *   crc32c(IoData) -> non_neg_integer()              [dirty CPU]
 *   fsync_dir(Path) -> ok | {error, atom()}          [dirty IO]
 *   load_model(Path, Opts) -> {ok, ModelRes} | ...   [dirty IO]
 *   free_model(ModelRes) -> ok                       [regular]
 *   new_context(ModelRes, Opts) -> {ok, CtxRes} | .. [dirty CPU]
 *   free_context(CtxRes) -> ok                       [regular]
 *   tokenize(ModelRes, Text, Opts) -> [token_id()]   [dirty CPU]
 *   kv_pack(CtxRes, _Tokens, _NTokens) -> Binary     [dirty CPU]
 *   kv_unpack(CtxRes, Binary, SeqId) -> ok | err     [dirty CPU]
 *
 * Resource ownership: model and context resources hold pointers to
 * llama.cpp objects. Their destructors call llama_model_free /
 * llama_free. The context resource also holds a refcount on its
 * model resource via enif_keep_resource so the model survives as
 * long as any context derived from it does.
 */
#include <erl_nif.h>

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Sentinel returned by erllama_safe_decode when llama_decode threw a
 * C++ exception. Distinct from any documented llama_decode return
 * (currently 0/1/-1/2). Defined here and in erllama_safe.cpp; both
 * sides must agree. */
#define ERLLAMA_DECODE_EXC_SENTINEL INT_MIN

#include "crc32c.h"
#include "llama.h"

/* Exception-safe wrappers for llama.cpp calls that can throw across
 * the C ABI. Implemented in c_src/erllama_safe.cpp. Each returns a
 * sentinel (NULL, 0, SIZE_MAX, INT32_MIN, etc.) on a thrown C++
 * exception so the C NIF can surface a clean {error, oom} or
 * {error, invalid_token} instead of letting an exception unwind into
 * a C frame. */
extern struct llama_sampler *erllama_safe_sampler_chain_init(
    struct llama_sampler_chain_params p);
extern struct llama_sampler *erllama_safe_sampler_init_greedy(void);
extern struct llama_sampler *erllama_safe_sampler_init_dist(uint32_t seed);
extern struct llama_sampler *erllama_safe_sampler_init_top_k(int32_t k);
extern struct llama_sampler *erllama_safe_sampler_init_top_p(float p,
                                                             size_t min_keep);
extern struct llama_sampler *erllama_safe_sampler_init_min_p(float p,
                                                             size_t min_keep);
extern struct llama_sampler *erllama_safe_sampler_init_temp(float t);
extern struct llama_sampler *erllama_safe_sampler_init_penalties(
    int32_t last_n, float repeat, float freq, float present);
extern int erllama_safe_sampler_chain_add(struct llama_sampler *chain,
                                          struct llama_sampler *s);
extern int erllama_safe_sampler_free(struct llama_sampler *s);
extern llama_token erllama_safe_sampler_sample(struct llama_sampler *s,
                                                struct llama_context *ctx,
                                                int32_t idx);
extern int erllama_safe_sampler_accept(struct llama_sampler *s,
                                       llama_token tok);
extern int32_t erllama_safe_token_to_piece(const struct llama_vocab *vocab,
                                           llama_token tok, char *buf,
                                           int32_t buf_size,
                                           int32_t lstrip,
                                           bool special);
extern int erllama_safe_backend_init(void);
extern int erllama_safe_backend_init_once(void);
extern int erllama_safe_backend_free(void);
extern struct llama_model *erllama_safe_model_load_from_file(
    const char *path, struct llama_model_params params);
extern int erllama_safe_model_free(struct llama_model *m);
extern struct llama_context *erllama_safe_init_from_model(
    struct llama_model *m, struct llama_context_params params);
extern int erllama_safe_free(struct llama_context *c);
extern const struct llama_model *erllama_safe_get_model(
    const struct llama_context *c);
extern const struct llama_vocab *erllama_safe_model_get_vocab(
    const struct llama_model *m);
extern int32_t erllama_safe_vocab_n_tokens(const struct llama_vocab *v);
extern int erllama_safe_vocab_is_eog(const struct llama_vocab *v,
                                     llama_token tok);
extern int32_t erllama_safe_tokenize(const struct llama_vocab *vocab,
                                     const char *text, int32_t text_len,
                                     llama_token *tokens, int32_t n_max,
                                     bool add_special, bool parse_special);
extern int erllama_safe_decode(struct llama_context *c,
                               struct llama_batch batch);
extern size_t erllama_safe_state_seq_get_size(struct llama_context *c,
                                              int seq_id);
extern size_t erllama_safe_state_seq_get_data(struct llama_context *c,
                                              uint8_t *dst, size_t size,
                                              int seq_id);
extern size_t erllama_safe_state_seq_set_data(struct llama_context *c,
                                              const uint8_t *src,
                                              size_t size, int seq_id);
extern int erllama_safe_memory_seq_rm(struct llama_context *c, int seq_id,
                                      int p0, int p1);
extern const char *erllama_safe_model_chat_template(const struct llama_model *m,
                                                    const char *name);
extern int32_t erllama_safe_chat_apply_template(
    const char *tmpl, const struct llama_chat_message *msgs, size_t n_msgs,
    bool add_assistant, char *buf, int32_t buf_size);
extern struct llama_sampler *erllama_safe_sampler_init_grammar(
    const struct llama_vocab *vocab, const char *grammar_str,
    const char *grammar_root);
extern struct llama_adapter_lora *erllama_safe_adapter_lora_init(
    struct llama_model *model, const char *path);
extern void erllama_safe_adapter_lora_free(struct llama_adapter_lora *a);
extern int erllama_safe_set_adapters_lora(struct llama_context *ctx,
                                          struct llama_adapter_lora **adapters,
                                          size_t n_adapters, float *scales);
extern float *erllama_safe_get_embeddings_seq(struct llama_context *c,
                                              int seq_id);
extern float *erllama_safe_get_embeddings(struct llama_context *c);
extern int32_t erllama_safe_n_embd(const struct llama_model *m);
extern int erllama_safe_set_embeddings(struct llama_context *c, bool value);

#ifndef ERLLAMA_MAX_TOKENS
/* Cap on accepted token-list inputs and tokenize output. The largest
 * practical context window today is ~10M; 1M tokens leaves plenty of
 * headroom while bounding worst-case allocations to ~4 MB and keeping
 * one bad request from tying up dirty schedulers indefinitely. */
#define ERLLAMA_MAX_TOKENS (1024 * 1024)
#endif

#ifndef ERLLAMA_MAX_TOKEN_TEXT
/* Largest text accepted by tokenize/3 (bytes). 4 MiB covers ~1 M
 * tokens at ~4 bytes each, well above any realistic chat prompt
 * while keeping a single bad request from chewing dirty-scheduler
 * time. Override at build time via -DERLLAMA_MAX_TOKEN_TEXT=N for
 * batch-tokenization workflows. */
#define ERLLAMA_MAX_TOKEN_TEXT (4 * 1024 * 1024)
#endif

/* =========================================================================
 * Atoms
 * ========================================================================= */

static ERL_NIF_TERM atom_ok;
static ERL_NIF_TERM atom_error;
static ERL_NIF_TERM atom_load_failed;
static ERL_NIF_TERM atom_context_failed;
static ERL_NIF_TERM atom_tokenize_failed;
static ERL_NIF_TERM atom_pack_failed;
static ERL_NIF_TERM atom_unpack_failed;
static ERL_NIF_TERM atom_true;
static ERL_NIF_TERM atom_false;
static ERL_NIF_TERM atom_released;
static ERL_NIF_TERM atom_too_large;
static ERL_NIF_TERM atom_invalid_token;
static ERL_NIF_TERM atom_oom;
static ERL_NIF_TERM atom_deferred;
static ERL_NIF_TERM atom_exception;
static ERL_NIF_TERM atom_no_logits;
static ERL_NIF_TERM atom_no_template;
static ERL_NIF_TERM atom_template_failed;
static ERL_NIF_TERM atom_grammar_failed;
static ERL_NIF_TERM atom_embed_failed;
static ERL_NIF_TERM atom_not_supported;

/* Forward decl: build_default_greedy_chain is defined in the sampler
 * section but used as a lazy fallback in nif_decode_one. */
static struct llama_sampler *build_default_greedy_chain(void);

/* Forward decl: adapter_dtor is defined later but registered in the
 * load callback. */
static void adapter_dtor(ErlNifEnv *env, void *obj);

/* Forward decl: sampler_dtor + the build helper, defined in the
 * sampler section. */
static void sampler_dtor(ErlNifEnv *env, void *obj);
static struct llama_sampler *build_sampler_chain_from_map(
    ErlNifEnv *env, ERL_NIF_TERM cfg, struct llama_context *ctx,
    ERL_NIF_TERM *out_err_atom);

/* =========================================================================
 * Resource types
 * ========================================================================= */

/* Per-resource mutex makes use-after-free between concurrent dirty
 * NIFs and an explicit free call impossible: every NIF entry that
 * dereferences a resource locks it, observes the pointer, and runs
 * llama under that lock; explicit frees take the same lock, so they
 * cannot interleave with a live llama call. The lock is held for the
 * duration of a llama op, but ops on different resources stay
 * independent. */
typedef struct {
    pthread_mutex_t mu;            /* serializes every llama_model_* call on this resource */
    int mu_inited;                 /* guard pthread_mutex_destroy on error path */
    struct llama_model *model;     /* NULL after successful release */
    int active_contexts;           /* nif_new_context bumps; ctx_dtor decrements */
    int release_pending;           /* free_model when active_contexts hit 0 */
} erllama_model_t;

typedef struct {
    pthread_mutex_t mu;            /* serializes every llama_context_* call on this resource */
    int mu_inited;                 /* guard pthread_mutex_destroy on error path */
    struct llama_context *ctx;     /* NULL after successful release */
    erllama_model_t *model_res;    /* keep_resource'd by new_context */
    int decode_ready;              /* set after llama_decode; cleared after kv ops */
    /* Sampler chain cached on the first nif_decode_one call. The
     * chain is greedy-only and lives for the resource's lifetime;
     * a future sampler-config NIF would free + rebuild this under
     * the resource lock. */
    struct llama_sampler *smpl;
} erllama_context_t;

/* LoRA adapter resource. The adapter is bound to a model and stays
 * valid until the model is freed or adapter_lora_free is called
 * explicitly. The wrapping resource holds a keep-reference on its
 * model_res so the underlying llama_model* outlives the adapter even
 * if the user free_model's it. */
typedef struct {
    pthread_mutex_t mu;                 /* serializes adapter use vs. explicit free */
    int mu_inited;                      /* guard pthread_mutex_destroy on error path */
    struct llama_adapter_lora *adapter; /* NULL after explicit free */
    erllama_model_t *model_res;         /* keep_resource'd at init */
} erllama_adapter_t;

/* Sampler chain resource. Owned independently from the context so
 * multi-seq batching (v0.2+) can hold one chain per in-flight
 * request without contending on the context's cached `c->smpl`.
 * The chain is built from the same config map configure_sampler/2
 * consumes; freed explicitly via sampler_free/1 or implicitly by
 * the dtor. */
typedef struct {
    pthread_mutex_t mu;            /* serializes chain use vs. explicit free */
    int mu_inited;                 /* guard pthread_mutex_destroy on error path */
    struct llama_sampler *chain;   /* NULL after explicit free */
    erllama_context_t *ctx_res;    /* keep_resource'd at init */
} erllama_sampler_t;

static ErlNifResourceType *MODEL_RT;
static ErlNifResourceType *CTX_RT;
static ErlNifResourceType *ADAPTER_RT;
static ErlNifResourceType *SAMPLER_RT;

/* Called when a context releases its hold on the model resource.
 * Decrements the live-context count and, if an earlier free_model/1
 * answered {ok, deferred} and this was the last context, performs the
 * real llama_model_free. Both the decision and the free run with the
 * mutex held, so two contexts tearing down concurrently cannot both
 * free the model, and no reader can observe the pointer mid-free. */
static void context_drops_model(erllama_model_t *m) {
    pthread_mutex_lock(&m->mu);
    if (m->active_contexts > 0) m->active_contexts--;
    int was_last = (m->active_contexts == 0);
    if (was_last && m->release_pending && m->model != NULL) {
        (void) erllama_safe_model_free(m->model);
        m->model = NULL;
        m->release_pending = 0;
    }
    pthread_mutex_unlock(&m->mu);
}

/* Resource destructors run when the BEAM has no remaining references.
 * They must tolerate partial init: if alloc succeeded but mutex_init
 * failed, the dtor sees mu_inited=0 and skips pthread_mutex_destroy.
 * Pointer fields are zero-init'd by the allocation path so freeing a
 * NULL is a no-op here.
 *
 * Two accepted tradeoffs callers should know about:
 *
 *  1. A throwing llama destructor leaks the native object. C++
 *     destructors are required to be `noexcept`; if one throws
 *     anyway, the safe wrapper catches the exception and returns
 *     -1 but we still NULL the pointer so the destructor cannot
 *     be called twice. The native model/context is leaked rather
 *     than risking UB. Fix lives upstream in llama.cpp.
 *
 *  2. GC-triggered dtors run on the scheduler thread that
 *     triggered GC, not on a dirty scheduler. For prompt cleanup
 *     of a multi-MB model, callers should prefer
 *     `erllama:unload/1` (which terminates the per-model
 *     gen_statem and goes through `nif_free_context` -- a dirty
 *     CPU NIF) over relying on Erlang GC to destruct the
 *     resource. */
/* BEAM-invoked destructor for a model resource; runs once all Erlang
 * references are gone. Tolerates partial init: each cleanup step is
 * individually guarded, so a resource whose mutex never initialized
 * (mu_inited=0) or whose model was already released (model=NULL)
 * skips the corresponding teardown. */
static void model_dtor(ErlNifEnv *env, void *obj) {
    (void) env;
    erllama_model_t *res = (erllama_model_t *) obj;
    if (res->model != NULL) {
        /* Explicit release paths NULL the pointer, so reaching here
         * means the native model is still live and this is the one
         * and only free. */
        (void) erllama_safe_model_free(res->model);
        res->model = NULL;
    }
    if (res->mu_inited) {
        res->mu_inited = 0;
        pthread_mutex_destroy(&res->mu);
    }
}

/* BEAM-invoked destructor for a context resource. Teardown order:
 * the cached sampler chain first, then the llama_context, then the
 * keep-reference on the model (which may complete a deferred model
 * free via context_drops_model). Tolerates partial init: every field
 * is guarded and mu_inited=0 means the mutex was never created. */
static void ctx_dtor(ErlNifEnv *env, void *obj) {
    (void) env;
    erllama_context_t *res = (erllama_context_t *) obj;
    if (res->smpl != NULL) {
        (void) erllama_safe_sampler_free(res->smpl);
        res->smpl = NULL;
    }
    if (res->ctx != NULL) {
        (void) erllama_safe_free(res->ctx);
        res->ctx = NULL;
    }
    if (res->model_res != NULL) {
        context_drops_model(res->model_res);
        enif_release_resource(res->model_res);
        res->model_res = NULL;
    }
    if (res->mu_inited) {
        res->mu_inited = 0;
        pthread_mutex_destroy(&res->mu);
    }
}

/* BEAM-invoked destructor for a sampler-chain resource. The chain
 * pointer may already be NULL after an explicit sampler_free call,
 * in which case only the keep-reference on the owning context is
 * dropped. Guards each step so partial init (mu_inited=0) is safe. */
static void sampler_dtor(ErlNifEnv *env, void *obj) {
    (void) env;
    erllama_sampler_t *res = (erllama_sampler_t *) obj;
    if (res->chain != NULL) {
        (void) erllama_safe_sampler_free(res->chain);
        res->chain = NULL;
    }
    if (res->ctx_res != NULL) {
        enif_release_resource(res->ctx_res);
        res->ctx_res = NULL;
    }
    if (res->mu_inited) {
        res->mu_inited = 0;
        pthread_mutex_destroy(&res->mu);
    }
}

/* BEAM-invoked destructor for a LoRA adapter resource. An explicit
 * nif_adapter_free zeroes a->adapter under the lock beforehand, so
 * this is either a no-op on the adapter itself or the final implicit
 * cleanup. Either way the keep-reference on the model is released.
 * Guards each step so partial init (mu_inited=0) is safe. */
static void adapter_dtor(ErlNifEnv *env, void *obj) {
    (void) env;
    erllama_adapter_t *res = (erllama_adapter_t *) obj;
    if (res->adapter != NULL) {
        erllama_safe_adapter_lora_free(res->adapter);
        res->adapter = NULL;
    }
    if (res->model_res != NULL) {
        enif_release_resource(res->model_res);
        res->model_res = NULL;
    }
    if (res->mu_inited) {
        res->mu_inited = 0;
        pthread_mutex_destroy(&res->mu);
    }
}

/* =========================================================================
 * Load callback
 * ========================================================================= */

/* NIF load callback: initializes the CRC32C kernel, interns atoms,
 * and registers the four resource types. Returns 0 on success, -1 on
 * any failure (which aborts loading the NIF library).
 *
 * llama_backend_init() is intentionally NOT called here; it is
 * deferred to first model load via erllama_safe_backend_init_once().
 * Cache-only and cache-test workloads therefore never invoke
 * ggml_backend_load_all, which on some platforms (notably FreeBSD
 * when paired with another NIF that uses mmap and signal handlers)
 * perturbs process state in ways that break unrelated code paths. */
static int load(ErlNifEnv *env, void **priv_data, ERL_NIF_TERM load_info) {
    (void) priv_data;
    (void) load_info;

    if (erllama_crc32c_init() != 0) {
        return -1;
    }

    atom_ok = enif_make_atom(env, "ok");
    atom_error = enif_make_atom(env, "error");
    atom_load_failed = enif_make_atom(env, "load_failed");
    atom_context_failed = enif_make_atom(env, "context_failed");
    atom_tokenize_failed = enif_make_atom(env, "tokenize_failed");
    atom_pack_failed = enif_make_atom(env, "pack_failed");
    atom_unpack_failed = enif_make_atom(env, "unpack_failed");
    atom_true = enif_make_atom(env, "true");
    atom_false = enif_make_atom(env, "false");
    atom_released = enif_make_atom(env, "released");
    atom_too_large = enif_make_atom(env, "too_large");
    atom_invalid_token = enif_make_atom(env, "invalid_token");
    atom_oom = enif_make_atom(env, "oom");
    atom_deferred = enif_make_atom(env, "deferred");
    atom_exception = enif_make_atom(env, "exception");
    atom_no_logits = enif_make_atom(env, "no_logits");
    atom_no_template = enif_make_atom(env, "no_template");
    atom_template_failed = enif_make_atom(env, "template_failed");
    atom_grammar_failed = enif_make_atom(env, "grammar_failed");
    atom_embed_failed = enif_make_atom(env, "embed_failed");
    atom_not_supported = enif_make_atom(env, "not_supported");

    /* ERL_NIF_RT_TAKEOVER in addition to CREATE: on a NIF reload /
     * code upgrade the resource types already exist, and opening an
     * existing type with only CREATE returns NULL, aborting the whole
     * load. TAKEOVER lets the reloaded module adopt the existing
     * types (and their live resources) instead -- matching the
     * reload scenario the unload callback already anticipates. */
    ErlNifResourceFlags rt_flags =
        (ErlNifResourceFlags) (ERL_NIF_RT_CREATE | ERL_NIF_RT_TAKEOVER);

    MODEL_RT = enif_open_resource_type(
        env, NULL, "erllama_model", model_dtor, rt_flags, NULL);
    if (!MODEL_RT) {
        return -1;
    }

    CTX_RT = enif_open_resource_type(
        env, NULL, "erllama_context", ctx_dtor, rt_flags, NULL);
    if (!CTX_RT) {
        return -1;
    }

    ADAPTER_RT = enif_open_resource_type(
        env, NULL, "erllama_adapter", adapter_dtor, rt_flags, NULL);
    if (!ADAPTER_RT) {
        return -1;
    }

    SAMPLER_RT = enif_open_resource_type(
        env, NULL, "erllama_sampler", sampler_dtor, rt_flags, NULL);
    if (!SAMPLER_RT) {
        return -1;
    }

    return 0;
}

/* Module unload hook: releases global llama backend state so that a
 * reload of the NIF (hot code upgrade, repeated test runs) starts
 * clean. Safe to call even if backend_init_once never fired --
 * llama_backend_free degrades to a no-op in that case. */
static void unload(ErlNifEnv *env, void *priv_data) {
    (void) priv_data;
    (void) env;
    (void) erllama_safe_backend_free();
}

/* =========================================================================
 * Helpers
 * ========================================================================= */

/* Flatten an iodata term into a NUL-terminated C path in `out`.
 * Returns 1 on success, 0 on any invalid input: non-iodata terms,
 * empty paths, paths that do not fit in `cap` bytes (terminator
 * included), and paths containing an embedded NUL. The NUL check
 * matters because an Erlang binary such as <<"real\0ignored">>
 * would otherwise be silently cut short by C string APIs. */
static int copy_path(ErlNifEnv *env, ERL_NIF_TERM term, char *out, size_t cap) {
    ErlNifBinary bin;
    if (!enif_inspect_iolist_as_binary(env, term, &bin)) {
        return 0;
    }
    if (bin.size == 0 || bin.size >= cap) {
        return 0;
    }
    if (memchr(bin.data, '\0', bin.size) != NULL) {
        return 0;
    }
    memcpy(out, bin.data, bin.size);
    out[bin.size] = '\0';
    return 1;
}

/* Fetch `key` from `map` as an unsigned int, rejecting anything that
 * does not fit in a signed 32-bit value. llama.cpp option fields
 * (n_gpu_layers, n_threads, ...) are int32_t, so larger values would
 * wrap if cast blindly. Returns 1 and writes *out on success; 0 when
 * the key is absent, not an unsigned int, or above INT32_MAX. */
static int get_map_int31(
    ErlNifEnv *env, ERL_NIF_TERM map, const char *key, int32_t *out
) {
    ERL_NIF_TERM val;
    unsigned int raw;
    if (!enif_get_map_value(env, map, enif_make_atom(env, key), &val)) {
        return 0;
    }
    if (!enif_get_uint(env, val, &raw) || raw > (unsigned int) INT32_MAX) {
        return 0;
    }
    *out = (int32_t) raw;
    return 1;
}

/* Fetch `key` from `map` as an unsigned int. Returns 1 and writes
 * *out on success; 0 when the key is absent or not an unsigned int. */
static int get_map_uint(
    ErlNifEnv *env, ERL_NIF_TERM map, const char *key, unsigned int *out
) {
    ERL_NIF_TERM val;
    if (!enif_get_map_value(env, map, enif_make_atom(env, key), &val)) {
        return 0;
    }
    return enif_get_uint(env, val, out);
}

/* Fetch `key` from `map` as a double, accepting either an Erlang
 * float directly or an integer promoted to double. This lets callers
 * write `temperature => 0` and `temperature => 0.7` interchangeably.
 * Returns 1 and writes *out on success; 0 when the key is absent or
 * not numeric. */
static int get_map_double(
    ErlNifEnv *env, ERL_NIF_TERM map, const char *key, double *out
) {
    ERL_NIF_TERM val;
    long as_long;
    if (!enif_get_map_value(env, map, enif_make_atom(env, key), &val)) {
        return 0;
    }
    if (enif_get_double(env, val, out)) {
        return 1;
    }
    if (enif_get_long(env, val, &as_long)) {
        *out = (double) as_long;
        return 1;
    }
    return 0;
}

/* Fetch `key` from `map` as a boolean atom. Returns 1 and writes 1/0
 * into *out for the atoms 'true'/'false'; returns 0 when the key is
 * absent or mapped to anything else. */
static int get_map_bool(
    ErlNifEnv *env, ERL_NIF_TERM map, const char *key, int *out
) {
    ERL_NIF_TERM val;
    if (!enif_get_map_value(env, map, enif_make_atom(env, key), &val)) {
        return 0;
    }
    if (enif_compare(val, atom_true) == 0) {
        *out = 1;
        return 1;
    }
    if (enif_compare(val, atom_false) == 0) {
        *out = 0;
        return 1;
    }
    return 0;
}

/* =========================================================================
 * crc32c
 * ========================================================================= */

/* crc32c(IoData) -> non_neg_integer(). Dirty-CPU NIF: flattens the
 * iodata argument and folds it through the CRC32C kernel starting
 * from seed 0. Badarg on non-iodata input. */
static ERL_NIF_TERM nif_crc32c(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    ErlNifBinary input;
    (void) argc;
    if (!enif_inspect_iolist_as_binary(env, argv[0], &input)) {
        return enif_make_badarg(env);
    }
    return enif_make_uint(env, erllama_crc32c_update(0, input.data, input.size));
}

/* =========================================================================
 * Model
 * ========================================================================= */

/* load_model(Path, Opts) -> {ok, ModelRes} | {error, Reason}.
 * Dirty-IO NIF. Lazily initializes the llama backend on first use,
 * translates the option map (n_gpu_layers, use_mmap, use_mlock,
 * vocab_only; absent or mistyped keys keep llama defaults) into
 * llama_model_params, loads the model file, and wraps the native
 * pointer in a mutex-guarded resource owned by the BEAM. */
static ERL_NIF_TERM nif_load_model(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    char path[4097];
    int flag;
    int32_t layers;

    if (!copy_path(env, argv[0], path, sizeof(path))) {
        return enif_make_badarg(env);
    }
    if (!enif_is_map(env, argv[1])) {
        return enif_make_badarg(env);
    }

    if (erllama_safe_backend_init_once() != 0) {
        return enif_make_tuple2(env, atom_error, atom_load_failed);
    }

    struct llama_model_params mp = llama_model_default_params();
    if (get_map_int31(env, argv[1], "n_gpu_layers", &layers)) {
        mp.n_gpu_layers = layers;
    }
    if (get_map_bool(env, argv[1], "use_mmap", &flag)) mp.use_mmap = (flag != 0);
    if (get_map_bool(env, argv[1], "use_mlock", &flag)) mp.use_mlock = (flag != 0);
    if (get_map_bool(env, argv[1], "vocab_only", &flag)) mp.vocab_only = (flag != 0);

    struct llama_model *native = erllama_safe_model_load_from_file(path, mp);
    if (native == NULL) {
        return enif_make_tuple2(env, atom_error, atom_load_failed);
    }

    erllama_model_t *wrap = enif_alloc_resource(MODEL_RT, sizeof(*wrap));
    if (wrap == NULL) {
        (void) erllama_safe_model_free(native);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    /* Zero first: if mutex init fails below, the destructor must see
     * model=NULL and mu_inited=0 and skip the dangerous frees. */
    memset(wrap, 0, sizeof(*wrap));
    if (pthread_mutex_init(&wrap->mu, NULL) != 0) {
        enif_release_resource(wrap);
        (void) erllama_safe_model_free(native);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    wrap->mu_inited = 1;
    wrap->model = native;

    ERL_NIF_TERM handle = enif_make_resource(env, wrap);
    enif_release_resource(wrap);
    return enif_make_tuple2(env, atom_ok, handle);
}

/* free_model/1 returns:
 *   ok                 -> released; subsequent ops on the term return error
 *   {ok, deferred}     -> contexts still hold this model; release flagged.
 *                         The last context destruction performs the actual
 *                         llama_model_free under context_drops_model.
 *   {error, released}  -> already released
 *
 * Taking the per-resource lock blocks until any concurrent dirty NIF
 * using this model finishes, which is intentional: a free can never
 * interleave with a live llama_model_* call. */
static ERL_NIF_TERM nif_free_model(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_model_t *m;
    if (!enif_get_resource(env, argv[0], MODEL_RT, (void **) &m)) {
        return enif_make_badarg(env);
    }

    pthread_mutex_lock(&m->mu);

    if (m->model == NULL) {
        /* Second (or later) free: the native object is already gone. */
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }

    if (m->active_contexts > 0) {
        /* Live contexts still reference the model; flag the release
         * and let the final context_drops_model carry it out. */
        m->release_pending = 1;
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_ok, atom_deferred);
    }

    /* Free while holding the lock so no concurrent reader observes a
     * mid-free m->model. The pointer is NULLed regardless of the
     * wrapper's verdict: retrying llama_model_free on a freed pointer
     * is a double-free, and llama destructors should be noexcept
     * anyway -- if one throws we leak the native object over UB. */
    int rc = erllama_safe_model_free(m->model);
    m->model = NULL;
    m->release_pending = 0;
    pthread_mutex_unlock(&m->mu);

    return (rc == 0) ? atom_ok
                     : enif_make_tuple2(env, atom_error, atom_exception);
}

/* =========================================================================
 * Context
 * ========================================================================= */

/* new_context(ModelRes, Opts) -> {ok, CtxRes} | {error, Reason}.
 * Dirty-CPU NIF. Builds llama_context_params from the option map
 * (absent or mistyped keys silently keep llama defaults), then
 * creates the context while holding the model lock so the model
 * cannot be freed mid-call. On success it bumps active_contexts and
 * takes a keep-reference so the model resource outlives the context. */
static ERL_NIF_TERM nif_new_context(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_model_t *m;
    if (!enif_get_resource(env, argv[0], MODEL_RT, (void **) &m)) {
        return enif_make_badarg(env);
    }
    if (!enif_is_map(env, argv[1])) {
        return enif_make_badarg(env);
    }

    struct llama_context_params params = llama_context_default_params();

    /* Option map -> params. Each getter returns 0 for absent or
     * wrong-typed keys, leaving the llama default untouched. */
    unsigned int u;
    if (get_map_uint(env, argv[1], "n_ctx", &u)) params.n_ctx = (uint32_t) u;
    if (get_map_uint(env, argv[1], "n_batch", &u)) params.n_batch = (uint32_t) u;
    if (get_map_uint(env, argv[1], "n_ubatch", &u)) params.n_ubatch = (uint32_t) u;
    if (get_map_uint(env, argv[1], "n_seq_max", &u)) params.n_seq_max = (uint32_t) u;
    int32_t i32;
    if (get_map_int31(env, argv[1], "n_threads", &i32)) params.n_threads = i32;
    if (get_map_int31(env, argv[1], "n_threads_batch", &i32)) {
        params.n_threads_batch = i32;
    }
    int b;
    if (get_map_bool(env, argv[1], "embeddings", &b)) params.embeddings = b ? true : false;
    if (get_map_bool(env, argv[1], "offload_kqv", &b)) params.offload_kqv = b ? true : false;

    pthread_mutex_lock(&m->mu);
    if (!m->model) {
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    /* If free_model/1 has been called and is waiting for the last
     * context to drop, do not let a new caller resurrect the model
     * by attaching another context. The {ok, deferred} return is
     * a release contract: no new contexts allowed past that point. */
    if (m->release_pending) {
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    /* Context creation runs under the model lock: this blocks other
     * NIFs on the same model for the duration, but guarantees the
     * llama_model* cannot be freed mid-init. */
    struct llama_context *ctx = erllama_safe_init_from_model(m->model, params);
    if (!ctx) {
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_context_failed);
    }
    erllama_context_t *res = enif_alloc_resource(CTX_RT, sizeof(*res));
    if (!res) {
        (void) erllama_safe_free(ctx);
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    /* Zero-init so the destructor on a partially-set-up resource sees
     * NULL pointers and mu_inited=0 and skips the dangerous frees. */
    memset(res, 0, sizeof(*res));
    if (pthread_mutex_init(&res->mu, NULL) != 0) {
        enif_release_resource(res);
        (void) erllama_safe_free(ctx);
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    res->mu_inited = 1;
    res->ctx = ctx;
    res->model_res = m;
    /* Bump the live-context count and take a keep-reference so the
     * model resource outlives this context; both are undone in
     * ctx_dtor / nif_free_context via context_drops_model. */
    m->active_contexts++;
    enif_keep_resource(m);
    pthread_mutex_unlock(&m->mu);

    ERL_NIF_TERM term = enif_make_resource(env, res);
    enif_release_resource(res);
    return enif_make_tuple2(env, atom_ok, term);
}

/* free_context/1 -> ok | {error, released} | {error, exception}.
 * Tears down the cached sampler chain and the llama_context under
 * the resource lock, then drops the model keep-reference outside it. */
static ERL_NIF_TERM nif_free_context(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c)) {
        return enif_make_badarg(env);
    }

    pthread_mutex_lock(&c->mu);
    if (c->ctx == NULL) {
        pthread_mutex_unlock(&c->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }

    /* Free under the lock so a concurrent NIF cannot observe a
     * half-dead resource. Pointers are NULLed unconditionally: if
     * the safe wrapper reports a thrown destructor we leak the
     * native object rather than leave a double-free path open. */
    if (c->smpl != NULL) {
        (void) erllama_safe_sampler_free(c->smpl);
        c->smpl = NULL;
    }
    int rc = erllama_safe_free(c->ctx);
    c->ctx = NULL;
    c->decode_ready = 0;

    erllama_model_t *owner = c->model_res;
    c->model_res = NULL;
    pthread_mutex_unlock(&c->mu);

    /* Outside the context lock: decrement the model's live-context
     * count (this may perform a deferred model free) and release the
     * keep-reference taken by new_context. */
    if (owner != NULL) {
        context_drops_model(owner);
        enif_release_resource(owner);
    }

    return (rc == 0) ? atom_ok
                     : enif_make_tuple2(env, atom_error, atom_exception);
}

/* =========================================================================
 * Tokenize
 * ========================================================================= */

/* tokenize(ModelRes, Text, Opts) -> [token_id()] | {error, Reason}.
 * Dirty-CPU NIF. Opts (booleans): add_special (default true) and
 * parse_special (default false), forwarded to llama tokenization.
 * Tokenizes under the model lock with a guess-then-retry buffer:
 * the first pass is sized from the text length; if llama reports the
 * buffer was too small (negative return = -needed), a second
 * exactly-sized pass runs. */
static ERL_NIF_TERM nif_tokenize(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_model_t *m;
    if (!enif_get_resource(env, argv[0], MODEL_RT, (void **) &m)) {
        return enif_make_badarg(env);
    }
    ErlNifBinary text;
    if (!enif_inspect_iolist_as_binary(env, argv[1], &text)) {
        return enif_make_badarg(env);
    }
    if (text.size > ERLLAMA_MAX_TOKEN_TEXT) {
        return enif_make_tuple2(env, atom_error, atom_too_large);
    }
    if (!enif_is_map(env, argv[2])) {
        return enif_make_badarg(env);
    }

    int add_special = 1;
    int parse_special = 0;
    int b;
    if (get_map_bool(env, argv[2], "add_special", &b)) add_special = b;
    if (get_map_bool(env, argv[2], "parse_special", &b)) parse_special = b;

    pthread_mutex_lock(&m->mu);
    if (!m->model || m->release_pending) {
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    const struct llama_vocab *vocab = erllama_safe_model_get_vocab(m->model);
    if (!vocab) {
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }

    /* First-pass buffer guess: roughly one token per byte plus a
     * little slack, clamped to [16, ERLLAMA_MAX_TOKENS]. The cast is
     * safe because text.size <= ERLLAMA_MAX_TOKEN_TEXT (checked). */
    int32_t text_len = (int32_t) text.size;
    int32_t n_max = text_len + 8;
    if (n_max < 16) n_max = 16;
    if (n_max > ERLLAMA_MAX_TOKENS) n_max = ERLLAMA_MAX_TOKENS;

    llama_token *tokens = (llama_token *) enif_alloc(sizeof(llama_token) * (size_t) n_max);
    if (!tokens) {
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    int32_t n = erllama_safe_tokenize(
        vocab, (const char *) text.data, text_len, tokens,
        n_max, add_special ? true : false, parse_special ? true : false);
    /* INT32_MIN is the safe wrapper's "C++ exception thrown" sentinel;
     * any other negative value is llama's -needed buffer size. */
    if (n == INT32_MIN) {
        enif_free(tokens);
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    if (n < 0) {
        int32_t needed = -n;
        if (needed > ERLLAMA_MAX_TOKENS) {
            enif_free(tokens);
            pthread_mutex_unlock(&m->mu);
            return enif_make_tuple2(env, atom_error, atom_too_large);
        }
        /* Retry once with an exactly-sized buffer. */
        enif_free(tokens);
        tokens = (llama_token *) enif_alloc(sizeof(llama_token) * (size_t) needed);
        if (!tokens) {
            pthread_mutex_unlock(&m->mu);
            return enif_make_tuple2(env, atom_error, atom_oom);
        }
        n = erllama_safe_tokenize(
            vocab, (const char *) text.data, text_len, tokens,
            needed, add_special ? true : false, parse_special ? true : false);
    }
    pthread_mutex_unlock(&m->mu);
    if (n == INT32_MIN) {
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    if (n < 0) {
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_tokenize_failed);
    }

    /* Build the Erlang list back-to-front so it comes out in order. */
    ERL_NIF_TERM list = enif_make_list(env, 0);
    for (int32_t i = n - 1; i >= 0; i--) {
        list = enif_make_list_cell(env, enif_make_int(env, tokens[i]), list);
    }
    enif_free(tokens);
    return list;
}

/* =========================================================================
 * KV pack / unpack
 *
 * The 3-arg signatures preserve the v0.1 stub API. The Tokens and
 * NTokens / SeqId positional args are interpreted as documented in
 * include/llama.h: NTokens is unused (the in-memory API saves the
 * full state for the configured seq_id, defaulting to 0); SeqId is
 * the destination sequence id for unpack.
 * ========================================================================= */

static ERL_NIF_TERM nif_kv_pack(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    erllama_context_t *ctx_res;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &ctx_res)) {
        return enif_make_badarg(env);
    }
    /* Tokens (argv[1]) is informational and NTokens (argv[2]) is
     * ignored: the model layer must have prefilled exactly the
     * desired prefix before calling kv_pack. The optional fourth
     * argument (arity 4) names the sequence to extract from; the
     * default of 0 keeps existing 3-arity callers working. */
    llama_seq_id seq = 0;
    if (argc == 4) {
        int raw;
        if (!enif_get_int(env, argv[3], &raw) || raw < 0) {
            return enif_make_badarg(env);
        }
        seq = (llama_seq_id) raw;
    }

    pthread_mutex_lock(&ctx_res->mu);
    if (ctx_res->ctx == NULL) {
        pthread_mutex_unlock(&ctx_res->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }

    /* Size the save blob first; SIZE_MAX is the safe wrapper's
     * thrown-C++-exception sentinel. */
    size_t need = erllama_safe_state_seq_get_size(ctx_res->ctx, seq);
    if (need == SIZE_MAX) {
        pthread_mutex_unlock(&ctx_res->mu);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    if (need == 0) {
        /* Nothing saved for this sequence: hand back an empty binary. */
        pthread_mutex_unlock(&ctx_res->mu);
        ErlNifBinary empty;
        if (!enif_alloc_binary(0, &empty)) {
            return enif_make_tuple2(env, atom_error, atom_oom);
        }
        return enif_make_binary(env, &empty);
    }

    ErlNifBinary out;
    if (!enif_alloc_binary(need, &out)) {
        pthread_mutex_unlock(&ctx_res->mu);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    size_t written = erllama_safe_state_seq_get_data(
        ctx_res->ctx, out.data, out.size, seq);
    pthread_mutex_unlock(&ctx_res->mu);

    if (written == SIZE_MAX) {
        enif_release_binary(&out);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    if (written == 0 || written > need) {
        enif_release_binary(&out);
        return enif_make_tuple2(env, atom_error, atom_pack_failed);
    }
    /* Trim a short write down to its real size before handing the
     * binary to the VM. */
    if (written < need && !enif_realloc_binary(&out, written)) {
        enif_release_binary(&out);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    return enif_make_binary(env, &out);
}

/* nif_kv_unpack(Ctx, Binary, SeqId): restore a kv_pack blob into the
 * destination sequence. Returns ok, {error, released}, {error,
 * exception} on a thrown C++ exception inside the restore, or
 * {error, unpack_failed} on a short/failed restore. */
static ERL_NIF_TERM nif_kv_unpack(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c)) {
        return enif_make_badarg(env);
    }
    ErlNifBinary in;
    if (!enif_inspect_binary(env, argv[1], &in)) {
        return enif_make_badarg(env);
    }
    int seq_id;
    if (!enif_get_int(env, argv[2], &seq_id) || seq_id < 0) {
        return enif_make_badarg(env);
    }
    pthread_mutex_lock(&c->mu);
    if (!c->ctx) {
        pthread_mutex_unlock(&c->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    size_t consumed = erllama_safe_state_seq_set_data(
        c->ctx, in.data, in.size, seq_id);
    /* kv_unpack only restores KV cells, not the per-context logits
     * buffer; the model layer must drop the last cell and re-prefill
     * it before the next sample. Mark the context as not ready until
     * that primer runs. */
    c->decode_ready = 0;
    pthread_mutex_unlock(&c->mu);
    /* Distinguish the safe wrapper's thrown-C++-exception sentinel
     * (SIZE_MAX, mirroring the kv_pack path above) from an ordinary
     * short or failed restore. */
    if (consumed == SIZE_MAX) {
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    if (consumed == 0 || consumed != in.size) {
        return enif_make_tuple2(env, atom_error, atom_unpack_failed);
    }
    return atom_ok;
}

/* nif_kv_seq_rm(Ctx, SeqId, P0, P1): remove the KV cells in [p0, p1)
 * from the given sequence. Per llama.h memory semantics, p0 < 0 means
 * 0 and p1 < 0 means infinity. Returns ok on success or
 * {error, unpack_failed} when the underlying removal fails (this
 * reuses the unpack error atom rather than a dedicated one).
 *
 * The save format only stores KV cells; the per-context logits
 * buffer is not restored. So after kv_unpack the model layer drops
 * the last cell of the saved sequence and re-prefills the
 * corresponding token to regenerate logits for the next sample. */
static ERL_NIF_TERM nif_kv_seq_rm(ErlNifEnv *env, int argc,
                                  const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c)) {
        return enif_make_badarg(env);
    }
    int seq_id, p0, p1;
    if (!enif_get_int(env, argv[1], &seq_id) || seq_id < 0 ||
        !enif_get_int(env, argv[2], &p0) ||
        !enif_get_int(env, argv[3], &p1)) {
        return enif_make_badarg(env);
    }
    pthread_mutex_lock(&c->mu);
    if (!c->ctx) {
        pthread_mutex_unlock(&c->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    int rc = erllama_safe_memory_seq_rm(c->ctx, seq_id, p0, p1);
    /* Removing cells invalidates last-decode logits; force a fresh
     * prefill before the next sample. */
    c->decode_ready = 0;
    pthread_mutex_unlock(&c->mu);
    if (rc != 0) {
        return enif_make_tuple2(env, atom_error, atom_unpack_failed);
    }
    return atom_ok;
}

/* =========================================================================
 * Prefill / decode_one / detokenize
 * ========================================================================= */

/* read_token_list: parse an Erlang list of non-negative ints into a
 * freshly enif_alloc'ed llama_token array (caller frees *out).
 * Returns 1 on success, 0 on badarg, -1 on oom, -2 when the list
 * exceeds ERLLAMA_MAX_TOKENS, -3 on a negative token value. An empty
 * list succeeds with *out = NULL and *out_len = 0. */
static int read_token_list(ErlNifEnv *env, ERL_NIF_TERM list,
                           llama_token **out, int32_t *out_len) {
    unsigned int len;
    if (!enif_get_list_length(env, list, &len)) {
        return 0;
    }
    if (len > (unsigned int) ERLLAMA_MAX_TOKENS) {
        return -2;
    }
    if (len == 0) {
        *out = NULL;
        *out_len = 0;
        return 1;
    }
    llama_token *buf = enif_alloc(sizeof(*buf) * (size_t) len);
    if (buf == NULL) {
        return -1;
    }
    unsigned int pos = 0;
    ERL_NIF_TERM head;
    ERL_NIF_TERM cell = list;
    while (enif_get_list_cell(env, cell, &head, &cell)) {
        int value;
        if (!enif_get_int(env, head, &value)) {
            enif_free(buf);
            return 0;
        }
        if (value < 0) {
            enif_free(buf);
            return -3;
        }
        buf[pos++] = (llama_token) value;
    }
    *out = buf;
    *out_len = (int32_t) len;
    return 1;
}

/* Map a read_token_list failure code onto the corresponding NIF error
 * term; any unrecognised code (including 0) becomes badarg. */
static ERL_NIF_TERM token_list_error(ErlNifEnv *env, int rc) {
    if (rc == -1) {
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    if (rc == -2) {
        return enif_make_tuple2(env, atom_error, atom_too_large);
    }
    if (rc == -3) {
        return enif_make_tuple2(env, atom_error, atom_invalid_token);
    }
    return enif_make_badarg(env);
}

/* nif_prefill(Ctx, Tokens): decode the whole token list as one batch
 * so the KV cache holds the prefix and the final token's logits are
 * available for sampling (sets decode_ready). Returns ok; badarg on
 * a malformed list; {error, released | exception | invalid_token |
 * oom | too_large}; or {error, Int} carrying llama_decode's raw
 * non-zero return code. An empty token list is a no-op returning ok. */
static ERL_NIF_TERM nif_prefill(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c)) {
        return enif_make_badarg(env);
    }
    llama_token *tokens = NULL;
    int32_t n = 0;
    int rc = read_token_list(env, argv[1], &tokens, &n);
    if (rc != 1) return token_list_error(env, rc);
    if (n == 0) {
        /* Empty prefix: nothing to decode; read_token_list leaves
         * tokens NULL in this case. */
        if (tokens) enif_free(tokens);
        return atom_ok;
    }
    pthread_mutex_lock(&c->mu);
    if (!c->ctx) {
        pthread_mutex_unlock(&c->mu);
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    /* Validate token IDs against the model vocab before handing them
     * to llama_decode. An out-of-range positive ID would otherwise
     * reach `id_to_token.at(id)` deep inside llama and throw a C++
     * exception across the C ABI. */
    const struct llama_model *model = erllama_safe_get_model(c->ctx);
    const struct llama_vocab *vocab =
        model ? erllama_safe_model_get_vocab(model) : NULL;
    int32_t n_vocab = vocab ? erllama_safe_vocab_n_tokens(vocab) : 0;
    /* Fail closed if the vocab lookup failed: without n_vocab we
     * cannot validate token IDs, and an out-of-range positive ID
     * would reach `id_to_token.at(id)` deep inside llama and throw
     * a C++ exception across the C ABI. */
    if (n_vocab <= 0) {
        pthread_mutex_unlock(&c->mu);
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    /* Negative IDs were already rejected by read_token_list, so this
     * upper-bound check completes the range validation. */
    for (int32_t i = 0; i < n; i++) {
        if (tokens[i] >= n_vocab) {
            pthread_mutex_unlock(&c->mu);
            enif_free(tokens);
            return enif_make_tuple2(env, atom_error, atom_invalid_token);
        }
    }
    struct llama_batch batch = llama_batch_get_one(tokens, n);
    int dr = erllama_safe_decode(c->ctx, batch);
    if (dr == 0) c->decode_ready = 1;
    pthread_mutex_unlock(&c->mu);
    enif_free(tokens);
    if (dr == ERLLAMA_DECODE_EXC_SENTINEL) {
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    if (dr != 0) {
        /* Surface llama_decode's documented non-zero codes verbatim
         * so the model layer can tell 1 (no KV slot) from -1 etc. */
        return enif_make_tuple2(env, atom_error, enif_make_int(env, dr));
    }
    return atom_ok;
}

/* nif_decode_one(Ctx): sample one token from the context's cached
 * sampler chain, then immediately decode it so the next call has
 * fresh logits. Returns {ok, Token}, {eog, Token} when the vocab
 * marks the token end-of-generation, {error, released | no_logits |
 * oom | exception}, or {error, Int} with llama_decode's raw non-zero
 * return. Requires a prior successful decode (decode_ready set by
 * nif_prefill or by this function itself). */
static ERL_NIF_TERM nif_decode_one(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c)) {
        return enif_make_badarg(env);
    }
    pthread_mutex_lock(&c->mu);
    if (!c->ctx) {
        pthread_mutex_unlock(&c->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    /* `llama_sampler_sample` -> `llama_get_logits_ith` aborts via
     * GGML_ASSERT(logits != nullptr) when no decode has produced
     * sample-able logits yet. We can't catch that abort, so we refuse
     * to call sampler_sample unless the last successful op was a
     * decode (set by nif_prefill / by ourselves below). kv_unpack and
     * kv_seq_rm clear the flag; the model layer must re-prefill the
     * last token before sampling. */
    if (!c->decode_ready) {
        pthread_mutex_unlock(&c->mu);
        return enif_make_tuple2(env, atom_error, atom_no_logits);
    }

    /* Lazy-init the sampler chain on first use as a greedy fallback,
     * matching the behaviour callers got before configure_sampler/2
     * existed. The model layer should normally call configure_sampler
     * once per request before the first decode; this fallback keeps
     * the cache-only and stub-backed call sites working without
     * touching them. */
    if (!c->smpl) {
        struct llama_sampler *fallback = build_default_greedy_chain();
        if (!fallback) {
            pthread_mutex_unlock(&c->mu);
            return enif_make_tuple2(env, atom_error, atom_oom);
        }
        c->smpl = fallback;
    }

    /* llama_sampler_sample calls llama_sampler_accept on the chain
     * internally; the chain stays cached, so accept lands on the
     * cached object. A negative return is the safe wrapper's
     * thrown-exception sentinel (token ids are non-negative). */
    llama_token tok = erllama_safe_sampler_sample(c->smpl, c->ctx, -1);
    if (tok < 0) {
        pthread_mutex_unlock(&c->mu);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }

    /* Check end-of-generation before decoding; a failed vocab lookup
     * just means "not EOG" rather than an error here. */
    const struct llama_model *model = erllama_safe_get_model(c->ctx);
    const struct llama_vocab *vocab =
        model ? erllama_safe_model_get_vocab(model) : NULL;
    int eog = vocab ? erllama_safe_vocab_is_eog(vocab, tok) : 0;

    /* Decode the sampled token so logits for the NEXT sample exist.
     * tok_buf lives on the stack because llama_batch_get_one keeps a
     * pointer into it for the duration of the decode call. */
    llama_token tok_buf = tok;
    struct llama_batch batch = llama_batch_get_one(&tok_buf, 1);
    int rc = erllama_safe_decode(c->ctx, batch);
    if (rc == 0) c->decode_ready = 1;
    else c->decode_ready = 0;
    pthread_mutex_unlock(&c->mu);
    if (rc == ERLLAMA_DECODE_EXC_SENTINEL) {
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    if (rc != 0) {
        return enif_make_tuple2(env, atom_error, enif_make_int(env, rc));
    }
    ERL_NIF_TERM tag = eog ? enif_make_atom(env, "eog") : atom_ok;
    return enif_make_tuple2(env, tag, enif_make_int(env, tok));
}

/* nif_detokenize(Model, Tokens): convert a token-id list back into a
 * UTF-8 binary by concatenating per-token pieces. Returns the binary
 * on success (an empty binary for an empty list), badarg on a
 * malformed list, or {error, released | exception | invalid_token |
 * oom | too_large | tokenize-related atoms via token_list_error}. */
static ERL_NIF_TERM nif_detokenize(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_model_t *m;
    if (!enif_get_resource(env, argv[0], MODEL_RT, (void **) &m)) {
        return enif_make_badarg(env);
    }
    llama_token *tokens = NULL;
    int32_t n = 0;
    int rc = read_token_list(env, argv[1], &tokens, &n);
    if (rc != 1) return token_list_error(env, rc);
    if (n == 0) {
        /* Empty input detokenizes to an empty binary. */
        if (tokens) enif_free(tokens);
        ErlNifBinary empty;
        if (!enif_alloc_binary(0, &empty)) {
            return enif_make_tuple2(env, atom_error, atom_oom);
        }
        return enif_make_binary(env, &empty);
    }

    pthread_mutex_lock(&m->mu);
    if (!m->model || m->release_pending) {
        pthread_mutex_unlock(&m->mu);
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    const struct llama_vocab *vocab = erllama_safe_model_get_vocab(m->model);
    if (!vocab) {
        pthread_mutex_unlock(&m->mu);
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    int32_t n_vocab = erllama_safe_vocab_n_tokens(vocab);
    /* Fail closed if the vocab lookup gave us no usable size: without
     * n_vocab we cannot validate token IDs, and an out-of-range
     * positive ID would reach `id_to_token.at(id)` deep inside llama
     * and throw across the C ABI. Mirrors the prefill path. */
    if (n_vocab <= 0) {
        pthread_mutex_unlock(&m->mu);
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_invalid_token);
    }
    /* Validate before any token_to_piece call so out-of-range IDs do
     * not reach `id_to_token.at(id)` and trigger an internal throw.
     * (Negative IDs were already rejected by read_token_list.) */
    for (int32_t i = 0; i < n; i++) {
        if (tokens[i] >= n_vocab) {
            pthread_mutex_unlock(&m->mu);
            enif_free(tokens);
            return enif_make_tuple2(env, atom_error, atom_invalid_token);
        }
    }

    /* Per-token piece, concatenated. Pieces are typically a handful
     * of bytes; we grow the buffer on demand and re-call
     * llama_token_to_piece with a sized buffer when 256 bytes isn't
     * enough (it returns the negative needed size). The safe wrapper
     * returns INT32_MIN on a thrown C++ exception. */
    char small_piece[256];
    /* Guard the size computation: clamp n to a sane upper bound so
     * gcc's range analysis can prove cap fits. 16M tokens is far
     * beyond any realistic prompt; reject earlier rather than
     * overflow. */
    if (n < 0 || n > (1 << 24)) {
        pthread_mutex_unlock(&m->mu);
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_too_large);
    }
    /* Initial guess: 32 bytes per token plus slack; grown below when
     * a piece run exceeds it. */
    size_t cap = (size_t) n * 32u + 16u;
    char *out = enif_alloc(cap);
    if (!out) {
        pthread_mutex_unlock(&m->mu);
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    size_t used = 0;
    /* err: 0 = ok, 1 = invalid_token/exception, 2 = oom. */
    int err = 0;
    for (int32_t i = 0; i < n; i++) {
        char *piece_buf = small_piece;
        int32_t buf_size = (int32_t) sizeof(small_piece);
        char *grown = NULL;
        int32_t got = erllama_safe_token_to_piece(
            vocab, tokens[i], piece_buf, buf_size, 0, false);
        if (got == INT32_MIN) {
            err = 1;
            break;
        }
        if (got < 0) {
            /* Negative return is the needed size; retry with a
             * one-shot heap buffer. Reject absurd sizes (> 1 MiB per
             * piece) as corrupt. */
            int32_t need = -got;
            if (need <= 0 || need > (1 << 20)) {
                err = 1;
                break;
            }
            grown = enif_alloc((size_t) need);
            if (!grown) { err = 2; break; }
            piece_buf = grown;
            got = erllama_safe_token_to_piece(
                vocab, tokens[i], piece_buf, need, 0, false);
            if (got == INT32_MIN || got < 0) {
                enif_free(grown);
                err = 1;
                break;
            }
        }
        if (used + (size_t) got > cap) {
            /* Geometric growth keeps total realloc cost linear. */
            size_t new_cap = (used + (size_t) got) * 2 + 16;
            char *new_out = enif_realloc(out, new_cap);
            if (!new_out) {
                if (grown) enif_free(grown);
                err = 2;
                break;
            }
            out = new_out;
            cap = new_cap;
        }
        memcpy(out + used, piece_buf, (size_t) got);
        used += (size_t) got;
        if (grown) enif_free(grown);
    }
    pthread_mutex_unlock(&m->mu);
    enif_free(tokens);
    if (err) {
        enif_free(out);
        if (err == 2) return enif_make_tuple2(env, atom_error, atom_oom);
        return enif_make_tuple2(env, atom_error, atom_invalid_token);
    }

    ErlNifBinary outbin;
    if (!enif_alloc_binary(used, &outbin)) {
        enif_free(out);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    memcpy(outbin.data, out, used);
    enif_free(out);
    return enif_make_binary(env, &outbin);
}

/* =========================================================================
 * fsync_dir (existing)
 * ========================================================================= */

/* Translate an errno value into a lowercase POSIX-style atom
 * (eacces, enoent, ...). Any value not in the table maps to
 * 'unknown'. */
static ERL_NIF_TERM make_errno_atom(ErlNifEnv *env, int e) {
    static const struct { int err; const char *name; } tab[] = {
        { EACCES,       "eacces" },
        { EBUSY,        "ebusy" },
        { EEXIST,       "eexist" },
        { EINVAL,       "einval" },
        { EIO,          "eio" },
        { EISDIR,       "eisdir" },
        { ELOOP,        "eloop" },
        { EMFILE,       "emfile" },
        { ENAMETOOLONG, "enametoolong" },
        { ENFILE,       "enfile" },
        { ENOENT,       "enoent" },
        { ENOMEM,       "enomem" },
        { ENOSPC,       "enospc" },
        { ENOTDIR,      "enotdir" },
        { EPERM,        "eperm" },
        { EROFS,        "erofs" },
#ifdef EINTEGRITY
        /* FreeBSD fsync(2) returns EINTEGRITY on filesystem
         * integrity errors (ZFS checksum failure, ufs2 sb
         * mismatch). Surface it instead of mapping to "unknown". */
        { EINTEGRITY,   "eintegrity" },
#endif
    };
    for (size_t i = 0; i < sizeof(tab) / sizeof(tab[0]); i++) {
        if (tab[i].err == e) {
            return enif_make_atom(env, tab[i].name);
        }
    }
    return enif_make_atom(env, "unknown");
}

/* nif_fsync_dir(Path): open the directory read-only and fsync it so
 * a prior rename/create in it is durable. Returns ok or
 * {error, ErrnoAtom}. */
static ERL_NIF_TERM nif_fsync_dir(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
    (void) argc;
    /* copy_path rejects empty inputs, oversize inputs, and embedded
     * NUL bytes (which would otherwise let `<<"a\0b">>` be passed to
     * open() as just "a"). */
    char path[4097];
    if (!copy_path(env, argv[0], path, sizeof(path))) {
        return enif_make_badarg(env);
    }
    int fd = open(path, O_RDONLY);
    if (fd < 0) {
        return enif_make_tuple2(env, atom_error, make_errno_atom(env, errno));
    }
    /* Capture errno before close() so the atom reflects the fsync
     * failure, not any later call. */
    int ok = (fsync(fd) == 0);
    int saved = errno;
    close(fd);
    if (ok) {
        return atom_ok;
    }
    return enif_make_tuple2(env, atom_error, make_errno_atom(env, saved));
}

/* =========================================================================
 * Chat templating
 * =========================================================================
 *
 * nif_apply_chat_template renders a normalised chat request through
 * the model's chat template (read from GGUF metadata) and tokenises
 * the result. The Request map carries:
 *
 *   #{ messages := [#{role := binary(), content := binary()}]
 *    , system   => binary() | undefined
 *    , tools    => [#{name := binary(), description => binary(),
 *                     schema => map()}] | undefined
 *    }
 *
 * `tools` are inlined as a synthetic system addendum because
 * llama_chat_apply_template does not take a tools field. Models that
 * embed tool definitions in their template (llama-3.1, hermes-2,
 * qwen2.5) read them from the system block.
 */

/* Fetch `Map[Key]` as a binary; iolist values are flattened via
 * enif_inspect_iolist_as_binary. Returns 1 with `bin` filled on
 * success, 0 when the key is absent or the value is not
 * binary-compatible. The returned `bin` points into a process-owned
 * region; copy it before unlocking any cross-call resource. */
static int get_map_bin(ErlNifEnv *env, ERL_NIF_TERM map, const char *key,
                      ErlNifBinary *bin) {
    ERL_NIF_TERM value;
    ERL_NIF_TERM key_atom = enif_make_atom(env, key);
    if (!enif_get_map_value(env, map, key_atom, &value)) {
        return 0;
    }
    return enif_inspect_iolist_as_binary(env, value, bin) ? 1 : 0;
}

/* Iterate over a list of message maps and fill `out` with
 * llama_chat_message structs starting at slot idx0. Each message is
 * `#{role := ..., content := ...}`; role and content are copied into
 * NUL-terminated strings allocated with enif_alloc and the caller
 * must free them via free_chat_msgs.
 *
 * Returns the total number of populated messages on success, -1 on
 * bad input, -2 on OOM. On failure, every slot THIS call populated
 * (range [idx0, idx)) is freed and zeroed before returning, so a
 * caller that frees only its pre-call count does not leak the
 * partially-built messages.
 */
static int build_chat_msgs_from_list(
    ErlNifEnv *env, ERL_NIF_TERM list,
    struct llama_chat_message *out, int max_out, int idx0
) {
    int idx = idx0;
    int rc = -1;   /* default failure code: bad input */
    ERL_NIF_TERM head, tail = list;
    while (enif_get_list_cell(env, tail, &head, &tail)) {
        if (idx >= max_out) goto fail;
        if (!enif_is_map(env, head)) goto fail;
        ErlNifBinary role_bin, content_bin;
        if (!get_map_bin(env, head, "role", &role_bin)) goto fail;
        if (!get_map_bin(env, head, "content", &content_bin)) goto fail;
        char *role = enif_alloc(role_bin.size + 1);
        if (!role) { rc = -2; goto fail; }
        memcpy(role, role_bin.data, role_bin.size);
        role[role_bin.size] = '\0';
        char *content = enif_alloc(content_bin.size + 1);
        if (!content) {
            enif_free(role);
            rc = -2;
            goto fail;
        }
        memcpy(content, content_bin.data, content_bin.size);
        content[content_bin.size] = '\0';
        /* Publish the slot only once both strings exist, so every
         * slot below idx is fully populated. */
        out[idx].role = role;
        out[idx].content = content;
        idx++;
    }
    return idx;

fail:
    /* Unwind only the slots this call populated; slots below idx0
     * belong to the caller. Both pointers are non-NULL for every
     * populated slot. */
    for (int i = idx0; i < idx; i++) {
        enif_free((char *) out[i].role);
        enif_free((char *) out[i].content);
        out[i].role = NULL;
        out[i].content = NULL;
    }
    return rc;
}

/* Release the enif_alloc'ed role/content strings of the first n chat
 * messages; NULL (never-populated) slots are skipped. */
static void free_chat_msgs(struct llama_chat_message *msgs, int n) {
    int i;
    for (i = 0; i < n; i++) {
        char *role = (char *) msgs[i].role;
        char *content = (char *) msgs[i].content;
        if (role) {
            enif_free(role);
        }
        if (content) {
            enif_free(content);
        }
    }
}

/* Build a synthetic system content string that prepends the user-
 * supplied system text and renders tools as a textual list, so models
 * whose chat templates honour tool definitions in the system block
 * (llama-3.1+, hermes-2-pro, qwen2.5) see them. Caller frees with
 * enif_free.
 *
 * Returns the malloced string or NULL on OOM. NULL is also returned
 * (with *out_len untouched... actually set to 0) when the request has
 * neither a system key nor a tools list — callers distinguish the two
 * NULL cases by never needing to: both mean "no synthetic system
 * message". `*out_len` is set to the strlen for convenience.
 *
 * Buffer invariant: before each write of K bytes we ensure
 * pos + K + 1 <= cap, so the final buf[pos] = '\0' always has room. */
static char *build_system_content(ErlNifEnv *env, ERL_NIF_TERM request_map,
                                  size_t *out_len) {
    ErlNifBinary system_bin = {0};
    int has_system = get_map_bin(env, request_map, "system", &system_bin);

    ERL_NIF_TERM tools_term;
    int has_tools =
        enif_get_map_value(env, request_map, enif_make_atom(env, "tools"),
                           &tools_term)
        && enif_is_list(env, tools_term);

    if (!has_system && !has_tools) {
        if (out_len) *out_len = 0;
        return NULL;
    }

    /* Render: `<system>\n\nAvailable tools:\n  - name: description\n...` */
    size_t cap = 256;
    if (has_system) cap += system_bin.size;
    char *buf = enif_alloc(cap);
    if (!buf) return NULL;
    size_t pos = 0;
    if (has_system) {
        memcpy(buf + pos, system_bin.data, system_bin.size);
        pos += system_bin.size;
    }
    if (has_tools) {
        const char *header = (has_system ? "\n\nAvailable tools:\n" :
                                            "Available tools:\n");
        size_t header_len = strlen(header);
        if (pos + header_len + 1 > cap) {
            cap = (pos + header_len + 1) * 2;
            char *nbuf = enif_realloc(buf, cap);
            if (!nbuf) { enif_free(buf); return NULL; }
            buf = nbuf;
        }
        memcpy(buf + pos, header, header_len);
        pos += header_len;
        ERL_NIF_TERM head, tail = tools_term;
        while (enif_get_list_cell(env, tail, &head, &tail)) {
            /* Malformed tool entries (non-map, or missing name) are
             * silently skipped rather than failing the whole render. */
            if (!enif_is_map(env, head)) continue;
            ErlNifBinary name_bin, desc_bin;
            if (!get_map_bin(env, head, "name", &name_bin)) continue;
            int has_desc = get_map_bin(env, head, "description", &desc_bin);
            /* needed = "  - " + name + ": " + desc + '\n' (desc_bin is
             * only read when has_desc is true). */
            size_t needed = 4 + name_bin.size + 2 +
                             (has_desc ? desc_bin.size : 0) + 1;
            if (pos + needed + 1 > cap) {
                cap = (pos + needed + 1) * 2;
                char *nbuf = enif_realloc(buf, cap);
                if (!nbuf) { enif_free(buf); return NULL; }
                buf = nbuf;
            }
            memcpy(buf + pos, "  - ", 4); pos += 4;
            memcpy(buf + pos, name_bin.data, name_bin.size); pos += name_bin.size;
            if (has_desc) {
                memcpy(buf + pos, ": ", 2); pos += 2;
                memcpy(buf + pos, desc_bin.data, desc_bin.size); pos += desc_bin.size;
            }
            buf[pos++] = '\n';
        }
    }
    buf[pos] = '\0';
    if (out_len) *out_len = pos;
    return buf;
}

/* nif_apply_chat_template(Model, Request): render the normalised chat
 * request through the model's GGUF chat template and tokenise the
 * result with parse_special=true. Returns {ok, [TokenId]} or
 * {error, released | no_template | template_failed | tokenize_failed
 *  | exception | oom | too_large}; badarg on a malformed request map.
 * See the section comment above for the Request map shape. */
static ERL_NIF_TERM nif_apply_chat_template(ErlNifEnv *env, int argc,
                                            const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_model_t *m;
    if (!enif_get_resource(env, argv[0], MODEL_RT, (void **) &m)) {
        return enif_make_badarg(env);
    }
    if (!enif_is_map(env, argv[1])) {
        return enif_make_badarg(env);
    }

    /* Read the messages list from the request. */
    ERL_NIF_TERM messages_term;
    if (!enif_get_map_value(env, argv[1],
                            enif_make_atom(env, "messages"), &messages_term)
        || !enif_is_list(env, messages_term)) {
        return enif_make_badarg(env);
    }
    unsigned msg_len;
    if (!enif_get_list_length(env, messages_term, &msg_len)) {
        return enif_make_badarg(env);
    }

    /* +1 for an optional synthetic system message at the front. */
    int max_msgs = (int) msg_len + 1;
    struct llama_chat_message *msgs =
        enif_alloc(sizeof(struct llama_chat_message) * (size_t) max_msgs);
    if (!msgs) return enif_make_tuple2(env, atom_error, atom_oom);
    memset(msgs, 0, sizeof(struct llama_chat_message) * (size_t) max_msgs);

    int n_msgs = 0;
    char *synthetic_system = build_system_content(env, argv[1], NULL);
    if (synthetic_system) {
        /* Slot 0 holds the synthetic system message; its role string
         * must be heap-allocated so free_chat_msgs can release it
         * uniformly with the per-message strings. */
        char *role = enif_alloc(7);
        if (!role) {
            enif_free(synthetic_system);
            enif_free(msgs);
            return enif_make_tuple2(env, atom_error, atom_oom);
        }
        memcpy(role, "system", 7);
        msgs[0].role = role;
        msgs[0].content = synthetic_system;
        n_msgs = 1;
    }

    int built = build_chat_msgs_from_list(
        env, messages_term, msgs, max_msgs, n_msgs);
    if (built < 0) {
        /* NOTE(review): this frees only the pre-call n_msgs entries;
         * it relies on build_chat_msgs_from_list not leaving behind
         * allocations for entries past n_msgs on failure — confirm
         * the builder unwinds its own partial work. */
        free_chat_msgs(msgs, n_msgs);
        enif_free(msgs);
        return built == -2
                   ? enif_make_tuple2(env, atom_error, atom_oom)
                   : enif_make_badarg(env);
    }
    n_msgs = built;

    pthread_mutex_lock(&m->mu);
    if (!m->model || m->release_pending) {
        pthread_mutex_unlock(&m->mu);
        free_chat_msgs(msgs, n_msgs);
        enif_free(msgs);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    const char *tmpl = erllama_safe_model_chat_template(m->model, NULL);
    if (!tmpl || tmpl[0] == '\0') {
        pthread_mutex_unlock(&m->mu);
        free_chat_msgs(msgs, n_msgs);
        enif_free(msgs);
        return enif_make_tuple2(env, atom_error, atom_no_template);
    }

    /* Render. Start with a 4 KiB buffer; grow on negative-needed-size. */
    int32_t buf_size = 4096;
    char *buf = enif_alloc((size_t) buf_size);
    if (!buf) {
        pthread_mutex_unlock(&m->mu);
        free_chat_msgs(msgs, n_msgs);
        enif_free(msgs);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    int32_t written = erllama_safe_chat_apply_template(
        tmpl, msgs, (size_t) n_msgs, true, buf, buf_size);
    /* Negative (but not the INT32_MIN exception sentinel) means the
     * buffer was too small and -written is the needed size. */
    if (written < 0 && written != INT32_MIN) {
        int32_t needed = -written;
        if (needed > (int32_t) ERLLAMA_MAX_TOKEN_TEXT) {
            pthread_mutex_unlock(&m->mu);
            free_chat_msgs(msgs, n_msgs);
            enif_free(msgs);
            enif_free(buf);
            return enif_make_tuple2(env, atom_error, atom_too_large);
        }
        enif_free(buf);
        buf_size = needed + 16;
        buf = enif_alloc((size_t) buf_size);
        if (!buf) {
            pthread_mutex_unlock(&m->mu);
            free_chat_msgs(msgs, n_msgs);
            enif_free(msgs);
            return enif_make_tuple2(env, atom_error, atom_oom);
        }
        written = erllama_safe_chat_apply_template(
            tmpl, msgs, (size_t) n_msgs, true, buf, buf_size);
    }
    if (written < 0) {
        pthread_mutex_unlock(&m->mu);
        free_chat_msgs(msgs, n_msgs);
        enif_free(msgs);
        enif_free(buf);
        return enif_make_tuple2(env, atom_error,
                                written == INT32_MIN ? atom_exception
                                                     : atom_template_failed);
    }

    /* Tokenise the rendered string. parse_special=true so chat-template
     * tokens (`<|user|>`, `<|im_start|>`, etc.) become their special
     * token ids rather than text fragments. */
    const struct llama_vocab *vocab = erllama_safe_model_get_vocab(m->model);
    if (!vocab) {
        pthread_mutex_unlock(&m->mu);
        free_chat_msgs(msgs, n_msgs);
        enif_free(msgs);
        enif_free(buf);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }

    /* First-guess token buffer: one token per rendered byte is the
     * upper bound, but written+8 covers typical prompts; the retry
     * below handles the rare overflow. */
    int32_t n_max = written + 8;
    if (n_max < 16) n_max = 16;
    if (n_max > ERLLAMA_MAX_TOKENS) n_max = ERLLAMA_MAX_TOKENS;
    llama_token *tokens = enif_alloc(sizeof(llama_token) * (size_t) n_max);
    if (!tokens) {
        pthread_mutex_unlock(&m->mu);
        free_chat_msgs(msgs, n_msgs);
        enif_free(msgs);
        enif_free(buf);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    int32_t n = erllama_safe_tokenize(vocab, buf, written, tokens, n_max,
                                      true, true);
    if (n < 0 && n != INT32_MIN) {
        /* -n is the required token count; retry with an exact-size
         * buffer, mirroring the nif_tokenize path. */
        int32_t needed = -n;
        if (needed > ERLLAMA_MAX_TOKENS) {
            pthread_mutex_unlock(&m->mu);
            free_chat_msgs(msgs, n_msgs);
            enif_free(msgs);
            enif_free(buf);
            enif_free(tokens);
            return enif_make_tuple2(env, atom_error, atom_too_large);
        }
        enif_free(tokens);
        tokens = enif_alloc(sizeof(llama_token) * (size_t) needed);
        if (!tokens) {
            pthread_mutex_unlock(&m->mu);
            free_chat_msgs(msgs, n_msgs);
            enif_free(msgs);
            enif_free(buf);
            return enif_make_tuple2(env, atom_error, atom_oom);
        }
        n = erllama_safe_tokenize(vocab, buf, written, tokens, needed,
                                  true, true);
    }
    pthread_mutex_unlock(&m->mu);
    free_chat_msgs(msgs, n_msgs);
    enif_free(msgs);
    enif_free(buf);
    if (n == INT32_MIN) {
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    if (n < 0) {
        enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_tokenize_failed);
    }

    /* Build the token list back-to-front so it comes out in order. */
    ERL_NIF_TERM list = enif_make_list(env, 0);
    for (int32_t i = n - 1; i >= 0; i--) {
        list = enif_make_list_cell(env, enif_make_int(env, tokens[i]), list);
    }
    enif_free(tokens);
    return enif_make_tuple2(env, atom_ok, list);
}

/* =========================================================================
 * Embeddings
 * =========================================================================
 *
 * Decodes a token list with the embeddings flag flipped on, then
 * reads the per-sequence pooled vector via llama_get_embeddings_seq.
 * Falls back to llama_get_embeddings (last-token) for models whose
 * pooling_type is NONE. The context must have been opened with
 * embeddings = true at new_context/2 time, otherwise the underlying
 * llama_decode allocates causal-LM logits buffers and the
 * embeddings reads return NULL.
 */
static ERL_NIF_TERM nif_embed(ErlNifEnv *env, int argc,
                              const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c)) {
        return enif_make_badarg(env);
    }
    llama_token *tokens = NULL;
    int32_t n = 0;
    int rc = read_token_list(env, argv[1], &tokens, &n);
    if (rc != 1) return token_list_error(env, rc);
    if (n == 0) {
        if (tokens) enif_free(tokens);
        return enif_make_tuple2(env, atom_error, atom_invalid_token);
    }

    ERL_NIF_TERM err;   /* error term for the shared fail path below */
    double *vec = NULL; /* copy of the embedding, made under the lock */

    pthread_mutex_lock(&c->mu);
    if (!c->ctx) {
        err = atom_released;
        goto fail_unlock;
    }
    const struct llama_model *model = erllama_safe_get_model(c->ctx);
    const struct llama_vocab *vocab =
        model ? erllama_safe_model_get_vocab(model) : NULL;
    int32_t n_vocab = vocab ? erllama_safe_vocab_n_tokens(vocab) : 0;
    if (n_vocab <= 0) {
        err = atom_exception;
        goto fail_unlock;
    }
    for (int32_t i = 0; i < n; i++) {
        /* Bound-check in both directions: a negative id would index
         * llama's vocab tables out of range just like an oversized one. */
        if (tokens[i] < 0 || tokens[i] >= n_vocab) {
            err = atom_invalid_token;
            goto fail_unlock;
        }
    }
    int32_t n_embd = erllama_safe_n_embd(model);
    if (n_embd <= 0) {
        err = atom_embed_failed;
        goto fail_unlock;
    }

    /* Flip on embeddings for this call; the caller may have left it
     * off for normal causal-lm decode. We do not flip it back here -
     * the next decode_one call would read garbage logits. The model
     * layer is responsible for using a dedicated context for
     * embeddings, or for arranging not to mix modes on the same ctx. */
    if (erllama_safe_set_embeddings(c->ctx, true) != 0) {
        err = atom_exception;
        goto fail_unlock;
    }

    struct llama_batch batch = llama_batch_get_one(tokens, n);
    int dr = erllama_safe_decode(c->ctx, batch);
    if (dr == ERLLAMA_DECODE_EXC_SENTINEL) {
        err = atom_exception;
        goto fail_unlock;
    }
    if (dr != 0) {
        /* Non-sentinel nonzero: surface llama_decode's raw return code. */
        err = enif_make_int(env, dr);
        goto fail_unlock;
    }
    /* The `decode_ready` flag implies "logits are ready for sampling";
     * after an embeddings decode the logits buffer is repurposed and a
     * follow-on decode_one would crash. Force it off so the model
     * layer must explicitly re-prefill before sampling. */
    c->decode_ready = 0;

    /* Try the pooled vector first; fall back to last-token. */
    float *embd = erllama_safe_get_embeddings_seq(c->ctx, 0);
    if (!embd) {
        embd = erllama_safe_get_embeddings(c->ctx);
    }
    if (!embd) {
        err = atom_embed_failed;
        goto fail_unlock;
    }

    /* Copy the floats out of the context-owned buffer before unlocking. */
    vec = enif_alloc(sizeof(double) * (size_t) n_embd);
    if (!vec) {
        err = atom_oom;
        goto fail_unlock;
    }
    for (int32_t i = 0; i < n_embd; i++) vec[i] = (double) embd[i];
    pthread_mutex_unlock(&c->mu);
    enif_free(tokens);

    ERL_NIF_TERM list = enif_make_list(env, 0);
    for (int32_t i = n_embd - 1; i >= 0; i--) {
        list = enif_make_list_cell(env, enif_make_double(env, vec[i]), list);
    }
    enif_free(vec);
    return enif_make_tuple2(env, atom_ok, list);

fail_unlock:
    /* Shared error exit: unlock, drop the token buffer, wrap `err`. */
    pthread_mutex_unlock(&c->mu);
    enif_free(tokens);
    return enif_make_tuple2(env, atom_error, err);
}

/* =========================================================================
 * Sampler config
 *
 * configure_sampler/2 is the one entry point that builds the per-context
 * sampler chain. It accepts a config map carrying any of: grammar,
 * repetition_penalty, top_k, top_p, min_p, temperature, seed. Missing
 * fields are skipped; a temperature of 0.0 (or no sampling params at
 * all) ends the chain in greedy.
 *
 * set_grammar/2 is a backwards-compatible alias that builds the same
 * chain with only a grammar entry. clear_sampler/1 drops the cached
 * chain so the next decode_one lazy-inits greedy.
 * ========================================================================= */

/* Build the fallback sampler chain: a bare chain holding a single
 * greedy stage. Returns NULL if any piece fails to allocate; nothing
 * is leaked in that case. */
static struct llama_sampler *build_default_greedy_chain(void) {
    struct llama_sampler *chain = NULL;
    struct llama_sampler *greedy = NULL;

    chain = erllama_safe_sampler_chain_init(
        llama_sampler_chain_default_params());
    if (!chain) goto fail;
    greedy = erllama_safe_sampler_init_greedy();
    if (!greedy) goto fail;
    if (erllama_safe_sampler_chain_add(chain, greedy) != 0) goto fail;
    return chain;

fail:
    if (greedy) (void) erllama_safe_sampler_free(greedy);
    if (chain) (void) erllama_safe_sampler_free(chain);
    return NULL;
}

/* Try to add `stage` to `chain`. Returns 0 on success. On failure
 * (stage is NULL, or the add itself fails) the stage is freed and -1
 * is returned; the chain is left intact for the caller to dispose of. */
static int chain_append(struct llama_sampler *chain,
                        struct llama_sampler *stage) {
    if (stage == NULL) {
        return -1;
    }
    int added = erllama_safe_sampler_chain_add(chain, stage);
    if (added != 0) {
        (void) erllama_safe_sampler_free(stage);
        return -1;
    }
    return 0;
}

/* Build a sampler chain from a config map. On failure returns NULL and
 * sets *out_err_atom to one of: atom_oom, atom_grammar_failed,
 * atom_exception, or a freshly-made `badarg` atom for malformed input.
 * The context lock must already be held by the caller (the grammar
 * path walks ctx to reach the vocab). */
static struct llama_sampler *
build_sampler_chain_from_map(ErlNifEnv *env, ERL_NIF_TERM cfg,
                             struct llama_context *ctx,
                             ERL_NIF_TERM *out_err_atom) {
    if (!enif_is_map(env, cfg)) {
        *out_err_atom = enif_make_atom(env, "badarg");
        return NULL;
    }

    /* Grammar requires the vocab; everything else does not. */
    ErlNifBinary grammar_bin;
    int has_grammar = 0;
    {
        ERL_NIF_TERM v;
        if (enif_get_map_value(env, cfg, enif_make_atom(env, "grammar"), &v)) {
            if (!enif_inspect_iolist_as_binary(env, v, &grammar_bin) ||
                grammar_bin.size == 0) {
                *out_err_atom = enif_make_atom(env, "badarg");
                return NULL;
            }
            has_grammar = 1;
        }
    }

    /* Scalar knobs. Each has_* flag records key presence; the defaults
     * chosen here are the neutral values, so a missing key and an
     * explicitly-neutral value build the same chain. */
    int32_t i32;
    double f64;
    int has_top_k = get_map_int31(env, cfg, "top_k", &i32);
    int32_t top_k_val = has_top_k ? i32 : 0;

    int has_top_p = get_map_double(env, cfg, "top_p", &f64);
    double top_p_val = has_top_p ? f64 : 1.0;

    int has_min_p = get_map_double(env, cfg, "min_p", &f64);
    double min_p_val = has_min_p ? f64 : 0.0;

    int has_temp = get_map_double(env, cfg, "temperature", &f64);
    double temp_val = has_temp ? f64 : 0.0;

    int has_rep = get_map_double(env, cfg, "repetition_penalty", &f64);
    double rep_val = has_rep ? f64 : 1.0;

    uint32_t seed_val = 0;
    int has_seed = 0;
    {
        ERL_NIF_TERM v;
        if (enif_get_map_value(env, cfg, enif_make_atom(env, "seed"), &v)) {
            unsigned long seed_ul;
            if (!enif_get_ulong(env, v, &seed_ul)) {
                *out_err_atom = enif_make_atom(env, "badarg");
                return NULL;
            }
            seed_val = (uint32_t) seed_ul;
            has_seed = 1;
        }
    }

    struct llama_sampler_chain_params sp =
        llama_sampler_chain_default_params();
    struct llama_sampler *chain = erllama_safe_sampler_chain_init(sp);
    if (!chain) {
        *out_err_atom = atom_oom;
        return NULL;
    }

    if (has_grammar) {
        const struct llama_model *model = erllama_safe_get_model(ctx);
        const struct llama_vocab *vocab =
            model ? erllama_safe_model_get_vocab(model) : NULL;
        if (!vocab) {
            *out_err_atom = atom_exception;
            goto fail;
        }
        /* The grammar init wants a NUL-terminated string; the iolist
         * binary is not, so copy it out. */
        char *gstr = enif_alloc(grammar_bin.size + 1);
        if (!gstr) {
            *out_err_atom = atom_oom;
            goto fail;
        }
        memcpy(gstr, grammar_bin.data, grammar_bin.size);
        gstr[grammar_bin.size] = '\0';
        struct llama_sampler *g =
            erllama_safe_sampler_init_grammar(vocab, gstr, "root");
        enif_free(gstr);
        if (!g) {
            *out_err_atom = atom_grammar_failed;
            goto fail;
        }
        if (chain_append(chain, g) != 0) {
            *out_err_atom = atom_oom;
            goto fail;
        }
    }

    /* Optional stages, added only when their value is non-neutral.
     * chain_append frees the stage itself on failure; the chain is
     * freed once, at the fail label. */
    if (has_rep && rep_val != 1.0) {
        /* 64 = penalty look-back window (last-n tokens) -- fixed here;
         * TODO confirm against llama.cpp's recommended default. */
        if (chain_append(chain,
                         erllama_safe_sampler_init_penalties(
                             64, (float) rep_val, 0.0f, 0.0f)) != 0) {
            *out_err_atom = atom_oom;
            goto fail;
        }
    }
    if (has_top_k && top_k_val > 0) {
        if (chain_append(chain,
                         erllama_safe_sampler_init_top_k(top_k_val)) != 0) {
            *out_err_atom = atom_oom;
            goto fail;
        }
    }
    if (has_top_p && top_p_val < 1.0) {
        if (chain_append(chain,
                         erllama_safe_sampler_init_top_p((float) top_p_val,
                                                         1)) != 0) {
            *out_err_atom = atom_oom;
            goto fail;
        }
    }
    if (has_min_p && min_p_val > 0.0) {
        if (chain_append(chain,
                         erllama_safe_sampler_init_min_p((float) min_p_val,
                                                         1)) != 0) {
            *out_err_atom = atom_oom;
            goto fail;
        }
    }
    if (has_temp && temp_val > 0.0) {
        /* Stochastic tail: temperature scaling, then the seeded
         * distribution sampler as the terminal stage. */
        if (chain_append(chain,
                         erllama_safe_sampler_init_temp((float) temp_val)) != 0 ||
            chain_append(chain,
                         erllama_safe_sampler_init_dist(seed_val)) != 0) {
            *out_err_atom = atom_oom;
            goto fail;
        }
    } else {
        /* temperature == 0 or absent: greedy terminal. */
        if (chain_append(chain, erllama_safe_sampler_init_greedy()) != 0) {
            *out_err_atom = atom_oom;
            goto fail;
        }
        (void) has_seed; /* seed without temperature is a no-op. */
    }

    return chain;

fail:
    (void) erllama_safe_sampler_free(chain);
    return NULL;
}

/* configure_sampler/2: build a chain from the config map and install
 * it as the context's cached sampler, freeing any previous chain. */
static ERL_NIF_TERM nif_configure_sampler(ErlNifEnv *env, int argc,
                                          const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c) ||
        !enif_is_map(env, argv[1])) {
        return enif_make_badarg(env);
    }

    pthread_mutex_lock(&c->mu);
    ERL_NIF_TERM result;
    if (c->ctx == NULL) {
        result = enif_make_tuple2(env, atom_error, atom_released);
    } else {
        ERL_NIF_TERM err = atom_oom;
        struct llama_sampler *chain =
            build_sampler_chain_from_map(env, argv[1], c->ctx, &err);
        if (chain == NULL) {
            result = enif_make_tuple2(env, atom_error, err);
        } else {
            if (c->smpl != NULL) {
                (void) erllama_safe_sampler_free(c->smpl);
            }
            c->smpl = chain;
            result = atom_ok;
        }
    }
    pthread_mutex_unlock(&c->mu);
    return result;
}

/* Backwards-compatible alias: wraps the grammar term in a one-key
 * config map and delegates to configure_sampler. */
static ERL_NIF_TERM nif_set_grammar(ErlNifEnv *env, int argc,
                                    const ERL_NIF_TERM argv[]) {
    (void) argc;
    ERL_NIF_TERM cfg = enif_make_new_map(env);
    enif_make_map_put(env, cfg, enif_make_atom(env, "grammar"), argv[1],
                      &cfg);
    ERL_NIF_TERM delegated[2];
    delegated[0] = argv[0];
    delegated[1] = cfg;
    return nif_configure_sampler(env, 2, delegated);
}

/* Drop the context's cached sampler chain, if any; the next decode
 * will lazy-init a fresh greedy chain. Always returns ok. */
static ERL_NIF_TERM nif_clear_sampler(ErlNifEnv *env, int argc,
                                      const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c)) {
        return enif_make_badarg(env);
    }
    pthread_mutex_lock(&c->mu);
    struct llama_sampler *stale = c->smpl;
    c->smpl = NULL;
    if (stale != NULL) {
        (void) erllama_safe_sampler_free(stale);
    }
    pthread_mutex_unlock(&c->mu);
    return atom_ok;
}

/* =========================================================================
 * LoRA adapters
 * ========================================================================= */

/* Load a LoRA adapter file against a model and wrap it in a resource.
 * The resource pins the model resource so the model outlives every
 * adapter derived from it. */
static ERL_NIF_TERM nif_adapter_load(ErlNifEnv *env, int argc,
                                     const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_model_t *m;
    char path[4097]; /* 4096-byte path plus NUL */
    if (!enif_get_resource(env, argv[0], MODEL_RT, (void **) &m) ||
        !copy_path(env, argv[1], path, sizeof(path))) {
        return enif_make_badarg(env);
    }

    pthread_mutex_lock(&m->mu);
    if (m->model == NULL) {
        pthread_mutex_unlock(&m->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    struct llama_adapter_lora *adapter =
        erllama_safe_adapter_lora_init(m->model, path);
    pthread_mutex_unlock(&m->mu);
    if (adapter == NULL) {
        return enif_make_tuple2(env, atom_error, atom_load_failed);
    }

    erllama_adapter_t *ares = enif_alloc_resource(ADAPTER_RT, sizeof(*ares));
    if (ares == NULL) {
        erllama_safe_adapter_lora_free(adapter);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    memset(ares, 0, sizeof(*ares));
    if (pthread_mutex_init(&ares->mu, NULL) != 0) {
        enif_release_resource(ares);
        erllama_safe_adapter_lora_free(adapter);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    ares->mu_inited = 1;
    ares->adapter = adapter;
    ares->model_res = m;
    enif_keep_resource(m); /* adapter holds a ref on its model */

    ERL_NIF_TERM term = enif_make_resource(env, ares);
    enif_release_resource(ares);
    return enif_make_tuple2(env, atom_ok, term);
}

/* Explicitly release an adapter's llama object. Idempotence is by
 * error: a second call reports {error, released}. */
static ERL_NIF_TERM nif_adapter_free(ErlNifEnv *env, int argc,
                                     const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_adapter_t *a;
    if (!enif_get_resource(env, argv[0], ADAPTER_RT, (void **) &a)) {
        return enif_make_badarg(env);
    }
    ERL_NIF_TERM result = atom_ok;
    pthread_mutex_lock(&a->mu);
    if (a->adapter == NULL) {
        result = enif_make_tuple2(env, atom_error, atom_released);
    } else {
        erllama_safe_adapter_lora_free(a->adapter);
        a->adapter = NULL;
    }
    pthread_mutex_unlock(&a->mu);
    return result;
}

/* Install a set of adapters with scales on a context. Takes a list of
 * {AdapterRes, Scale} pairs; an empty list detaches everything.
 * The model layer is responsible for tracking the current attachment
 * set; this NIF just plumbs through to llama_set_adapters_lora.
 *
 * NOTE(review): each adapter pointer is captured under that adapter's
 * own mutex, which is released before c->mu is taken below. A
 * concurrent nif_adapter_free in that window would leave a dangling
 * pointer in `adapters` -- confirm the model layer serializes
 * set_adapters against adapter_free. */
static ERL_NIF_TERM nif_set_adapters(ErlNifEnv *env, int argc,
                                     const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c)) {
        return enif_make_badarg(env);
    }
    ERL_NIF_TERM list = argv[1];
    unsigned n;
    if (!enif_get_list_length(env, list, &n)) {
        return enif_make_badarg(env);
    }

    /* Parallel arrays, only allocated for a non-empty list; n == 0
     * passes NULLs straight through to detach everything. */
    struct llama_adapter_lora **adapters = NULL;
    float *scales = NULL;
    if (n > 0) {
        adapters = enif_alloc(sizeof(*adapters) * n);
        scales = enif_alloc(sizeof(*scales) * n);
        if (!adapters || !scales) {
            if (adapters) enif_free(adapters);
            if (scales) enif_free(scales);
            return enif_make_tuple2(env, atom_error, atom_oom);
        }
    }

    /* Walk the {AdapterRes, Scale} pairs, filling both arrays. */
    ERL_NIF_TERM head, tail = list;
    unsigned i = 0;
    while (enif_get_list_cell(env, tail, &head, &tail)) {
        int arity;
        const ERL_NIF_TERM *pair;
        if (!enif_get_tuple(env, head, &arity, &pair) || arity != 2) {
            if (adapters) enif_free(adapters);
            if (scales) enif_free(scales);
            return enif_make_badarg(env);
        }
        erllama_adapter_t *a;
        if (!enif_get_resource(env, pair[0], ADAPTER_RT, (void **) &a)) {
            if (adapters) enif_free(adapters);
            if (scales) enif_free(scales);
            return enif_make_badarg(env);
        }
        /* Scale may arrive as a float or an integer term. */
        double scale;
        if (!enif_get_double(env, pair[1], &scale)) {
            long ll;
            if (enif_get_long(env, pair[1], &ll)) {
                scale = (double) ll;
            } else {
                if (adapters) enif_free(adapters);
                if (scales) enif_free(scales);
                return enif_make_badarg(env);
            }
        }
        /* Capture the raw adapter pointer under the adapter lock; see
         * the NOTE(review) above about the window after unlock. */
        pthread_mutex_lock(&a->mu);
        if (!a->adapter) {
            pthread_mutex_unlock(&a->mu);
            if (adapters) enif_free(adapters);
            if (scales) enif_free(scales);
            return enif_make_tuple2(env, atom_error, atom_released);
        }
        adapters[i] = a->adapter;
        scales[i] = (float) scale;
        pthread_mutex_unlock(&a->mu);
        i++;
    }

    pthread_mutex_lock(&c->mu);
    if (!c->ctx) {
        pthread_mutex_unlock(&c->mu);
        if (adapters) enif_free(adapters);
        if (scales) enif_free(scales);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    int rc = erllama_safe_set_adapters_lora(c->ctx, adapters, n, scales);
    pthread_mutex_unlock(&c->mu);

    if (adapters) enif_free(adapters);
    if (scales) enif_free(scales);

    if (rc != 0) {
        return enif_make_tuple2(env, atom_error, atom_exception);
    }
    return atom_ok;
}

/* =========================================================================
 * Per-request sampler resource (Phase 4 infrastructure)
 *
 * Wraps a llama_sampler_chain built by build_sampler_chain_from_map
 * so multiple in-flight requests (v0.2+) can hold independent chains.
 * The v0.1 model layer still uses configure_sampler/2 against the
 * context's cached `c->smpl`; this resource is the building block
 * for the eventual decode_and_sample_batch NIF.
 * ========================================================================= */

/* Create a standalone sampler resource from a config map. The chain is
 * built with the context lock held; the new resource pins the context
 * resource so the context outlives the sampler. */
static ERL_NIF_TERM nif_sampler_new(ErlNifEnv *env, int argc,
                                    const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_context_t *c;
    if (!enif_get_resource(env, argv[0], CTX_RT, (void **) &c) ||
        !enif_is_map(env, argv[1])) {
        return enif_make_badarg(env);
    }
    pthread_mutex_lock(&c->mu);
    if (c->ctx == NULL) {
        pthread_mutex_unlock(&c->mu);
        return enif_make_tuple2(env, atom_error, atom_released);
    }
    ERL_NIF_TERM err = atom_oom;
    struct llama_sampler *chain =
        build_sampler_chain_from_map(env, argv[1], c->ctx, &err);
    pthread_mutex_unlock(&c->mu);
    if (chain == NULL) {
        return enif_make_tuple2(env, atom_error, err);
    }

    erllama_sampler_t *sres = enif_alloc_resource(SAMPLER_RT, sizeof(*sres));
    if (sres == NULL) {
        (void) erllama_safe_sampler_free(chain);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    memset(sres, 0, sizeof(*sres));
    if (pthread_mutex_init(&sres->mu, NULL) != 0) {
        enif_release_resource(sres);
        (void) erllama_safe_sampler_free(chain);
        return enif_make_tuple2(env, atom_error, atom_oom);
    }
    sres->mu_inited = 1;
    sres->chain = chain;
    sres->ctx_res = c;
    enif_keep_resource(c); /* sampler holds a ref on its context */

    ERL_NIF_TERM term = enif_make_resource(env, sres);
    enif_release_resource(sres);
    return enif_make_tuple2(env, atom_ok, term);
}

/* Explicitly release a sampler resource's chain. A second call on the
 * same resource reports {error, released}. */
static ERL_NIF_TERM nif_sampler_free(ErlNifEnv *env, int argc,
                                     const ERL_NIF_TERM argv[]) {
    (void) argc;
    erllama_sampler_t *s;
    if (!enif_get_resource(env, argv[0], SAMPLER_RT, (void **) &s)) {
        return enif_make_badarg(env);
    }
    ERL_NIF_TERM result = atom_ok;
    pthread_mutex_lock(&s->mu);
    if (s->chain == NULL) {
        result = enif_make_tuple2(env, atom_error, atom_released);
    } else {
        (void) erllama_safe_sampler_free(s->chain);
        s->chain = NULL;
    }
    pthread_mutex_unlock(&s->mu);
    return result;
}

/* NIF dispatch table. Every entry runs on a dirty scheduler:
 * compute-heavy calls (tokenize, prefill, decode, sampler and KV ops)
 * on dirty CPU; filesystem-touching calls (fsync_dir, load_model,
 * adapter_load) on dirty IO. */
static ErlNifFunc nif_funcs[] = {
    {"nif_crc32c",       1, nif_crc32c,       ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_kv_pack",      3, nif_kv_pack,      ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_kv_pack",      4, nif_kv_pack,      ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_kv_unpack",    3, nif_kv_unpack,    ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_kv_seq_rm",    4, nif_kv_seq_rm,    ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_fsync_dir",    1, nif_fsync_dir,    ERL_NIF_DIRTY_JOB_IO_BOUND},
    {"nif_load_model",   2, nif_load_model,   ERL_NIF_DIRTY_JOB_IO_BOUND},
    {"nif_free_model",   1, nif_free_model,   ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_new_context",  2, nif_new_context,  ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_free_context", 1, nif_free_context, ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_tokenize",     3, nif_tokenize,     ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_prefill",      2, nif_prefill,      ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_decode_one",   1, nif_decode_one,   ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_detokenize",   2, nif_detokenize,   ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_apply_chat_template", 2, nif_apply_chat_template, ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_embed",        2, nif_embed,        ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_set_grammar",  2, nif_set_grammar,  ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_configure_sampler", 2, nif_configure_sampler,
        ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_clear_sampler", 1, nif_clear_sampler, ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_adapter_load", 2, nif_adapter_load, ERL_NIF_DIRTY_JOB_IO_BOUND},
    {"nif_adapter_free", 1, nif_adapter_free, ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_set_adapters", 2, nif_set_adapters, ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_sampler_new",  2, nif_sampler_new,  ERL_NIF_DIRTY_JOB_CPU_BOUND},
    {"nif_sampler_free", 1, nif_sampler_free, ERL_NIF_DIRTY_JOB_CPU_BOUND}
};

/* Module entry point. Callback slots are (load, reload, upgrade,
 * unload); reload is NULL (removed from modern OTP) and upgrade is
 * NULL -- NOTE(review): with no upgrade callback a hot code upgrade of
 * this module will fail to load the NIF; confirm that is intended. */
ERL_NIF_INIT(erllama_nif, nif_funcs, load, NULL, NULL, unload)