// vim:ts=2:sw=2:et
//-----------------------------------------------------------------------------
// CSV decoding/encoding (RFC 4180-style), producing/consuming native Erlang
// terms in a single pass.
//
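// Decoding:
// - Without `headers`: a list of rows, each row a list of binary fields.
// - With `headers`: the first row (or an explicit `{headers, [Names]}` list)
//   supplies the column names. Data rows are returned as field lists by
//   default, as tuples with `{return, tuple}`, or as maps keyed by the
//   column names with `{return, map}`.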
//
// Encoding accepts the same shapes: a list of rows (each row a list of
// binaries/atoms/integers/floats), or — with `headers` — a list of maps
// whose values are written out in the given column order.
//
// Quoting follows RFC 4180: a field is quoted if it contains the delimiter,
// a quote character, or a line break; embedded quotes are doubled.
//-----------------------------------------------------------------------------
#pragma once
#include <cctype>
#include <cmath>
#include <charconv>
#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
#include <erl_nif.h>
#include "fast_float.hpp"
#include "glazer_atoms.hpp"
#include "glazer_bigint.hpp"
#include "glazer_common.hpp"
namespace glz {
//-----------------------------------------------------------------------------
// Per-column field type conversion (`{fields, [Type, ...]}` decode option)
//-----------------------------------------------------------------------------
// Target type of one decoded column. Set by parse_csv_field_type() and
// dispatched on by CSVDecoder::try_convert().
enum class CSVFieldType {
none, // leave as binary (default)
integer, // decoded with glz::BigInt::decode
floatp, // {float, Precision}: parsed as a double, then rounded using CSVFieldSpec::scale
boolean, // "true"/"TRUE"/"True" and "false"/"FALSE"/"False"
datetime, // {datetime, InputFormat}
binary, // leave as binary; handled exactly like `none` by convert_field()
charlist, // list of codepoints produced by decode_utf8()
existing_atom, // the atom with the field's name, only if it already exists
atom_list, // {atom, ExistingAtoms}: as existing_atom, but only for the listed names
};
// What to do with a field that fails to convert to the requested type.
// Applied by CSVDecoder::convert_field() after try_convert() returned 0.
enum class CSVOnFailure {
binary, // leave the original binary (default)
raise, // fail the whole decode with {invalid_field_value, Row, Col}
default_value, // use CSVFieldSpec::default_term (the original binary if no default was given)
null, // use the configured null term (CSVDecodeOpts::null_term)
};
// Conversion settings for a single column, filled in by
// parse_csv_field_spec() from one element of `{fields, [...]}`.
struct CSVFieldSpec {
CSVFieldType type = CSVFieldType::none;
int precision = 0; // for floatp: the Precision given in {float, Precision}
int64_t scale = 1; // for floatp: power of ten derived from `precision`, used for rounding
std::string format; // for datetime (strptime-like)
std::vector<std::string> atoms; // for {atom, ExistingAtoms}: the accepted atom names
CSVOnFailure on_failure = CSVOnFailure::binary; // fallback applied when conversion fails
ERL_NIF_TERM default_term = 0; // for empty fields, or on_failure == default_value
bool has_default = false; // true once a `default` was supplied; default_term is only read then
};
//-----------------------------------------------------------------------------
// Options
//-----------------------------------------------------------------------------
// Shape of each decoded data row, chosen with `{return, list | map | tuple}`.
enum class CSVReturnKind { list, map, tuple };
// Decode options, populated by parse_csv_decode_opts() and read (by const
// reference) by CSVDecoder.
struct CSVDecodeOpts {
char delimiter = ','; // {delimiter, Char}: field separator, a 7-bit character
bool headers = false; // bare `headers` atom or {headers, ...}
bool hdr_atom = false; // {headers, atom}
bool hdr_existing_atom = false; // {headers, existing_atom}
bool hdr_charlist = false; // {headers, charlist}
CSVReturnKind return_kind = CSVReturnKind::list; // {return, list | map | tuple}
bool explicit_headers = false; // {headers, [List]}: no header row in data
ERL_NIF_TERM null_term = 0; // {null_term, Atom}; stays 0 unless that option is given
std::vector<ERL_NIF_TERM> header_list; // populated when explicit_headers is true
std::vector<CSVFieldSpec> fields; // {fields, [...]}: per-column conversions, by position
size_t skip_rows = 0; // {skip, N}: skip first N data rows
size_t limit = 0; // {limit, N}: max data rows (0 = unlimited); also set by {skip, {From, To}}
// When true, fields are copied into fresh binaries instead of referencing
// the input via sub-binary. Use this when decoded fields will outlive the
// input binary by a large margin: without it, one live field keeps the
// entire input buffer from being collected.
bool copy_strings = false;
};
// Parses a `field_type()`:
//   integer | {float, Precision} | boolean | {datetime, InputFormat}
//   | binary | charlist | existing_atom | {atom, ExistingAtoms :: list(atom())}
//
// On success stores the type in `spec` (together with precision/scale,
// format, or the atom names for the parameterised forms) and returns true.
// Returns false, leaving `spec` untouched, when `term` is not a valid field
// type: unknown atom, tuple of the wrong arity, negative precision,
// non-binary datetime format, or an atom list that is improper or contains
// something atom_to_sv() rejects.
static bool parse_csv_field_type(ErlNifEnv* env, ERL_NIF_TERM term, CSVFieldSpec& spec)
{
  // Parameterless types are plain atoms.
  if (enif_is_identical(term, AM_INTEGER))       { spec.type = CSVFieldType::integer;       return true; }
  if (enif_is_identical(term, AM_BOOLEAN))       { spec.type = CSVFieldType::boolean;       return true; }
  if (enif_is_identical(term, AM_BINARY))        { spec.type = CSVFieldType::binary;        return true; }
  if (enif_is_identical(term, AM_CHARLIST))      { spec.type = CSVFieldType::charlist;      return true; }
  if (enif_is_identical(term, AM_EXISTING_ATOM)) { spec.type = CSVFieldType::existing_atom; return true; }

  // Everything else is a {Tag, Param} pair.
  int arity;
  const ERL_NIF_TERM* tp;
  if (!enif_get_tuple(env, term, &arity, &tp) || arity != 2) return false;

  if (enif_is_identical(tp[0], AM_FLOAT)) {
    // 10^18 is the largest power of ten representable in int64_t; a larger
    // exponent would overflow `scale`. A double carries fewer significant
    // digits than that anyway, so clamping the rounding scale loses nothing.
    constexpr int max_scale_digits = 18;
    int precision;
    if (!enif_get_int(env, tp[1], &precision) || precision < 0) return false;
    const int scale_digits = precision < max_scale_digits ? precision : max_scale_digits;
    spec.type      = CSVFieldType::floatp;
    spec.precision = precision;
    spec.scale     = glz::power(10LL, static_cast<size_t>(scale_digits));
    return true;
  }

  if (enif_is_identical(tp[0], AM_DATETIME)) {
    ErlNifBinary bin;
    if (!enif_inspect_binary(env, tp[1], &bin)) return false;
    spec.type = CSVFieldType::datetime;
    spec.format.assign(reinterpret_cast<const char*>(bin.data), bin.size);
    return true;
  }

  if (enif_is_identical(tp[0], AM_ATOM)) {
    if (!enif_is_list(env, tp[1])) return false;
    // Collect into a local first so `spec` is only modified on success.
    std::vector<std::string> atoms;
    ERL_NIF_TERM ahead, atail = tp[1];
    char buf[256];
    while (enif_get_list_cell(env, atail, &ahead, &atail)) {
      std::string_view sv;
      if (!atom_to_sv(env, ahead, buf, sizeof(buf), sv)) return false;
      atoms.emplace_back(sv);
    }
    // The walk stops at the first non-cons tail; anything but [] there means
    // the list was improper.
    if (!enif_is_empty_list(env, atail)) return false;
    spec.type  = CSVFieldType::atom_list;
    spec.atoms = std::move(atoms);
    return true;
  }

  return false;
}
// Parses one element of `{fields, [FieldType, ...]}`:
//   FieldType    :: field_type()
//                 | #{type => field_type(), default => Term,
//                     on_failure => binary | raise | default | null}
//   field_type() :: integer | {float, Precision} | boolean | {datetime, InputFormat}
//                 | binary | charlist | existing_atom
//                 | {atom, ExistingAtoms :: list(atom())}
//
// In the map form `type` is required; `default` and `on_failure` are
// optional. Returns false when the type is missing or invalid, or when
// `on_failure` is not one of the four listed atoms.
static bool parse_csv_field_spec(ErlNifEnv* env, ERL_NIF_TERM term, CSVFieldSpec& spec)
{
  // Short form: the element is the field type itself.
  if (!enif_is_map(env, term))
    return parse_csv_field_type(env, term, spec);

  ERL_NIF_TERM value;

  // Mandatory `type` key.
  if (!enif_get_map_value(env, term, AM_TYPE, &value))
    return false;
  if (!parse_csv_field_type(env, value, spec))
    return false;

  // Optional `default` key.
  if (enif_get_map_value(env, term, AM_DEFAULT, &value)) {
    spec.has_default  = true;
    spec.default_term = value;
  }

  // Optional `on_failure` key; absent means the struct default is kept.
  if (!enif_get_map_value(env, term, AM_ON_FAILURE, &value))
    return true;

  const struct { ERL_NIF_TERM atom; CSVOnFailure action; } choices[] = {
    { AM_BINARY,  CSVOnFailure::binary        },
    { AM_RAISE,   CSVOnFailure::raise         },
    { AM_DEFAULT, CSVOnFailure::default_value },
    { AM_NULL,    CSVOnFailure::null          },
  };
  for (const auto& choice : choices) {
    if (enif_is_identical(value, choice.atom)) {
      spec.on_failure = choice.action;
      return true;
    }
  }
  return false;
}
// Parses the decode option list into `opts`.
//
// Recognised options:
//   headers | copy_strings
//   {delimiter, Char}            Char in 1..127
//   {null_term, Atom}
//   {return, list | map | tuple}
//   {headers, [Names] | binary | string | atom | existing_atom | charlist}
//   {skip, N} | {skip, {From, To}}
//   {limit, N}
//   {fields, [FieldType, ...]}
//
// Returns false for an out-of-range delimiter, a non-atom null term, a
// non-list `fields` value or an invalid field spec. List elements that are
// not a known flag or 2-tuple, and unrecognised values for return/headers/
// skip/limit, are ignored. `{return, map}` without any headers option is
// downgraded to list output.
//
// When an option is repeated the last occurrence wins: the header key kinds
// (binary/atom/existing_atom/charlist) are mutually exclusive, and an
// explicit header list replaces any previously given one.
static bool parse_csv_decode_opts(ErlNifEnv* env, ERL_NIF_TERM list, CSVDecodeOpts& opts)
{
  ERL_NIF_TERM head, tail = list;
  while (enif_get_list_cell(env, tail, &head, &tail)) {
    // Bare-atom flags.
    if (enif_is_identical(head, AM_HEADERS))      { opts.headers = true;      continue; }
    if (enif_is_identical(head, AM_COPY_STRINGS)) { opts.copy_strings = true; continue; }

    // Everything else must be a {Key, Value} pair; other terms are skipped.
    int arity;
    const ERL_NIF_TERM* tp;
    if (!enif_get_tuple(env, head, &arity, &tp) || arity != 2) continue;

    if (enif_is_identical(tp[0], AM_DELIMITER)) {
      int cp;
      if (!enif_get_int(env, tp[1], &cp) || cp <= 0 || cp > 0x7F) return false;
      opts.delimiter = static_cast<char>(cp);
    } else if (enif_is_identical(tp[0], AM_NULL_TERM)) {
      if (!enif_is_atom(env, tp[1])) return false;
      opts.null_term = tp[1];
    } else if (enif_is_identical(tp[0], AM_RETURN)) {
      if (enif_is_identical(tp[1], AM_MAP))        opts.return_kind = CSVReturnKind::map;
      else if (enif_is_identical(tp[1], AM_TUPLE)) opts.return_kind = CSVReturnKind::tuple;
      else if (enif_is_identical(tp[1], AM_LIST))  opts.return_kind = CSVReturnKind::list;
    } else if (enif_is_identical(tp[0], AM_HEADERS)) {
      // {headers, [List]}            — explicit column names, no header row in data
      // {headers, binary | string}   — 1st row → binary keys
      // {headers, atom}              — 1st row → atom keys
      // {headers, existing_atom}     — 1st row → existing-atom keys
      // {headers, charlist}          — 1st row → charlist keys
      if (enif_is_list(env, tp[1])) {
        opts.headers = true;
        opts.explicit_headers = true;
        opts.header_list.clear();   // a repeated option replaces, not appends
        ERL_NIF_TERM hhead, htail = tp[1];
        while (enif_get_list_cell(env, htail, &hhead, &htail))
          opts.header_list.push_back(hhead);
      } else {
        const bool as_binary   = enif_is_identical(tp[1], AM_BINARY) ||
                                 enif_is_identical(tp[1], AM_STRING);
        const bool as_atom     = enif_is_identical(tp[1], AM_ATOM);
        const bool as_existing = enif_is_identical(tp[1], AM_EXISTING_ATOM);
        const bool as_charlist = enif_is_identical(tp[1], AM_CHARLIST);
        if (as_binary || as_atom || as_existing || as_charlist) {
          // Exactly one key kind is active afterwards (none set == binary).
          opts.headers           = true;
          opts.hdr_atom          = as_atom;
          opts.hdr_existing_atom = as_existing;
          opts.hdr_charlist      = as_charlist;
        }
      }
    } else if (enif_is_identical(tp[0], AM_SKIP)) {
      // {skip, N}: skip the first N data rows.
      // {skip, {From, To}} (1-based, inclusive): stored as skip_rows = From-1
      // and limit = To-From+1.
      ErlNifUInt64 n;
      if (enif_get_uint64(env, tp[1], &n)) {
        opts.skip_rows = static_cast<size_t>(n);
      } else {
        int arity2;
        const ERL_NIF_TERM* tp2;
        if (enif_get_tuple(env, tp[1], &arity2, &tp2) && arity2 == 2) {
          ErlNifUInt64 from, to;
          if (enif_get_uint64(env, tp2[0], &from) && enif_get_uint64(env, tp2[1], &to)
              && from >= 1 && to >= from) {
            opts.skip_rows = static_cast<size_t>(from - 1);
            opts.limit     = static_cast<size_t>(to - from + 1);
          }
        }
      }
    } else if (enif_is_identical(tp[0], AM_LIMIT)) {
      ErlNifUInt64 n;
      if (enif_get_uint64(env, tp[1], &n))
        opts.limit = static_cast<size_t>(n);
    } else if (enif_is_identical(tp[0], AM_FIELDS)) {
      if (!enif_is_list(env, tp[1])) return false;
      // Build into a local so a bad spec leaves opts.fields unchanged.
      ERL_NIF_TERM fhead, ftail = tp[1];
      std::vector<CSVFieldSpec> fields;
      while (enif_get_list_cell(env, ftail, &fhead, &ftail)) {
        CSVFieldSpec spec;
        if (!parse_csv_field_spec(env, fhead, spec)) return false;
        fields.push_back(std::move(spec));
      }
      opts.fields = std::move(fields);
    }
  }

  // Map rows need column names to key on.
  if (opts.return_kind == CSVReturnKind::map && !opts.headers)
    opts.return_kind = CSVReturnKind::list;
  return true;
}
// Encode options, populated by parse_csv_encode_opts() and read (by const
// reference) by CSVEncoder.
struct CSVEncodeOpts {
char delimiter = ','; // {delimiter, Char}: field separator, a 7-bit character
bool headers = false; // bare `headers` atom or {headers, [Names]}: rows are maps
bool explicit_headers = false; // {headers, [List]}: explicit column order
std::vector<ERL_NIF_TERM> header_list; // populated when explicit_headers is true
std::string_view line_ending = "\r\n"; // {line_ending, lf | crlf}
};
// Parses the encode option list into `opts`.
//
// Recognised options:
//   headers                     rows are maps; columns come from the first row
//   {headers, [Names]}          rows are maps; explicit column order/names
//   {delimiter, Char}           Char in 1..127
//   {line_ending, lf | crlf}
//
// Returns false for an out-of-range delimiter, an unknown line ending, or a
// `{headers, X}` whose X is not a list. Any other list element is ignored.
// A repeated `{headers, [Names]}` replaces the previously given names.
static bool parse_csv_encode_opts(ErlNifEnv* env, ERL_NIF_TERM list, CSVEncodeOpts& opts)
{
  ERL_NIF_TERM head, tail = list;
  while (enif_get_list_cell(env, tail, &head, &tail)) {
    if (enif_is_identical(head, AM_HEADERS)) { opts.headers = true; continue; }

    int arity;
    const ERL_NIF_TERM* tp;
    if (!enif_get_tuple(env, head, &arity, &tp) || arity != 2) continue;

    if (enif_is_identical(tp[0], AM_DELIMITER)) {
      int cp;
      if (!enif_get_int(env, tp[1], &cp) || cp <= 0 || cp > 0x7F) return false;
      opts.delimiter = static_cast<char>(cp);
    } else if (enif_is_identical(tp[0], AM_LINE_ENDING)) {
      if (enif_is_identical(tp[1], AM_LF))        opts.line_ending = "\n";
      else if (enif_is_identical(tp[1], AM_CRLF)) opts.line_ending = "\r\n";
      else return false;
    } else if (enif_is_identical(tp[0], AM_HEADERS)) {
      // {headers, [Names]} — explicit column order/names for map-row encoding
      if (!enif_is_list(env, tp[1])) return false;
      opts.headers = true;
      opts.explicit_headers = true;
      opts.header_list.clear();   // last occurrence wins; do not accumulate columns
      ERL_NIF_TERM hhead, htail = tp[1];
      while (enif_get_list_cell(env, htail, &hhead, &htail))
        opts.header_list.push_back(hhead);
    }
  }
  return true;
}
//-----------------------------------------------------------------------------
// SIMD helpers — field scanners used by the decoder and the encoder.
//
// find_field_end: first of delimiter | '\n' | '\r' (unquoted-field boundary)
// find_csv_special: first of delimiter | '"' | '\n' | '\r' (needs-quoting check)
// find_byte (glazer_common.hpp): first '"' (quoted-field / encoder)
//
// find_field_end and find_csv_special use whichever vector paths the build
// enables — NEON, or AVX2 followed by SSE2 — and always finish with a scalar
// loop that handles the remaining tail (or the whole input when no SIMD
// extension is available).
// find_byte is the shared single-byte scanner from glazer_common.hpp.
//-----------------------------------------------------------------------------
// Returns a pointer to the first byte in [p, end) that ends an unquoted
// field — the delimiter, '\n' or '\r' — or `end` when there is none.
//
// Each enabled vector path consumes whole 16- or 32-byte chunks; the scalar
// loop at the bottom scans whatever is left. The chunk loops test the
// remaining length (`end - p`) instead of forming `p + 16` / `p + 32`:
// a pointer more than one past the end of the buffer is undefined behaviour
// even if it is only compared.
static const char* find_field_end(const char* p, const char* end, char delim) noexcept
{
#if defined(__ARM_NEON__)
  {
    const uint8x16_t vd = vdupq_n_u8(static_cast<uint8_t>(delim));
    const uint8x16_t vn = vdupq_n_u8('\n');
    const uint8x16_t vr = vdupq_n_u8('\r');
    while (end - p >= 16) {
      uint8x16_t v   = vld1q_u8(reinterpret_cast<const uint8_t*>(p));
      uint8x16_t hit = vorrq_u8(vorrq_u8(vceqq_u8(v, vd), vceqq_u8(v, vn)), vceqq_u8(v, vr));
      // Matching bytes are 0xFF; view the vector as two 64-bit halves so the
      // trailing-zero count / 8 gives the byte index within a half.
      uint64x2_t h64 = vreinterpretq_u64_u8(hit);
      uint64_t lo = vgetq_lane_u64(h64, 0);
      uint64_t hi = vgetq_lane_u64(h64, 1);
      if (lo | hi) {
        if (lo) return p + (__builtin_ctzll(lo) >> 3);
        return p + 8 + (__builtin_ctzll(hi) >> 3);
      }
      p += 16;
    }
  }
#endif
#if defined(__AVX2__)
  {
    const __m256i vd = _mm256_set1_epi8(delim);
    const __m256i vn = _mm256_set1_epi8('\n');
    const __m256i vr = _mm256_set1_epi8('\r');
    while (end - p >= 32) {
      __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
      // One mask bit per byte; the lowest set bit is the first match.
      uint32_t mask = (uint32_t)_mm256_movemask_epi8(_mm256_or_si256(
        _mm256_or_si256(_mm256_cmpeq_epi8(v, vd), _mm256_cmpeq_epi8(v, vn)),
        _mm256_cmpeq_epi8(v, vr)));
      if (mask) return p + __builtin_ctz(mask);
      p += 32;
    }
  }
#endif
#if defined(__SSE2__)
  {
    const __m128i vd = _mm_set1_epi8(delim);
    const __m128i vn = _mm_set1_epi8('\n');
    const __m128i vr = _mm_set1_epi8('\r');
    while (end - p >= 16) {
      __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
      unsigned mask = (unsigned)_mm_movemask_epi8(_mm_or_si128(
        _mm_or_si128(_mm_cmpeq_epi8(v, vd), _mm_cmpeq_epi8(v, vn)),
        _mm_cmpeq_epi8(v, vr)));
      if (mask) return p + __builtin_ctz(mask);
      p += 16;
    }
  }
#endif
  // Scalar tail (and the only path without SIMD support).
  while (p < end && *p != delim && *p != '\n' && *p != '\r') ++p;
  return p;
}
// Finds the first byte in [p, end) that requires the field to be quoted:
// the delimiter, a double-quote, a newline, or a carriage return. Returns
// `end` when the range contains none of them.
//
// Each enabled vector path consumes whole 16- or 32-byte chunks; the scalar
// loop at the bottom scans whatever is left. The chunk loops test the
// remaining length (`end - p`) instead of forming `p + 16` / `p + 32`:
// a pointer more than one past the end of the buffer is undefined behaviour
// even if it is only compared.
static const char* find_csv_special(const char* p, const char* end, char delim) noexcept
{
#if defined(__ARM_NEON__)
  {
    const uint8x16_t vd = vdupq_n_u8(static_cast<uint8_t>(delim));
    const uint8x16_t vq = vdupq_n_u8('"');
    const uint8x16_t vn = vdupq_n_u8('\n');
    const uint8x16_t vr = vdupq_n_u8('\r');
    while (end - p >= 16) {
      uint8x16_t v   = vld1q_u8(reinterpret_cast<const uint8_t*>(p));
      uint8x16_t hit = vorrq_u8(vorrq_u8(vceqq_u8(v, vd), vceqq_u8(v, vq)),
                                vorrq_u8(vceqq_u8(v, vn), vceqq_u8(v, vr)));
      // Matching bytes are 0xFF; view the vector as two 64-bit halves so the
      // trailing-zero count / 8 gives the byte index within a half.
      uint64x2_t h64 = vreinterpretq_u64_u8(hit);
      uint64_t lo = vgetq_lane_u64(h64, 0);
      uint64_t hi = vgetq_lane_u64(h64, 1);
      if (lo | hi) {
        if (lo) return p + (__builtin_ctzll(lo) >> 3);
        return p + 8 + (__builtin_ctzll(hi) >> 3);
      }
      p += 16;
    }
  }
#endif
#if defined(__AVX2__)
  {
    const __m256i vd = _mm256_set1_epi8(delim);
    const __m256i vq = _mm256_set1_epi8('"');
    const __m256i vn = _mm256_set1_epi8('\n');
    const __m256i vr = _mm256_set1_epi8('\r');
    while (end - p >= 32) {
      __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
      // One mask bit per byte; the lowest set bit is the first match.
      uint32_t mask = (uint32_t)_mm256_movemask_epi8(_mm256_or_si256(_mm256_or_si256(
        _mm256_or_si256(_mm256_cmpeq_epi8(v, vd), _mm256_cmpeq_epi8(v, vq)),
        _mm256_cmpeq_epi8(v, vn)), _mm256_cmpeq_epi8(v, vr)));
      if (mask) return p + __builtin_ctz(mask);
      p += 32;
    }
  }
#endif
#if defined(__SSE2__)
  {
    const __m128i vd = _mm_set1_epi8(delim);
    const __m128i vq = _mm_set1_epi8('"');
    const __m128i vn = _mm_set1_epi8('\n');
    const __m128i vr = _mm_set1_epi8('\r');
    while (end - p >= 16) {
      __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
      unsigned mask = (unsigned)_mm_movemask_epi8(_mm_or_si128(_mm_or_si128(
        _mm_or_si128(_mm_cmpeq_epi8(v, vd), _mm_cmpeq_epi8(v, vq)),
        _mm_cmpeq_epi8(v, vn)), _mm_cmpeq_epi8(v, vr)));
      if (mask) return p + __builtin_ctz(mask);
      p += 16;
    }
  }
#endif
  // Scalar tail (and the only path without SIMD support).
  while (p < end && *p != delim && *p != '"' && *p != '\n' && *p != '\r') ++p;
  return p;
}
//-----------------------------------------------------------------------------
// CSVDecoder
//-----------------------------------------------------------------------------
struct CSVDecoder {
// Decoder state: the NIF environment terms are created in, the parsed
// options, and the input bytes [m_beg, m_end) with m_p as the read cursor.
ErlNifEnv* m_env;
const CSVDecodeOpts& m_opts; // stored by reference, not copied
const char* m_beg; // first input byte
const char* m_p; // current read position, advanced by read_field()/skip_eol()
const char* m_end; // one past the last input byte
ERL_NIF_TERM m_input_bin; // original binary term — used for zero-copy sub_binary
// `data`/`size` describe the bytes to decode; `input_bin` is the term handed
// to make_span_term() together with [m_beg, m_end) when building field terms.
// NOTE(review): presumably `data` must be the contents of `input_bin` —
// confirm against make_span_term().
CSVDecoder(ErlNifEnv* e, const CSVDecodeOpts& o, const char* data, size_t size,
ERL_NIF_TERM input_bin)
: m_env(e), m_opts(o), m_beg(data), m_p(data), m_end(data + size), m_input_bin(input_bin) {}
// True when `c` is one of the two line-break bytes, LF or CR.
static bool is_eol(char c)
{
  switch (c) {
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
// Consumes one line terminator at the cursor: an optional '\r' followed by
// an optional '\n', so "\r\n", "\n" and a lone "\r" each advance past
// exactly one line break. Does nothing if neither byte is present.
void skip_eol()
{
  const bool at_cr = m_p < m_end && *m_p == '\r';
  if (at_cr) ++m_p;
  const bool at_lf = m_p < m_end && *m_p == '\n';
  if (at_lf) ++m_p;
}
// Builds the term for a field whose bytes can be used verbatim (no quote
// unescaping needed). `sv` is a span of the input buffer.
//
// The work is delegated to make_span_term(), which receives the input
// binary, the buffer bounds and the copy_strings option:
//  - copy_strings == false (default): a zero-copy sub-binary of the input —
//    no allocation, but the input binary stays alive for as long as any
//    such sub-binary does;
//  - copy_strings == true: a freshly allocated binary, so the input buffer
//    can be collected independently of the decoded results.
ERL_NIF_TERM make_raw_field_term(std::string_view sv)
{
  const bool copy = m_opts.copy_strings;
  return make_span_term(m_env, m_input_bin, m_beg, m_end, sv, copy);
}
// Builds the term for the contents of a quoted field. `sv` is the text
// between the enclosing quotes; `has_quote_escape` says whether it contains
// at least one doubled quote ("").
//
// Without escapes the span is used as-is via make_raw_field_term().
// Otherwise every "" pair is collapsed to a single quote. The destination
// binary is allocated once at its exact final size (input length minus the
// number of pairs) and filled by copying the runs between pairs in bulk —
// one memcpy per escape plus one for the tail — rather than byte by byte or
// through an intermediate std::string.
ERL_NIF_TERM make_field_term(std::string_view sv, bool has_quote_escape)
{
  if (!has_quote_escape) return make_raw_field_term(sv);

  constexpr std::string_view::size_type npos = std::string_view::npos;
  const std::string_view pair = "\"\"";

  // Pass 1: count the (non-overlapping, left-to-right) quote pairs.
  size_t pairs = 0;
  for (size_t at = sv.find(pair); at != npos; at = sv.find(pair, at + 2))
    ++pairs;

  auto [term, dst] = make_binary(m_env, sv.size() - pairs);

  // Pass 2: copy each run up to and including the first quote of a pair,
  // then resume after the second quote.
  size_t src = 0, out = 0;
  for (size_t at = sv.find(pair); at != npos; at = sv.find(pair, src)) {
    const size_t run = at + 1 - src;
    memcpy(dst + out, sv.data() + src, run);
    out += run;
    src = at + 2;
  }
  memcpy(dst + out, sv.data() + src, sv.size() - src);   // remainder after the last pair
  return term;
}
// Reads one field starting at m_p and returns its term. On return m_p sits
// just past the field (after the closing quote for a quoted field); the
// following delimiter or line break is NOT consumed — read_record() looks at
// it to decide whether the record continues.
//
// `ok` is set to false (and 0 is returned) only when a quoted field has no
// closing quote before the end of input; otherwise it is set to true.
// Bytes that follow a closing quote are not inspected here.
ERL_NIF_TERM read_field(bool& ok)
{
  ok = true;

  const bool quoted = m_p < m_end && *m_p == '"';
  if (!quoted) {
    // Unquoted: everything up to the next delimiter, '\n' or '\r' (SIMD scan).
    const char* const first = m_p;
    m_p = find_field_end(first, m_end, m_opts.delimiter);
    return make_raw_field_term(std::string_view(first, static_cast<size_t>(m_p - first)));
  }

  const char* const first = ++m_p;   // step over the opening quote
  bool escaped = false;
  while (true) {
    // Jump straight to the next '"' (or to the end of input).
    m_p = find_byte(m_p, m_end, '"');
    if (m_p >= m_end) {              // ran out of input: unterminated quoted field
      ok = false;
      return 0;
    }
    const bool doubled = m_p + 1 < m_end && m_p[1] == '"';
    if (!doubled) break;             // a lone quote closes the field
    escaped = true;                  // "" is an escaped quote; keep scanning
    m_p += 2;
  }

  const std::string_view body(first, static_cast<size_t>(m_p - first));
  ++m_p;                             // step over the closing quote
  return make_field_term(body, escaped);
}
// Reads the next record (row) into `fields`, replacing its previous
// contents. Leading blank lines are skipped and the record's own line
// terminator, if present, is consumed.
//
// Returns true when a record was read. Returns false with `err == false` at
// a clean end of input, and false with `err == true` when a quoted field is
// unterminated.
template<size_t N>
bool read_record(SmallTermVec<N>& fields, bool& err)
{
  err = false;
  fields.set_size(0);

  // Skip blank lines; if that (or an already exhausted input) leaves nothing
  // to read, this is a clean EOF.
  while (m_p < m_end && is_eol(*m_p)) skip_eol();
  if (m_p >= m_end) return false;

  bool more = true;
  while (more) {
    bool ok;
    const ERL_NIF_TERM field = read_field(ok);
    if (!ok) [[unlikely]] {
      err = true;
      return false;
    }
    fields.push_back(field);

    // A delimiter right after the field means another field follows.
    more = m_p < m_end && *m_p == m_opts.delimiter;
    if (more) ++m_p;
  }

  if (m_p < m_end && is_eol(*m_p)) skip_eol();
  return true;
}
// Attempts the type-specific conversion of `sv` per `spec`. Returns the
// converted term, or 0 when the value doesn't match the requested type
// (the caller then applies `spec.on_failure`).
//
// Per type:
//  - integer:       empty input fails; otherwise the result of BigInt::decode
//                   (NOTE(review): presumably 0 for non-integer text — confirm).
//  - floatp:        the whole field must parse as a double; the value is
//                   rounded using spec.scale. Non-finite results ("nan",
//                   "inf", out-of-range input) count as failures, because
//                   enif_make_double raises badarg for them instead of
//                   returning a float.
//  - boolean:       true/TRUE/True and false/FALSE/False only.
//  - datetime:      datetime::parse() result returned as a 64-bit integer.
//  - charlist:      list of codepoints produced by decode_utf8().
//  - existing_atom: the atom, if it already exists.
//  - atom_list:     as existing_atom, restricted to the names in spec.atoms.
//  - anything else: 0.
ERL_NIF_TERM try_convert(std::string_view sv, const CSVFieldSpec& spec)
{
  const char* const first = sv.data();
  const char* const last  = first + sv.size();

  switch (spec.type) {
    case CSVFieldType::integer: {
      if (sv.empty()) return 0;
      return glz::BigInt::decode(m_env, first, last);
    }
    case CSVFieldType::floatp: {
      if (sv.empty())
        return 0;
      double d;
      auto [ep, ec] = glz::fast_float::from_chars(first, last, d);
      if (ec != std::errc{} || ep != last) [[unlikely]]
        return 0;
      // Erlang floats cannot represent NaN or infinity.
      if (!std::isfinite(d)) [[unlikely]]
        return 0;
      // Round to the requested number of decimals. If scaling overflows the
      // value is far too large for rounding to change it, so keep it as is.
      const double scale  = static_cast<double>(spec.scale);
      const double scaled = d * scale;
      if (std::isfinite(scaled))
        d = std::round(scaled) / scale;
      return enif_make_double(m_env, d);
    }
    case CSVFieldType::boolean: {
      if (sv == "true"  || sv == "TRUE"  || sv == "True")  return AM_TRUE;
      if (sv == "false" || sv == "FALSE" || sv == "False") return AM_FALSE;
      return 0;
    }
    case CSVFieldType::datetime: {
      auto secs = datetime::parse(sv, spec.format);
      if (!secs) [[unlikely]] return 0;
      return enif_make_int64(m_env, *secs);
    }
    case CSVFieldType::charlist: {
      std::vector<ERL_NIF_TERM> cps;
      cps.reserve(sv.size());   // upper bound: one codepoint per byte
      const char* p = first;
      while (p < last) cps.push_back(enif_make_uint(m_env, decode_utf8(p, last)));
      return enif_make_list_from_array(m_env, cps.data(), unsigned(cps.size()));
    }
    case CSVFieldType::existing_atom: {
      ERL_NIF_TERM t;
      return enif_make_existing_atom_len(m_env, sv.data(), sv.size(), &t, ERL_NIF_LATIN1)
           ? t : 0;
    }
    case CSVFieldType::atom_list: {
      for (const auto& name : spec.atoms) {
        if (sv == name) {
          ERL_NIF_TERM t;
          return enif_make_existing_atom_len(m_env, sv.data(), sv.size(), &t, ERL_NIF_LATIN1)
               ? t : 0;
        }
      }
      return 0;
    }
    default:
      return 0;
  }
}
// Converts one field term (a binary) per `spec`.
//
// Returns 0 to signal {invalid_field_value, Row, Col} — when conversion
// fails with `spec.on_failure == raise`, or when `term` is not a binary.
// Otherwise always returns a valid term:
//  - `none`/`binary` types return `term` unchanged;
//  - an empty field with a configured default returns that default;
//  - a successful conversion returns the converted value;
//  - a failed conversion falls back per `on_failure`: the original binary,
//    the default (or the original binary if none was given), or the null
//    term.
//
// The null term is CSVDecodeOpts::null_term, which is 0 unless
// `{null_term, Atom}` was given. Returning that 0 would be misread by the
// caller as a conversion error, so AM_NULL is used when none is configured.
ERL_NIF_TERM convert_field(ERL_NIF_TERM term, const CSVFieldSpec& spec)
{
  if (spec.type == CSVFieldType::none || spec.type == CSVFieldType::binary)
    return term;

  ErlNifBinary bin;
  if (!enif_inspect_binary(m_env, term, &bin))
    return 0;
  const std::string_view sv(reinterpret_cast<const char*>(bin.data), bin.size);

  if (sv.empty() && spec.has_default)
    return spec.default_term;

  const ERL_NIF_TERM r = try_convert(sv, spec);
  if (r) [[likely]] return r;

  switch (spec.on_failure) {
    case CSVOnFailure::binary:        return term;
    case CSVOnFailure::raise:         return 0;
    case CSVOnFailure::default_value: return spec.has_default ? spec.default_term : term;
    case CSVOnFailure::null:          return m_opts.null_term ? m_opts.null_term : AM_NULL;
  }
  return term;
}
// Applies the per-column conversions in `m_opts.fields` to `fields`, in
// place. Specs are matched to columns by position; columns beyond the end of
// the spec list (and specs beyond the end of the row) are left alone.
//
// Returns 0 when every converted column succeeded, otherwise the 1-based
// index of the first column for which convert_field() returned 0. Columns
// before the failing one have already been replaced at that point.
template<size_t N>
size_t convert_fields(SmallTermVec<N>& fields)
{
  const size_t count = std::min(fields.size(), m_opts.fields.size());
  size_t col = 0;
  while (col < count) {
    const auto result = convert_field(fields[col], m_opts.fields[col]);
    if (!result) [[unlikely]]
      return col + 1;
    fields.data()[col] = result;
    ++col;
  }
  return 0;
}
// Turns one header-row field (a binary) into the key term requested by the
// `{headers, Kind}` option:
//  - no kind flag set: the binary itself;
//  - atom:             a (possibly new) atom named after the field;
//  - existing_atom:    the atom if it already exists, else the binary;
//  - charlist:         the list of codepoints produced by decode_utf8().
// If `bin_term` cannot be inspected as a binary it is returned unchanged.
//
// enif_make_atom_len raises badarg for names longer than 255 bytes, which
// would put an exception term into the header list; such names fall back to
// the binary, mirroring the existing_atom fallback.
ERL_NIF_TERM make_header_key(ERL_NIF_TERM bin_term)
{
  if (!m_opts.hdr_atom && !m_opts.hdr_existing_atom && !m_opts.hdr_charlist)
    return bin_term;

  ErlNifBinary bin;
  if (!enif_inspect_binary(m_env, bin_term, &bin)) return bin_term;
  const std::string_view sv(reinterpret_cast<const char*>(bin.data), bin.size);

  if (m_opts.hdr_atom) {
    constexpr size_t max_atom_len = 255;
    return sv.size() <= max_atom_len
         ? enif_make_atom_len(m_env, sv.data(), sv.size())
         : bin_term;
  }

  if (m_opts.hdr_existing_atom) {
    ERL_NIF_TERM t;
    return enif_make_existing_atom_len(m_env, sv.data(), sv.size(), &t, ERL_NIF_LATIN1)
         ? t : bin_term;
  }

  // charlist: decode UTF-8 bytes to a list of Unicode codepoints
  std::vector<ERL_NIF_TERM> cps;
  cps.reserve(sv.size());   // upper bound: one codepoint per byte
  const char* p = sv.data();
  const char* end = p + sv.size();
  while (p < end)
    cps.push_back(enif_make_uint(m_env, decode_utf8(p, end)));
  return enif_make_list_from_array(m_env, cps.data(), unsigned(cps.size()));
}
// Decodes the entire input.
// Returns:
// - success: {true, #{headers => nil|[binary()|atom()], data => [[term()]]|[map()]}}
// - failure: {false, atom | binary}
//
// When `headers` is set the first row is extracted as the `headers` value
// (applying `{headers, atom|existing_atom|charlist}` if requested). Subsequent rows are
// emitted as field lists (default), as tuples when {return, tuple} is set,
// or as maps keyed by the header names when {return, map} is also set.
// duplicate_header is returned if a header row contains duplicate column
// names and map output is requested.
std::tuple<bool, ERL_NIF_TERM> decode()
{
SmallTermVec<64> fields;
SmallTermVec<64> header;
std::vector<ERL_NIF_TERM> rows;
// Conservative row-count estimate (4 B/row minimum, e.g. "a,b\n") avoids
// the ~15 doubling reallocations a 25K-row file would otherwise pay as
// `rows` grows one push_back at a time; an overestimate only wastes
// unused capacity, never time, so erring high for short rows is fine.
rows.reserve((m_end - m_beg) / 4);
size_t row_num = 0;
bool err = false;
auto rec = read_record(fields, err);
if (!rec) [[unlikely]]
goto DONE;
// Populate header: explicit list beats reading a header row from the data.
if (m_opts.explicit_headers) {
for (auto h : m_opts.header_list)
header.push_back(h);
// Do NOT consume a data row as the header.
} else if (m_opts.headers) {
for (auto field : fields)
header.push_back(make_header_key(field));
rec = read_record(fields, err);
}
// Skip the requested number of leading data rows.
{
size_t skipped = 0;
while (rec && skipped < m_opts.skip_rows) {
rec = read_record(fields, err);
++skipped;
}
}
// Data rows: maps, tuples, or field lists per {return, ...} (map implies
// headers — enforced by parse_csv_decode_opts). Dispatch on return_kind
// once here rather than per-record inside the loop below.
switch (m_opts.return_kind) {
case CSVReturnKind::map:
while (rec && (m_opts.limit == 0 || row_num < m_opts.limit)) {
++row_num;
if (!m_opts.fields.empty()) {
if (size_t col = convert_fields(fields))
return std::make_tuple(
false,
enif_make_tuple3(m_env, AM_INVALID_FIELD_VALUE,
enif_make_uint64(m_env, row_num),
enif_make_uint64(m_env, col)));
}
ERL_NIF_TERM map = fields.to_erl_map(m_env, header);
if (!map) [[unlikely]]
return std::make_tuple(false, AM_DUPLICATE_HEADER);
rows.push_back(map);
rec = read_record(fields, err);
}
break;
case CSVReturnKind::tuple:
while (rec && (m_opts.limit == 0 || row_num < m_opts.limit)) {
++row_num;
if (!m_opts.fields.empty()) {
if (size_t col = convert_fields(fields))
return std::make_tuple(
false,
enif_make_tuple3(m_env, AM_INVALID_FIELD_VALUE,
enif_make_uint64(m_env, row_num),
enif_make_uint64(m_env, col)));
}
rows.push_back(fields.to_erl_tuple(m_env));
rec = read_record(fields, err);
}
break;
case CSVReturnKind::list:
while (rec && (m_opts.limit == 0 || row_num < m_opts.limit)) {
++row_num;
if (!m_opts.fields.empty()) {
if (size_t col = convert_fields(fields))
return std::make_tuple(
false,
enif_make_tuple3(m_env, AM_INVALID_FIELD_VALUE,
enif_make_uint64(m_env, row_num),
enif_make_uint64(m_env, col)));
}
rows.push_back(fields.to_erl_list(m_env));
rec = read_record(fields, err);
}
break;
}
DONE:
if (err) [[unlikely]]
return std::make_tuple(false, AM_UNTERMINATED_QUOTED_FIELD);
ERL_NIF_TERM headers_term = m_opts.headers
? enif_make_list_from_array(m_env, header.data(), unsigned(header.size()))
: AM_NIL;
ERL_NIF_TERM data_term =
enif_make_list_from_array(m_env, rows.data(), unsigned(rows.size()));
ERL_NIF_TERM keys[2] = { AM_HEADERS, AM_DATA };
ERL_NIF_TERM vals[2] = { headers_term, data_term };
ERL_NIF_TERM result;
enif_make_map_from_arrays(m_env, keys, vals, 2, &result);
return std::make_tuple(true, result);
}
};
//-----------------------------------------------------------------------------
// Encoder
//-----------------------------------------------------------------------------
struct CSVEncoder {
// Encoder state. Encoded bytes are appended to m_out; when encoding fails,
// m_err holds a message and m_err_term the term it refers to.
ErlNifEnv* m_env;
const CSVEncodeOpts& m_opts; // stored by reference, not copied
OutBuf& m_out; // destination buffer, owned by the caller
const char* m_err; // failure message; "" until a failure is recorded
ERL_NIF_TERM m_err_term; // term associated with the failure; AM_NIL until error() sets it
// Binds the encoder to the environment, options and output buffer and starts
// with no error recorded.
CSVEncoder(ErlNifEnv* e, const CSVEncodeOpts& o, OutBuf& out)
: m_env(e), m_opts(o), m_out(out), m_err(""), m_err_term(AM_NIL) {}
// Appends one field's text to the output, quoting it when required.
//
// If `sv` contains none of the delimiter, '"', '\n' or '\r' it is written
// verbatim. Otherwise it is wrapped in double quotes and every embedded '"'
// is doubled; the runs between quotes are copied in bulk.
void push_field_raw(std::string_view sv)
{
  const char* cur = sv.data();
  const char* const stop = cur + sv.size();

  // Fast path: one scan decides whether any quoting is needed at all.
  const bool plain = find_csv_special(cur, stop, m_opts.delimiter) == stop;
  if (plain) {
    m_out.push(sv);
    return;
  }

  m_out.push('"');
  const char* quote;
  do {
    quote = find_byte(cur, stop, '"');
    m_out.push(cur, quote - cur);     // bytes up to (not including) the quote
    if (quote < stop) {
      m_out.push("\"\"", 2);          // the embedded quote, doubled
      cur = quote + 1;
    }
  } while (quote < stop);
  m_out.push('"');
}
// Encodes one field term and appends it to the output. Returns false when
// the term cannot be encoded.
//
//  - binaries and atoms go through push_field_raw(), i.e. are quoted when
//    their text requires it;
//  - integers are written by glz::BigInt::encode;
//  - floats are written with std::to_chars in `general` format;
//  - every other term type is rejected, as is a bitstring that
//    enif_inspect_binary cannot read or an atom atom_to_sv() rejects.
bool encode_field(ERL_NIF_TERM term)
{
  const auto kind = enif_term_type(m_env, term);

  if (kind == ERL_NIF_TERM_TYPE_BITSTRING) {
    ErlNifBinary bin;
    if (!enif_inspect_binary(m_env, term, &bin)) [[unlikely]]
      return false;
    push_field_raw(std::string_view(reinterpret_cast<const char*>(bin.data), bin.size));
    return true;
  }

  if (kind == ERL_NIF_TERM_TYPE_INTEGER)
    return glz::BigInt::encode(m_env, term, m_out);

  if (kind == ERL_NIF_TERM_TYPE_FLOAT) {
    double value;
    if (!enif_get_double(m_env, term, &value)) [[unlikely]]
      return false;
    char digits[32];
    const std::to_chars_result res =
      std::to_chars(digits, digits + 32, value, std::chars_format::general);
    if (res.ec != std::errc{}) [[unlikely]]
      return false;
    m_out.push(digits, res.ptr - digits);
    return true;
  }

  if (kind == ERL_NIF_TERM_TYPE_ATOM) {
    char name_buf[256];
    std::string_view name;
    if (!atom_to_sv(m_env, term, name_buf, sizeof(name_buf), name)) [[unlikely]]
      return false;
    push_field_raw(name);
    return true;
  }

  return false;
}
// Encodes one row given as a list of field terms: fields separated by the
// delimiter, followed by the configured line ending.
//
// Returns false — after recording the message and the offending term via
// error() — if a field cannot be encoded or the list is improper. Fields
// encoded before the failure have already been appended to the output.
bool encode_row(ERL_NIF_TERM row)
{
  ERL_NIF_TERM cell, rest = row;
  for (size_t col = 0; enif_get_list_cell(m_env, rest, &cell, &rest); ++col) {
    if (col != 0) m_out.push(m_opts.delimiter);
    if (!encode_field(cell)) [[unlikely]]
      return error("cannot encode CSV field", cell);
  }

  // The walk stops at the first non-cons tail; it must be [].
  const bool proper = enif_is_empty_list(m_env, rest);
  if (!proper) [[unlikely]]
    return error("cannot encode improper list as CSV row", rest);

  m_out.push(m_opts.line_ending);
  return true;
}
// Encodes one row given as a map, writing the values of the `header` keys in
// header order, separated by the delimiter and followed by the line ending.
// A key missing from the map yields an empty field.
//
// Returns false — after recording the failure via error() — if a value
// cannot be encoded.
bool encode_map_row(ERL_NIF_TERM map, const std::vector<ERL_NIF_TERM>& header)
{
  bool need_delim = false;
  for (const ERL_NIF_TERM key : header) {
    if (need_delim) m_out.push(m_opts.delimiter);
    need_delim = true;

    ERL_NIF_TERM val;
    const bool present = enif_get_map_value(m_env, map, key, &val);
    if (!present) continue;                     // missing key -> empty field
    if (!encode_field(val)) [[unlikely]]
      return error("cannot encode CSV field", val);
  }
  m_out.push(m_opts.line_ending);
  return true;
}
// Encodes `term`, a list of rows, into the output buffer.
//
// Without `headers` each row must be a list of fields. With `headers` each
// row must be a map: a header row is written first — the explicit
// `{headers, [Names]}` list, or otherwise the keys of the first row in map
// iteration order — and every row's values follow in that column order.
// An empty list produces no output at all (not even a header row).
//
// Returns false on failure with m_err (and, where a term is at fault,
// m_err_term) set: `term` is not a list, a row has the wrong shape, a
// header or field cannot be encoded, or the row list is improper. Output
// written before the failure is left in the buffer.
bool encode(ERL_NIF_TERM term)
{
  if (!enif_is_list(m_env, term)) { m_err = "expected a list of rows"; return false; }
  if (enif_is_empty_list(m_env, term)) return true;

  ERL_NIF_TERM row, rest = term;

  if (m_opts.headers) {
    // Column order: the explicit list (used in place, no copy), or the keys
    // of the first row.
    std::vector<ERL_NIF_TERM> derived;
    if (!m_opts.explicit_headers) {
      ERL_NIF_TERM head, tail = term;
      enif_get_list_cell(m_env, tail, &head, &tail);   // cannot fail: non-empty list
      if (enif_term_type(m_env, head) != ERL_NIF_TERM_TYPE_MAP) [[unlikely]]
        return error("headers option requires rows to be maps", head);
      ERL_NIF_TERM key, val;
      auto it = MapIterator::create(m_env, head);
      if (!it) [[unlikely]]
        return error("failed to create a map iterator", AM_NIL);
      while (it->get_pair(&key, &val)) {
        derived.push_back(key);
        it->next();
      }
    }
    const std::vector<ERL_NIF_TERM>& header =
      m_opts.explicit_headers ? m_opts.header_list : derived;

    // Emit header row.
    for (size_t i = 0; i < header.size(); ++i) {
      if (i > 0) m_out.push(m_opts.delimiter);
      if (!encode_field(header[i])) [[unlikely]]
        return error("cannot encode CSV header", header[i]);
    }
    m_out.push(m_opts.line_ending);

    while (enif_get_list_cell(m_env, rest, &row, &rest)) {
      if (enif_term_type(m_env, row) != ERL_NIF_TERM_TYPE_MAP) [[unlikely]]
        return error("headers option requires rows to be maps", row);
      if (!encode_map_row(row, header)) [[unlikely]]
        return false;
    }
  } else {
    while (enif_get_list_cell(m_env, rest, &row, &rest)) {
      if (!enif_is_list(m_env, row)) [[unlikely]]
        return error("expected each row to be a list", row);
      if (!encode_row(row)) [[unlikely]]
        return false;
    }
  }

  // The walk stops at the first non-cons tail. Anything but [] there means
  // the row list was improper; report it instead of silently dropping the
  // tail, as encode_row() does for individual rows.
  if (!enif_is_empty_list(m_env, rest)) [[unlikely]]
    return error("cannot encode improper list as CSV rows", rest);
  return true;
}
private:
template <int N>
bool error(const char (&err)[N], ERL_NIF_TERM term) {
m_err = err;
m_err_term = term;
return false;
}
};
} // namespace glz