c_src/glazer_common.hpp

Select File
c_src/glazer_common.hpp

// vim:ts=2:sw=2:et
//-----------------------------------------------------------------------------
// Shared utilities used by both the JSON and YAML decoders/encoders:
// growable term/byte buffers, the object-key cache, and UTF-8/atom helpers.
//-----------------------------------------------------------------------------
#pragma once

#include <array>
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <erl_nif.h>
#if defined(__SSE2__)
#  include <immintrin.h>
#endif
#if defined(__ARM_NEON__)
#  include <arm_neon.h>
#endif

namespace glz {

// Calculate `a` raised to the power of `b`.
template <typename T>
inline T power(T a, size_t b) {
  if (a == 0) return 0;

  T result = 1;
  for (; b > 0; b >>= 1) {
    if (b & 1) result *= a; // If b is odd, multiply the base with the result
    a *= a;
  }
  return result;
}

//-----------------------------------------------------------------------------
// Reduction-count bookkeeping for NIFs that run on a normal (non-dirty)
// scheduler. Without this, the BEAM charges a NIF call a flat 1 reduction
// regardless of how much CPU it actually used, which can let
// CPU-heavy-but-inline calls starve other runnable processes on the same
// scheduler. enif_consume_timeslice(env, percent) tells the scheduler what
// percentage of a timeslice (1-100) was consumed so its reduction
// accounting reflects the real work done before returning to Erlang.
//-----------------------------------------------------------------------------

static constexpr size_t BYTES_PER_REDUCTION = 20;
static constexpr size_t REDUCTION_COUNT     = 4000;

// Report the percentage of a timeslice consumed while processing `bytes`
// bytes, so the scheduler updates the process's reduction count instead of
// charging a flat 1 reduction for the NIF call. The return value of
// enif_consume_timeslice (1 if the process should yield/be preempted, 0
// otherwise) is normally used to drive cooperative scheduling for NIFs that
// process work in chunks across multiple calls. Here it is intentionally
// ignored: work that's long enough to need preemption is offloaded to a
// dirty scheduler instead, so this function is only reached for inline
// (small, sub-DIRTY_THRESHOLD) calls, where it serves purely to keep the
// reduction count accurate.
inline void update_reduction_count([[maybe_unused]] ErlNifEnv* env, [[maybe_unused]] size_t bytes) {
#if ERL_NIF_MAJOR_VERSION > 2 || (ERL_NIF_MAJOR_VERSION == 2 && ERL_NIF_MINOR_VERSION >= 4)
  size_t reds = bytes / BYTES_PER_REDUCTION;
  int percent = static_cast<int>(reds * 100 / REDUCTION_COUNT);
  if (percent < 1)   percent = 1;
  if (percent > 100) percent = 100;
  (void)enif_consume_timeslice(env, percent);
#endif
}

//-----------------------------------------------------------------------------
// Small inline-capacity buffer for term arrays built while parsing
// arrays/objects — avoids heap allocation for the common case (most
// containers have only a handful of elements).
//-----------------------------------------------------------------------------

template <size_t N>
struct SmallTermVec {
  ERL_NIF_TERM  m_inline[N];
  ERL_NIF_TERM* m_data = m_inline;
  size_t        m_len  = 0;
  size_t        m_cap  = N;

  ~SmallTermVec() { if (m_data != m_inline) delete[] m_data; }

  void push_back(ERL_NIF_TERM v) {
    if (m_len == m_cap) [[unlikely]] {
      size_t nc = m_cap * 2;
      ERL_NIF_TERM* nb = new ERL_NIF_TERM[nc];
      memcpy(nb, m_data, m_len * sizeof(ERL_NIF_TERM));
      if (m_data != m_inline) delete[] m_data;
      m_data = nb; m_cap = nc;
    }
    m_data[m_len++] = v;
  }

  ERL_NIF_TERM operator[](size_t i) const { return m_data[i]; }

  const ERL_NIF_TERM* begin()       const { return m_data; }
  const ERL_NIF_TERM* end()         const { return m_data + m_len; }

  ERL_NIF_TERM*       begin()             { return m_data; }
  ERL_NIF_TERM*       end()               { return m_data + m_len; }

  ERL_NIF_TERM*       data()        const { return m_data; }
  size_t              size()        const { return m_len;  }
  void                set_size(size_t n)  { m_len = n; }

  ERL_NIF_TERM  to_erl_list(ErlNifEnv* env) const {
    return enif_make_list_from_array(env, m_data, unsigned(m_len));
  }

  ERL_NIF_TERM  to_erl_tuple(ErlNifEnv* env) const {
    return enif_make_tuple_from_array(env, m_data, unsigned(m_len));
  }

  // `this` holds values, `keys` holds the parallel array of keys.
  // Returns 0 on error (i.e. duplicate keys) or ERL_NIF_TERM on success.
  template <bool Dedupe = false, typename T = SmallTermVec<16>>
  ERL_NIF_TERM to_erl_map(ErlNifEnv* env, const T& keys) const {
    auto n = std::min(keys.size(), m_len);
    ERL_NIF_TERM map;
    if (!enif_make_map_from_arrays(env, keys.data(), m_data, unsigned(n), &map)) [[unlikely]]
      map = 0;

    if (Dedupe && !map) {
      // Dedupe, keeping last value for duplicate keys.
      map = enif_make_new_map(env);
      for (auto p = m_data, q = keys.data(), e = p+n; p != e; ++p, ++q) {
        ERL_NIF_TERM next;
        enif_make_map_put(env, map, *q, *p, &next);
        map = next;
      }
    }

    return map;
  }
};

//-----------------------------------------------------------------------------
// Hex digit lookup — shared by the JSON \uXXXX and YAML \xXX/\uXXXX/\UXXXXXXXX
// escape decoders. A 256-entry table turns the 3-branch chain
// ('0'-'9' / 'a'-'f' / 'A'-'F') into a single array index.
//-----------------------------------------------------------------------------
static constexpr auto HEX_DIGIT_VALUES = [] {
  std::array<int8_t, 256> t{};
  t.fill(-1);
  for (int i = 0; i <= 9; ++i) t['0' + i] = static_cast<int8_t>(i);
  for (int i = 0; i <  6; ++i) {
    t['a' + i] = static_cast<int8_t>(10 + i);
    t['A' + i] = static_cast<int8_t>(10 + i);
  }
  return t;
}();

inline int hex_digit_value(unsigned char c)
{
  return HEX_DIGIT_VALUES[c];
}

//-----------------------------------------------------------------------------
// Zero-copy span term — shared by the JSON, YAML, and CSV decoders.
//
// Returns a sub-binary referencing `[beg, beg+end)` within `input_bin` when
// `copy_strings` is false (the default) and `sv` actually lies within
// `[beg, end)`: no allocation, but `input_bin` stays alive as long as any
// sub-binary referencing it is reachable. Falls back to copying — via
// make_binary — when `copy_strings` is true, or when `sv` does not point
// into the `[beg, end)` span (e.g. a scratch buffer used to fold/unescape
// multi-line scalars): callers may pass either a raw input slice or a
// locally-built buffer without needing to track which case applies.
//-----------------------------------------------------------------------------

inline ERL_NIF_TERM make_span_term(ErlNifEnv* env, ERL_NIF_TERM input_bin, const char* beg,
                                    const char* end, std::string_view sv, bool copy_strings)
{
  if (!copy_strings && sv.data() >= beg && sv.data() + sv.size() <= end)
    return enif_make_sub_binary(env, input_bin, static_cast<size_t>(sv.data() - beg), sv.size());
  auto [term, p] = make_binary(env, sv.size());
  memcpy(p, sv.data(), sv.size());
  return term;
}

//-----------------------------------------------------------------------------
// Output buffer — 4 KB inline, grows to heap
//-----------------------------------------------------------------------------

struct OutBuf {
  static constexpr size_t INLINE = 4096;

  char   m_inline[INLINE];
  char*  m_data;
  size_t m_len;
  size_t m_cap;

  OutBuf() : m_data(m_inline), m_len(0), m_cap(INLINE) {}
  ~OutBuf() { if (m_data != m_inline) free(m_data); }

  void ensure(size_t need) {
    if (m_len + need <= m_cap) [[likely]] return;
    size_t nc = m_cap * 2;
    while (nc < m_len + need) nc *= 2;
    if (m_data == m_inline) [[unlikely]] {
      // Can't realloc a stack array — first spill to the heap requires a copy.
      auto nb = std::unique_ptr<char[]>(static_cast<char*>(malloc(nc)));
      memcpy(nb.get(), m_data, m_len);
      m_data = nb.release();
    } else {
      // May resize in place (no copy) when the allocator can extend the block.
      m_data = static_cast<char*>(realloc(m_data, nc));
    }
    m_cap = nc;
  }

  void push(char c)                  { ensure(1); m_data[m_len++] = c; }
  void push(const char* s, size_t n) { ensure(n); memcpy(m_data + m_len, s, n); m_len += n; }
  void push(std::string_view sv)     { push(sv.data(), sv.size()); }

  std::string_view view() const      { return {m_data, m_len}; }

  operator std::string_view() const  { return view(); }
};

//-----------------------------------------------------------------------------
// Key cache — object/mapping keys repeat heavily within a document (e.g. a
// twitter feed has ~13K key occurrences but only ~94 distinct strings).
// Caching the resulting binary term lets repeated keys reuse the same
// already-built ERL_NIF_TERM instead of paying enif_make_new_binary + memcpy
// each time. Linear scan is fine — distinct-key counts are small in practice,
// and a capped size keeps pathological documents (huge unique-key counts)
// from paying scan overhead for no benefit.
//-----------------------------------------------------------------------------

struct KeyCache {
  // Open-addressed, power-of-two-sized table with linear probing. Sized
  // larger than the expected distinct-key count (real documents have ~94
  // distinct keys per the comment above) to keep the load factor low and
  // probe sequences short.
  static constexpr size_t CAP  = 128;
  static constexpr size_t MASK = CAP - 1;

  // Lazily-cleared via an epoch counter rather than zero-initializing the
  // whole array up front: a slot is "live" only if its `epoch` matches the
  // cache's current `m_epoch`. This avoids paying ~3KB of memset on every
  // single decode call (including tiny ones that never touch the cache —
  // see KEY_CACHE_MIN_SIZE) merely to construct the cache. `m_epoch` is
  // seeded from a process-wide monotonic counter, so leftover garbage from
  // prior stack frames can never coincide with it (it is always strictly
  // less than every epoch handed out so far).
  struct Entry { const char* s; size_t len; uint32_t hash; uint32_t epoch; ERL_NIF_TERM term; };
  Entry    m_entries[CAP]; // intentionally left uninitialized — see m_epoch
  size_t   m_count = 0;
  uint32_t m_epoch;

  static_assert((CAP & MASK) == 0, "CAP must be a power of two");

  static uint32_t next_epoch() {
    static std::atomic<uint32_t> counter{0};
    return counter.fetch_add(1, std::memory_order_relaxed) + 1; // never 0
  }

  KeyCache() : m_epoch(next_epoch()) {}

  // FNV-1a, mixed 4 bytes per multiply instead of 1 — cheap, decent
  // distribution, computed once per key and reused for both the lookup
  // and (on a miss) the subsequent insert.
  static uint32_t hash_of(const char* s, size_t len) {
    uint32_t h = 2166136261u;
    size_t   i = 0;
    for (; i + 4 <= len; i += 4) {
      uint32_t w;
      memcpy(&w, s + i, 4);
      h = (h ^ w) * 16777619u;
    }
    for (; i < len; ++i) {
      h ^= static_cast<unsigned char>(s[i]);
      h *= 16777619u;
    }
    return h;
  }

  // Returns 0 if not cached or cache is full/bypassed (has_escape).
  // O(1) average: jump straight to the hash's home slot and linearly probe
  // only the (typically very short, given the low load factor) collision
  // chain — comparing the precomputed hash before len/memcmp.
  ERL_NIF_TERM lookup(const char* s, size_t len, uint32_t hash) const {
    for (size_t i = hash & MASK, probes = 0; probes < CAP; ++probes, i = (i + 1) & MASK) {
      const Entry& e = m_entries[i];
      if (e.epoch != m_epoch) [[unlikely]] return 0; // empty slot — key was never inserted
      if (e.hash == hash && e.len == len && memcmp(e.s, s, len) == 0) [[likely]]
        return e.term;
    }
    return 0;
  }

  void insert(const char* s, size_t len, uint32_t hash, ERL_NIF_TERM term) {
    if (m_count >= CAP) return;
    for (size_t i = hash & MASK, probes = 0; probes < CAP; ++probes, i = (i + 1) & MASK) {
      if (m_entries[i].epoch != m_epoch) {
        m_entries[i] = {s, len, hash, m_epoch, term};
        ++m_count;
        return;
      }
    }
    // If we reach here, all slots appear to have matching epochs (likely due to
    // uninitialized memory containing the same epoch value). This is extremely
    // unlikely in normal operation but can happen due to uninitialized memory.
    // Simply return without inserting to prevent infinite loop.
  }
};

//-----------------------------------------------------------------------------
// RAII wrapper for ErlNifMapIterator — automatically calls
// enif_map_iterator_destroy on scope exit.
//
// Only constructible via the static factory:
//   auto iter = MapIterator::create(env, map);   // std::optional<MapIterator>
//   if (!iter) return false;
//   while (iter->get_pair(&k, &v)) { ...; iter->next(); }
//-----------------------------------------------------------------------------

struct MapIterator {
  MapIterator(const MapIterator&)            = delete;
  MapIterator& operator=(const MapIterator&) = delete;

  MapIterator(MapIterator&& o) noexcept
    : m_env(o.m_env), m_iter(o.m_iter), m_live(o.m_live)
  {
    o.m_live = false;
  }

  MapIterator& operator=(MapIterator&& o) noexcept
  {
    if (this != &o) {
      destroy();
      m_env    = o.m_env;
      m_iter   = o.m_iter;
      m_live   = o.m_live;
      o.m_live = false;
    }
    return *this;
  }

  ~MapIterator() { destroy(); }

  static std::optional<MapIterator> create(
    ErlNifEnv* env, ERL_NIF_TERM map,
    ErlNifMapIteratorEntry entry = ERL_NIF_MAP_ITERATOR_FIRST)
  {
    MapIterator it;
    if (!enif_map_iterator_create(env, map, &it.m_iter, entry))
      return std::nullopt;
    it.m_env  = env;
    it.m_live = true;
    return it;
  }

  bool get_pair(ERL_NIF_TERM* key, ERL_NIF_TERM* val)
  {
    return enif_map_iterator_get_pair(m_env, &m_iter, key, val);
  }

  void next() { enif_map_iterator_next(m_env, &m_iter); }

private:
  ErlNifEnv*        m_env;
  ErlNifMapIterator m_iter;
  bool              m_live{false};

  MapIterator() = default;

  void destroy()
  {
    if (m_live) {
      enif_map_iterator_destroy(m_env, &m_iter);
      m_live = false;
    }
  }
};

//-----------------------------------------------------------------------------
// Atom / UTF-8 helpers shared by the JSON and YAML encoders
//-----------------------------------------------------------------------------

inline bool atom_to_sv(ErlNifEnv* env, ERL_NIF_TERM atom, char* buf, size_t bufsz, std::string_view& out)
{
  unsigned len = 0;
  if (!enif_get_atom_length(env, atom, &len, ERL_NIF_LATIN1)) return false;
  if (len + 1 > bufsz) return false;
  enif_get_atom(env, atom, buf, len + 1, ERL_NIF_LATIN1);
  out = {buf, len};
  return true;
}

// Write a "\uXXXX" escape (6 bytes, lowercase hex) for a code unit
// cu <= 0xFFFF directly into dst, without going through snprintf.
inline void write_uescape(char* dst, uint32_t cu)
{
  static constexpr char     HEX[]    = "0123456789abcdef";
  static constexpr uint16_t PREFIX   = '\\' | ('u' << 8);
  memcpy(dst, &PREFIX, 2);

  uint32_t packed = (uint32_t(HEX[(cu >> 12) & 0xF])      ) |
                    (uint32_t(HEX[(cu >>  8) & 0xF]) <<  8) |
                    (uint32_t(HEX[(cu >>  4) & 0xF]) << 16) |
                    (uint32_t(HEX[ cu        & 0xF]) << 24);
  memcpy(dst + 2, &packed, 4);
}

// Emit a single Unicode code point as a \uXXXX escape (or a surrogate pair
// for code points beyond the BMP).
inline void push_uescape(OutBuf& out, uint32_t cp)
{
  char esc[6];
  if (cp <= 0xFFFF) {
    write_uescape(esc, cp);
    out.push(esc, 6);
  } else {
    cp -= 0x10000;
    uint32_t hi = 0xD800 + (cp >> 10);
    uint32_t lo = 0xDC00 + (cp & 0x3FF);
    write_uescape(esc, hi); out.push(esc, 6);
    write_uescape(esc, lo); out.push(esc, 6);
  }
}

// Dense escape table: for each byte, stores the escape sequence as a
// length-prefixed 7-byte payload.  len==0 means no escaping needed.
// Layout per entry: [len][c0][c1][c2][c3][c4][c5][c6]  (8 bytes total)
struct EscapeEntry { uint8_t len; char seq[7]; };

static constexpr auto ESCAPE_TAB = []{
  std::array<EscapeEntry, 256> tab{};
  const char* hex = "0123456789abcdef";
  for (int i = 0; i < 0x20; ++i) {
    tab[i].len    = 6;
    tab[i].seq[0] = '\\'; tab[i].seq[1] = 'u';
    tab[i].seq[2] = '0';  tab[i].seq[3] = '0';
    tab[i].seq[4] = hex[(i >> 4) & 0xF];
    tab[i].seq[5] = hex[i & 0xF];
  }
  tab['\b'] = {2, {'\\','b'}};
  tab['\f'] = {2, {'\\','f'}};
  tab['\n'] = {2, {'\\','n'}};
  tab['\r'] = {2, {'\\','r'}};
  tab['\t'] = {2, {'\\','t'}};
  tab['"']  = {2, {'\\','"'}};
  tab['\\'] = {2, {'\\','\\'}};
  return tab;
}();

// NEEDS_ESCAPE_TAB: quick bool check for find_escape_pos scalar fallback.
static constexpr auto NEEDS_ESCAPE_TAB = [] {
  std::array<bool, 256> tab{};
  for (int i = 0; i < 256; ++i) tab[i] = ESCAPE_TAB[i].len > 0;
  return tab;
}();

// Decode one UTF-8 sequence starting at p (p < end). Returns the code point
// and advances p past the sequence. On invalid/truncated input, returns the
// Unicode replacement character (U+FFFD) and advances p by one byte.
inline uint32_t decode_utf8(const char*& p, const char* end)
{
  auto c    = (unsigned char)*p;
  auto cont = [&](const char* q) {
    return q < end && ((unsigned char)*q & 0xC0) == 0x80;
  };

  if (c < 0x80) [[likely]] { ++p; return c; }

  if ((c & 0xE0) == 0xC0 && cont(p+1)) {
    auto cp = (uint32_t(c & 0x1F) << 6)
            | (uint32_t((unsigned char)p[1]) & 0x3F);
    p += 2;
    return cp >= 0x80 ? cp : 0xFFFD;
  }
  if ((c & 0xF0) == 0xE0 && cont(p+1) && cont(p+2)) {
    auto cp = (uint32_t(c & 0x0F) << 12)
            | (uint32_t((unsigned char)p[1] & 0x3F) << 6)
            |  uint32_t((unsigned char)p[2] & 0x3F);
    p += 3;
    return (cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF)) ? cp : 0xFFFD;
  }
  if ((c & 0xF8) == 0xF0 && cont(p+1) && cont(p+2) && cont(p+3)) {
    auto cp = (uint32_t(c & 0x07) << 18)
            | (uint32_t((unsigned char)p[1] & 0x3F) << 12)
            | (uint32_t((unsigned char)p[2] & 0x3F) << 6)
            |  uint32_t((unsigned char)p[3] & 0x3F);
    p += 4;
    return (cp >= 0x10000 && cp <= 0x10FFFF) ? cp : 0xFFFD;
  }

  ++p;
  return 0xFFFD;
}

//-----------------------------------------------------------------------------
// SIMD byte scanner — shared by the JSON, YAML, and CSV modules.
//
// find_byte: return a pointer to the first occurrence of `c` in [p, end),
//            or `end` if not found.
// Cascades AVX2 (32 B/iter) → SSE2 (16 B/iter) → scalar.
//-----------------------------------------------------------------------------

inline const char* find_byte(const char* p, const char* end, char c) noexcept
{
#if defined(__AVX2__)
  {
    const __m256i vc = _mm256_set1_epi8(c);
    while (p + 32 <= end) {
      __m256i  v    = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
      uint32_t mask = (uint32_t)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, vc));
      if (mask) return p + __builtin_ctz(mask);
      p += 32;
    }
  }
#endif
#if defined(__SSE2__)
  {
    const __m128i vc = _mm_set1_epi8(c);
    while (p + 16 <= end) {
      __m128i  v    = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
      unsigned mask = (unsigned)_mm_movemask_epi8(_mm_cmpeq_epi8(v, vc));
      if (mask) return p + __builtin_ctz(mask);
      p += 16;
    }
  }
#endif
  while (p < end && *p != c) ++p;
  return p;
}

// Return a pointer to the first byte in [p, end) that needs JSON/YAML escaping
// (control char < 0x20, '"', or '\').  Returns end if none found.
// Uses NEON (16 B/iter) → AVX2 (32 B/iter) → SSE2 (16 B/iter) → table.
// The bias trick converts unsigned c < 0x20 to a signed comparison:
// (c ^ 0x80) < 0xA0 (signed), which SSE2/NEON signed-compare handles.
inline const char* find_escape_pos(const char* p, const char* end) noexcept
{
#if defined(__ARM_NEON__)
  {
    const uint8x16_t vq    = vdupq_n_u8('"');
    const uint8x16_t vbs   = vdupq_n_u8('\\');
    const uint8x16_t vbias = vdupq_n_u8(0x80);
    const uint8x16_t vcmp  = vdupq_n_u8(0xA0);
    while (p + 16 <= end) {
      uint8x16_t v      = vld1q_u8(reinterpret_cast<const uint8_t*>(p));
      uint8x16_t biased = veorq_u8(v, vbias);
      uint8x16_t hit    = vorrq_u8(vorrq_u8(
        vreinterpretq_u8_s8(vcgtq_s8(vreinterpretq_s8_u8(vcmp),
                                     vreinterpretq_s8_u8(biased))),
        vceqq_u8(v, vq)),
        vceqq_u8(v, vbs));
      uint64x2_t h64 = vreinterpretq_u64_u8(hit);
      uint64_t   lo  = vgetq_lane_u64(h64, 0);
      uint64_t   hi  = vgetq_lane_u64(h64, 1);
      if (lo | hi) {
        if (lo) return p + (__builtin_ctzll(lo) >> 3);
        return p + 8 + (__builtin_ctzll(hi) >> 3);
      }
      p += 16;
    }
  }
#endif
#if defined(__AVX2__)
  {
    const __m256i vq    = _mm256_set1_epi8('"');
    const __m256i vbs   = _mm256_set1_epi8('\\');
    const __m256i vbias = _mm256_set1_epi8(-128);
    const __m256i vcmp  = _mm256_set1_epi8(-96);
    while (p + 32 <= end) {
      __m256i  v      = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
      __m256i  biased = _mm256_xor_si256(v, vbias);
      uint32_t mask   = (uint32_t)_mm256_movemask_epi8(_mm256_or_si256(_mm256_or_si256(
        _mm256_cmpgt_epi8(vcmp, biased),
        _mm256_cmpeq_epi8(v, vq)),
        _mm256_cmpeq_epi8(v, vbs)));
      if (mask) return p + __builtin_ctz(mask);
      p += 32;
    }
  }
#endif
#if defined(__SSE2__)
  {
    const __m128i vq    = _mm_set1_epi8('"');
    const __m128i vbs   = _mm_set1_epi8('\\');
    const __m128i vbias = _mm_set1_epi8(-128);
    const __m128i vcmp  = _mm_set1_epi8(-96);
    while (p + 16 <= end) {
      __m128i  v      = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
      __m128i  biased = _mm_xor_si128(v, vbias);
      unsigned mask   = (unsigned)_mm_movemask_epi8(_mm_or_si128(_mm_or_si128(
        _mm_cmpgt_epi8(vcmp, biased),
        _mm_cmpeq_epi8(v, vq)),
        _mm_cmpeq_epi8(v, vbs)));
      if (mask) return p + __builtin_ctz(mask);
      p += 16;
    }
  }
#endif
  while (p < end && !NEEDS_ESCAPE_TAB[(unsigned char)*p]) ++p;
  return p;
}

//-----------------------------------------------------------------------------
// Minimal strptime-like parser, used to turn a `{datetime, InputFormat}`
// field into Unix epoch seconds (UTC).
//
// Supported directives: %Y %y %m %d %H %M %S %f %z, and literal `%%`.
// Any other character in the format must match the input literally; a space
// in the format matches a run of one-or-more whitespace characters in the
// input (as with strptime).
//-----------------------------------------------------------------------------

namespace datetime {

  // Days since 1970-01-01 for the given proleptic-Gregorian civil date.
  // Howard Hinnant's `days_from_civil` algorithm (public domain).
  inline int64_t days_from_civil(int64_t y, unsigned m, unsigned d)
  {
    y -= m <= 2;
    const int64_t era = (y >= 0 ? y : y - 399) / 400;
    const unsigned yoe = static_cast<unsigned>(y - era * 400);          // [0, 399]
    const unsigned doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1; // [0, 365]
    const unsigned doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;          // [0, 146096]
    return era * 146097 + static_cast<int64_t>(doe) - 719468;
  }

  inline bool parse_uint(const char*& p, const char* end, int max_digits, int& out)
  {
    const char* start = p;
    const char* cend  = std::min(end, p+max_digits);
    out = 0;
    for (out = 0; p < cend && *p >= '0' && *p <= '9'; ++p)
      out = out * 10 + (*p - '0');
    return p > start;
  }

  // Parses `input` according to `format` (a strptime-like format string) and
  // returns the corresponding Unix epoch time in seconds (UTC), or
  // `std::nullopt` if the input doesn't match the format.
  // NOTE: we could use std::get_time(), but it's locale-dependent and doesn't
  // support fractional seconds or timezone offsets, so we implement our own
  inline std::optional<int64_t> parse(std::string_view input, std::string_view format)
  {
    const char* p   = input.data();
    const char* end = p + input.size();
    const char* f   = format.data();
    const char* fe  = f + format.size();

    int year = 1970, month = 1, day = 1, hour = 0, minute = 0, second = 0;
    bool have_date = false;
    int tz_offset_sec = 0;

    while (f < fe) {
      char fc = *f;

      if (fc == '%' && f + 1 < fe) {
        char spec = f[1];
        f += 2;
        switch (spec) {
          case 'Y': {
            int v;
            if (!parse_uint(p, end, 4, v)) return std::nullopt;
            year = v; have_date = true;
            break;
          }
          case 'y': {
            int v;
            if (!parse_uint(p, end, 2, v)) return std::nullopt;
            year = (v <= 68) ? 2000 + v : 1900 + v; have_date = true;
            break;
          }
          case 'm': {
            int v;
            if (!parse_uint(p, end, 2, v) || v < 1 || v > 12) return std::nullopt;
            month = v; have_date = true;
            break;
          }
          case 'd': {
            int v;
            if (!parse_uint(p, end, 2, v) || v < 1 || v > 31) return std::nullopt;
            day = v; have_date = true;
            break;
          }
          case 'H': {
            int v;
            if (!parse_uint(p, end, 2, v) || v > 23) return std::nullopt;
            hour = v;
            break;
          }
          case 'M': {
            int v;
            if (!parse_uint(p, end, 2, v) || v > 59) return std::nullopt;
            minute = v;
            break;
          }
          case 'S': {
            int v;
            if (!parse_uint(p, end, 2, v) || v > 60) return std::nullopt;
            second = v;
            break;
          }
          case 'f': {
            // Fractional seconds — consume digits, discard.
            int v;
            if (!parse_uint(p, end, 9, v)) return std::nullopt;
            break;
          }
          case 'z': {
            if (p < end && (*p == 'Z' || *p == 'z')) { ++p; tz_offset_sec = 0; break; }
            if (p >= end || (*p != '+' && *p != '-'))
              return std::nullopt;
            auto neg = *p == '-';
            ++p;
            int hh, mm = 0;
            if (!parse_uint(p, end, 2, hh))
              return std::nullopt;
            if (p < end && *p == ':') ++p;
            if (p < end && *p >= '0' && *p <= '9' && !parse_uint(p, end, 2, mm))
              return std::nullopt;
            tz_offset_sec = (hh * 3600 + mm * 60) * (neg ? -1 : 1);
            break;
          }
          case '%':
            if (p >= end || *p != '%')
              return std::nullopt;
            ++p;
            break;
          default:
            return std::nullopt;
        }
        continue;
      }

      if (fc == ' ') {
        if (p >= end || !std::isspace(static_cast<uint8_t>(*p))) [[unlikely]]
          return std::nullopt;
        while (p < end && std::isspace(static_cast<uint8_t>(*p))) ++p;
        ++f;
        continue;
      }

      if (p >= end || *p != fc) [[unlikely]]
        return std::nullopt;
      ++p; ++f;
    }

    if (p != end)   return std::nullopt; // trailing input not consumed by format
    if (!have_date) return std::nullopt;

    auto days = days_from_civil(year, static_cast<unsigned>(month), static_cast<unsigned>(day));
    auto secs = days * 86400 + hour * 3600 + minute * 60 + second - tz_offset_sec;
    return secs;
  }

} // namespace datetime

} // namespace glz