lib/fastest_tiktoken.ex

defmodule FastestTiktoken do
  @moduledoc """
  Fast Elixir bindings for the pure-Rust `tiktoken` crate.

  `FastestTiktoken` provides OpenAI-compatible tokenization from Elixir while
  keeping runtime calls explicit and safe: every operation returns
  `{:ok, value}` or `{:error, reason}`.

  All tokenization functions require a selector through either `:model` or
  `:encoding`.

      iex> FastestTiktoken.count_tokens("hello world", model: "gpt-4o")
      {:ok, 2}

      iex> FastestTiktoken.encode("hello world", encoding: :cl100k_base)
      {:ok, [15339, 1917]}

  ## Parity

  The public behavior is parity-tested against official OpenAI `tiktoken`
  `0.13.0` for the OpenAI encodings and API surfaces exposed here. That
  includes model mapping, GPT-2/r50k fixtures, regex edge cases, roundtrips,
  special-token behavior, `o200k_harmony`, large inputs, and batch helpers.

  Under the hood, this library wraps the high-performance pure-Rust
  [`tiktoken`](https://crates.io/crates/tiktoken) crate rather than older
  wrappers around `tiktoken-rs`.
  """

  alias FastestTiktoken.Native

  @encoding_aliases %{
    "gpt2" => "r50k_base"
  }

  @official_model_to_encoding %{
    "o1" => "o200k_base",
    "o3" => "o200k_base",
    "o4-mini" => "o200k_base",
    "gpt-5" => "o200k_base",
    "gpt-4.1" => "o200k_base",
    "gpt-4o" => "o200k_base",
    "gpt-4" => "cl100k_base",
    "gpt-3.5-turbo" => "cl100k_base",
    "gpt-3.5" => "cl100k_base",
    "gpt-35-turbo" => "cl100k_base",
    "davinci-002" => "cl100k_base",
    "babbage-002" => "cl100k_base",
    "text-embedding-ada-002" => "cl100k_base",
    "text-embedding-3-small" => "cl100k_base",
    "text-embedding-3-large" => "cl100k_base",
    "text-davinci-003" => "p50k_base",
    "text-davinci-002" => "p50k_base",
    "text-davinci-001" => "r50k_base",
    "text-curie-001" => "r50k_base",
    "text-babbage-001" => "r50k_base",
    "text-ada-001" => "r50k_base",
    "davinci" => "r50k_base",
    "curie" => "r50k_base",
    "babbage" => "r50k_base",
    "ada" => "r50k_base",
    "code-davinci-002" => "p50k_base",
    "code-davinci-001" => "p50k_base",
    "code-cushman-002" => "p50k_base",
    "code-cushman-001" => "p50k_base",
    "davinci-codex" => "p50k_base",
    "cushman-codex" => "p50k_base",
    "text-davinci-edit-001" => "p50k_edit",
    "code-davinci-edit-001" => "p50k_edit",
    "text-similarity-davinci-001" => "r50k_base",
    "text-similarity-curie-001" => "r50k_base",
    "text-similarity-babbage-001" => "r50k_base",
    "text-similarity-ada-001" => "r50k_base",
    "text-search-davinci-doc-001" => "r50k_base",
    "text-search-curie-doc-001" => "r50k_base",
    "text-search-babbage-doc-001" => "r50k_base",
    "text-search-ada-doc-001" => "r50k_base",
    "code-search-babbage-code-001" => "r50k_base",
    "code-search-ada-code-001" => "r50k_base",
    "gpt2" => "gpt2",
    "gpt-2" => "gpt2"
  }

  @official_model_prefix_to_encoding [
    {"o1-", "o200k_base"},
    {"o3-", "o200k_base"},
    {"o4-mini-", "o200k_base"},
    {"gpt-5-", "o200k_base"},
    {"gpt-4.5-", "o200k_base"},
    {"gpt-4.1-", "o200k_base"},
    {"chatgpt-4o-", "o200k_base"},
    {"gpt-4o-", "o200k_base"},
    {"gpt-4-", "cl100k_base"},
    {"gpt-3.5-turbo-", "cl100k_base"},
    {"gpt-35-turbo-", "cl100k_base"},
    {"gpt-oss-", "o200k_harmony"},
    {"ft:gpt-4o", "o200k_base"},
    {"ft:gpt-4", "cl100k_base"},
    {"ft:gpt-3.5-turbo", "cl100k_base"},
    {"ft:davinci-002", "cl100k_base"},
    {"ft:babbage-002", "cl100k_base"}
  ]

  @type encoding :: atom() | String.t()
  @type allowed_special :: :all | [String.t()]
  @type selector_opts :: keyword() | map()
  @type reason ::
          :ambiguous_selector
          | :invalid_batch
          | :invalid_allowed_special
          | :invalid_model
          | :invalid_opts
          | :invalid_text
          | :invalid_token_ids
          | :missing_selector
          | {:decode_failed, String.t()}
          | {:native_error, String.t()}
          | {:unsupported_encoding, String.t()}
          | {:unsupported_model, String.t()}

  @doc """
  Returns all encoding names compiled into the native tokenizer.

  The list includes `gpt2` as an OpenAI-compatible alias for `r50k_base`.
  """
  @spec list_encodings() :: {:ok, [String.t()]} | {:error, reason()}
  def list_encodings do
    with {:ok, encodings} <- Native.list_encodings() do
      {:ok, encodings |> Kernel.++(Map.keys(@encoding_aliases)) |> Enum.uniq() |> Enum.sort()}
    end
  end

  @doc """
  Resolves a model name to the encoding used by the native tokenizer.

  This mirrors the official OpenAI `tiktoken` model mapping for supported
  encodings, including common versioned model prefixes.

      iex> FastestTiktoken.encoding_for_model("gpt-4o-2024-05-13")
      {:ok, "o200k_base"}

      iex> FastestTiktoken.encoding_for_model("text-davinci-003")
      {:ok, "p50k_base"}

      iex> FastestTiktoken.encoding_for_model("gpt-oss-120b")
      {:ok, "o200k_harmony"}
  """
  @spec encoding_for_model(String.t()) :: {:ok, String.t()} | {:error, reason()}
  def encoding_for_model(model) when is_binary(model) and model != "" do
    case official_encoding_name_for_model(model) do
      {:ok, encoding} ->
        if supported_encoding_name?(encoding) do
          {:ok, encoding}
        else
          {:error, {:unsupported_encoding, encoding}}
        end

      :error ->
        case Native.encoding_for_model(model) do
          {:ok, encoding} -> {:ok, encoding}
          {:error, _reason} -> {:error, {:unsupported_model, model}}
        end
    end
  end

  def encoding_for_model(_model), do: {:error, :invalid_model}

  @doc """
  Encodes text into token ids.

      iex> FastestTiktoken.encode("hello world", model: "gpt-4o")
      {:ok, [24912, 2375]}

      iex> FastestTiktoken.encode("hello world", encoding: :cl100k_base)
      {:ok, [15339, 1917]}
  """
  @spec encode(String.t(), selector_opts()) :: {:ok, [non_neg_integer()]} | {:error, reason()}
  def encode(text, opts) when is_binary(text) do
    with {:ok, encoding, mode, allowed_special} <- normalize_call_opts(opts) do
      map_native_error(Native.encode(encoding, text, mode, allowed_special))
    end
  end

  def encode(_text, _opts), do: {:error, :invalid_text}

  @doc """
  Encodes text while treating special token strings as ordinary text.

  This matches the official `encode_ordinary` behavior.

      iex> FastestTiktoken.encode_ordinary("hello <|endoftext|>", encoding: :cl100k_base)
      {:ok, [15339, 83739, 8862, 728, 428, 91, 29]}
  """
  @spec encode_ordinary(String.t(), selector_opts()) ::
          {:ok, [non_neg_integer()]} | {:error, reason()}
  def encode_ordinary(text, opts) when is_binary(text) do
    opts
    |> put_allowed_special([])
    |> then(&encode(text, &1))
  end

  def encode_ordinary(_text, _opts), do: {:error, :invalid_text}

  @doc """
  Decodes token ids into text using the selected encoding.

      iex> FastestTiktoken.decode([24912, 2375], model: "gpt-4o")
      {:ok, "hello world"}
  """
  @spec decode([non_neg_integer()], selector_opts()) :: {:ok, String.t()} | {:error, reason()}
  def decode(ids, opts) do
    with {:ok, ids} <- normalize_token_ids(ids),
         {:ok, encoding} <- resolve_encoding(opts) do
      case Native.decode(encoding, ids) do
        {:ok, text} -> {:ok, text}
        {:error, reason} -> {:error, {:decode_failed, reason}}
      end
    end
  end

  @doc """
  Counts tokens for text using the selected encoding.

  With `allowed_special: []`, this uses the native crate's zero-allocation count path.

      iex> FastestTiktoken.count_tokens("表情符号是\\n🦜🔗", model: "gpt-4o")
      {:ok, 11}
  """
  @spec count_tokens(String.t(), selector_opts()) ::
          {:ok, non_neg_integer()} | {:error, reason()}
  def count_tokens(text, opts) when is_binary(text) do
    with {:ok, encoding, mode, allowed_special} <- normalize_call_opts(opts) do
      map_native_error(Native.count_tokens(encoding, text, mode, allowed_special))
    end
  end

  def count_tokens(_text, _opts), do: {:error, :invalid_text}

  @doc """
  Encodes text and decodes each token id back into a token piece.

  Some valid token ids do not decode to valid UTF-8 in isolation. In that case,
  this function returns `{:error, {:decode_failed, reason}}`.

      iex> FastestTiktoken.split_tokens("hello world", model: "gpt-4o")
      {:ok, ["hello", " world"]}
  """
  @spec split_tokens(String.t(), selector_opts()) :: {:ok, [String.t()]} | {:error, reason()}
  def split_tokens(text, opts) when is_binary(text) do
    with {:ok, encoding, mode, allowed_special} <- normalize_call_opts(opts) do
      case Native.split_tokens(encoding, text, mode, allowed_special) do
        {:ok, pieces} -> {:ok, pieces}
        {:error, reason} -> {:error, {:decode_failed, reason}}
      end
    end
  end

  def split_tokens(_text, _opts), do: {:error, :invalid_text}

  @doc """
  Encodes a batch of texts with the same selector options.

      iex> FastestTiktoken.encode_batch(["hello world"], encoding: :cl100k_base)
      {:ok, [[15339, 1917]]}
  """
  @spec encode_batch([String.t()], selector_opts()) ::
          {:ok, [[non_neg_integer()]]} | {:error, reason()}
  def encode_batch(texts, opts) when is_list(texts) do
    map_batch(texts, &encode(&1, opts))
  end

  def encode_batch(_texts, _opts), do: {:error, :invalid_batch}

  @doc """
  Encodes a batch of texts while treating special token strings as ordinary text.
  """
  @spec encode_ordinary_batch([String.t()], selector_opts()) ::
          {:ok, [[non_neg_integer()]]} | {:error, reason()}
  def encode_ordinary_batch(texts, opts) when is_list(texts) do
    map_batch(texts, &encode_ordinary(&1, opts))
  end

  def encode_ordinary_batch(_texts, _opts), do: {:error, :invalid_batch}

  @doc """
  Decodes a batch of token-id lists with the same selector options.

      iex> FastestTiktoken.decode_batch([[15339, 1917]], encoding: :cl100k_base)
      {:ok, ["hello world"]}
  """
  @spec decode_batch([[non_neg_integer()]], selector_opts()) ::
          {:ok, [String.t()]} | {:error, reason()}
  def decode_batch(batch, opts) when is_list(batch) do
    map_batch(batch, &decode(&1, opts))
  end

  def decode_batch(_batch, _opts), do: {:error, :invalid_batch}

  defp normalize_call_opts(opts) do
    with {:ok, opts} <- normalize_opts(opts),
         {:ok, encoding} <- resolve_encoding_from_opts(opts),
         {:ok, mode, allowed_special} <- resolve_allowed_special_from_opts(opts) do
      {:ok, encoding, mode, allowed_special}
    end
  end

  defp resolve_encoding(opts) do
    with {:ok, opts} <- normalize_opts(opts) do
      resolve_encoding_from_opts(opts)
    end
  end

  defp resolve_encoding_from_opts(opts) do
    model = get_opt(opts, :model)
    encoding = get_opt(opts, :encoding)

    case {model, encoding} do
      {nil, nil} ->
        {:error, :missing_selector}

      {model, nil} when is_binary(model) and model != "" ->
        with {:ok, encoding} <- encoding_for_model(model) do
          resolve_encoding_name(encoding)
        end

      {model, nil} when is_atom(model) ->
        with {:ok, encoding} <- model |> Atom.to_string() |> encoding_for_model() do
          resolve_encoding_name(encoding)
        end

      {nil, encoding} ->
        resolve_encoding_name(encoding)

      {_model, _encoding} ->
        {:error, :ambiguous_selector}
    end
  end

  defp resolve_encoding_name(encoding) when is_atom(encoding),
    do: encoding |> Atom.to_string() |> resolve_encoding_name()

  defp resolve_encoding_name("gpt2"), do: {:ok, "r50k_base"}

  defp resolve_encoding_name(encoding) when is_binary(encoding) and encoding != "" do
    if Native.encoding_exists(encoding) do
      {:ok, encoding}
    else
      {:error, {:unsupported_encoding, encoding}}
    end
  end

  defp resolve_encoding_name(_encoding), do: {:error, {:unsupported_encoding, ""}}

  defp resolve_allowed_special_from_opts(opts) do
    case get_opt(opts, :allowed_special, []) do
      :all -> {:ok, 1, []}
      [] -> {:ok, 0, []}
      allowed when is_list(allowed) -> normalize_allowed_special_list(allowed)
      _other -> {:error, :invalid_allowed_special}
    end
  end

  defp normalize_allowed_special_list(allowed) do
    if Enum.all?(allowed, &is_binary/1) do
      {:ok, 2, allowed}
    else
      {:error, :invalid_allowed_special}
    end
  end

  defp normalize_token_ids(ids) when is_list(ids) do
    if Enum.all?(ids, &valid_token_id?/1) do
      {:ok, ids}
    else
      {:error, :invalid_token_ids}
    end
  end

  defp normalize_token_ids(_ids), do: {:error, :invalid_token_ids}

  defp valid_token_id?(id), do: is_integer(id) and id >= 0 and id <= 4_294_967_295

  defp normalize_opts(opts) when is_list(opts) do
    if Keyword.keyword?(opts), do: {:ok, opts}, else: {:error, :invalid_opts}
  end

  defp normalize_opts(%{} = opts), do: {:ok, opts}
  defp normalize_opts(_opts), do: {:error, :invalid_opts}

  defp get_opt(opts, key, default \\ nil)
  defp get_opt(opts, key, default) when is_list(opts), do: Keyword.get(opts, key, default)
  defp get_opt(opts, key, default) when is_map(opts), do: Map.get(opts, key, default)

  defp put_allowed_special(opts, value) when is_list(opts),
    do: Keyword.put(opts, :allowed_special, value)

  defp put_allowed_special(%{} = opts, value), do: Map.put(opts, :allowed_special, value)
  defp put_allowed_special(opts, _value), do: opts

  defp map_native_error({:ok, value}), do: {:ok, value}
  defp map_native_error({:error, reason}), do: {:error, {:native_error, reason}}

  defp map_batch(values, fun) do
    Enum.reduce_while(values, {:ok, []}, fn value, {:ok, acc} ->
      case fun.(value) do
        {:ok, mapped} -> {:cont, {:ok, [mapped | acc]}}
        {:error, reason} -> {:halt, {:error, reason}}
      end
    end)
    |> case do
      {:ok, values} -> {:ok, Enum.reverse(values)}
      {:error, reason} -> {:error, reason}
    end
  end

  defp official_encoding_name_for_model(model) do
    case Map.fetch(@official_model_to_encoding, model) do
      {:ok, encoding} ->
        {:ok, encoding}

      :error ->
        @official_model_prefix_to_encoding
        |> Enum.find_value(:error, fn {prefix, encoding} ->
          if String.starts_with?(model, prefix), do: {:ok, encoding}
        end)
    end
  end

  defp supported_encoding_name?("gpt2"), do: true

  defp supported_encoding_name?(encoding) when is_binary(encoding),
    do: Native.encoding_exists(encoding)
end