# lib/kuddle/tokenizer.ex

defmodule Kuddle.Tokenizer do
  @moduledoc """
  Intermediate step that converts a KDL document into a flat list of basic
  tokens that can then be parsed.

  The tokenizer is a small state machine implemented as the clauses of a
  private recursive function. It starts in the `:default` state and switches
  to a dedicated state while it is inside a multiline comment, a double-quoted
  string, a raw string or a bare term.
  """

  @type annotation_token :: {:annotation, String.t()}

  @type open_block_token :: {:open_block, unused :: integer()}

  @type close_block_token :: {:close_block, unused :: integer()}

  @type slashdash_token :: {:slashdash, unused :: integer()}

  @type comment_type :: :c | :c_multiline

  @type comment_token :: {:comment, {comment_type(), String.t()}}

  @type dquote_string_token :: {:dquote_string, String.t()}

  @type raw_string_token :: {:raw_string, String.t()}

  # A run of ASCII spaces and each single Unicode space carry a length; a tab
  # carries the tab character itself.
  @type space_token :: {:space, non_neg_integer() | String.t()}

  @type newline_token :: {:nl, unused :: integer()}

  @type equal_token :: {:=, unused :: integer()}

  @type semicolon_token :: {:sc, unused :: integer()}

  @type fold_token :: {:fold, unused :: integer()}

  @type term_token :: {:term, String.t()}

  @type token ::
          annotation_token()
          | open_block_token()
          | close_block_token()
          | slashdash_token()
          | comment_token()
          | dquote_string_token()
          | raw_string_token()
          | space_token()
          | newline_token()
          | equal_token()
          | semicolon_token()
          | fold_token()
          | term_token()

  @type tokens :: [token()]

  # Single code points (besides the ASCII space and tab) that are emitted as
  # `{:space, 1}`: No-Break Space, Ogham Space Mark, En Quad .. Hair Space,
  # Narrow No-Break Space, Medium Mathematical Space and Ideographic Space.
  @unicode_space_characters [
    0x00A0,
    0x1680,
    0x2000,
    0x2001,
    0x2002,
    0x2003,
    0x2004,
    0x2005,
    0x2006,
    0x2007,
    0x2008,
    0x2009,
    0x200A,
    0x202F,
    0x205F,
    0x3000
  ]

  # Single code points emitted as `{:nl, 1}`: CR, LF, Form Feed, Next-Line,
  # Line Separator and Paragraph Separator. (CRLF is matched separately so the
  # pair yields one token.)
  @newline_characters [?\r, ?\n, ?\f, 0x0085, 0x2028, 0x2029]

  # Characters that can never be part of a bare term.
  @non_identifier_characters [
    ?=,
    ?\n,
    ?\r,
    ?\s,
    ?\\,
    ?<,
    ?>,
    ?{,
    ?},
    ?;,
    ?[,
    ?],
    ?(,
    ?),
    ?,,
    ?"
  ]

  # Everything that ends a bare term: the non-identifier characters plus every
  # character the `:default` state turns into a space or newline token, so that
  # e.g. a tab separates two terms exactly like an ASCII space does.
  @term_terminators Enum.uniq(
                      @non_identifier_characters ++
                        [?\t] ++ @unicode_space_characters ++ @newline_characters
                    )

  # Code points that can be encoded as UTF-8 (excludes the surrogate range).
  defguardp is_unicode_scalar(c) when c in 0x0000..0xD7FF or c in 0xE000..0x10FFFF

  @doc """
  Splits a KDL `blob` into tokens.

  Returns `{:ok, tokens, rest}`. `rest` is `""` when the whole input was
  consumed; otherwise it is the unconsumed remainder, starting at a character
  that cannot begin any token (for example `<`, `>`, `[`, `]`, `)` or `,`).

  Returns `{:error, reason}` when:

    * an annotation is opened with `(` but never closed
      (`:unexpected_annotation`),
    * a raw string, double-quoted string or multiline comment is not
      terminated (`{:unterminated_raw_string, chunks}`,
      `{:unterminated_dquote_string, chunks}`,
      `{:unterminated_multiline_comment, chunks}`, where `chunks` is the
      content collected so far),
    * a unicode escape inside a double-quoted string is malformed
      (`:invalid_unicode_escape`),
    * the input is not valid UTF-8 (`{:invalid_utf8, rest}`).

  ## Examples

      iex> Kuddle.Tokenizer.tokenize("node 1")
      {:ok, [{:term, "node"}, {:space, 1}, {:term, "1"}], ""}

  """
  @spec tokenize(String.t()) ::
          {:ok, tokens(), rest :: String.t()}
          | {:error, term()}
  def tokenize(blob) when is_binary(blob) do
    do_tokenize(blob, :default, nil, [])
  end

  # do_tokenize(blob, state, acc, doc)
  #
  #   blob  - the remaining input
  #   state - :default, {:comment, :c_multiline, depth}, :dquote_string,
  #           {:raw_string, terminator} or :term
  #   acc   - nil in the :default state, otherwise the reversed list of chunks
  #           collected for the token being built
  #   doc   - the reversed list of tokens emitted so far

  # End of input outside of any token: done.
  defp do_tokenize(<<>>, :default, nil, doc) do
    {:ok, Enum.reverse(doc), ""}
  end

  # Annotation: everything between "(" and the next ")".
  defp do_tokenize(<<"(", rest::binary>>, :default, nil, doc) do
    case String.split(rest, ")", parts: 2) do
      [annotation, rest] ->
        do_tokenize(rest, :default, nil, [{:annotation, annotation} | doc])

      [_annotation] ->
        {:error, :unexpected_annotation}
    end
  end

  defp do_tokenize(<<"{", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :default, nil, [{:open_block, 0} | doc])
  end

  defp do_tokenize(<<"}", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :default, nil, [{:close_block, 0} | doc])
  end

  defp do_tokenize(<<"/-", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :default, nil, [{:slashdash, 0} | doc])
  end

  # Multiline comments. They nest: `depth` counts the "/*" openers seen inside
  # the outermost comment, and inner delimiters are kept in the comment text.
  defp do_tokenize(<<"/*", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, {:comment, :c_multiline, 0}, [], doc)
  end

  defp do_tokenize(<<"/*", rest::binary>>, {:comment, :c_multiline, depth}, acc, doc) do
    do_tokenize(rest, {:comment, :c_multiline, depth + 1}, ["/*" | acc], doc)
  end

  defp do_tokenize(<<"*/", rest::binary>>, {:comment, :c_multiline, 0}, acc, doc) do
    comment = IO.iodata_to_binary(Enum.reverse(acc))
    do_tokenize(rest, :default, nil, [{:comment, {:c_multiline, comment}} | doc])
  end

  defp do_tokenize(<<"*/", rest::binary>>, {:comment, :c_multiline, depth}, acc, doc) do
    do_tokenize(rest, {:comment, :c_multiline, depth - 1}, ["*/" | acc], doc)
  end

  # Input ended while a multiline comment was still open.
  defp do_tokenize(<<>>, {:comment, :c_multiline, _depth}, acc, _doc) do
    {:error, {:unterminated_multiline_comment, Enum.reverse(acc)}}
  end

  defp do_tokenize(<<c::utf8, rest::binary>>, {:comment, :c_multiline, _} = s, acc, doc) do
    do_tokenize(rest, s, [<<c::utf8>> | acc], doc)
  end

  # Single-line comment: runs up to the next "\n" (or the end of input). The
  # "\n" itself is consumed and no newline token is emitted for it.
  defp do_tokenize(<<"//", rest::binary>>, :default, nil, doc) do
    {comment, rest} =
      case String.split(rest, "\n", parts: 2) do
        [comment, rest] ->
          {comment, rest}

        [comment] ->
          {comment, ""}
      end

    do_tokenize(rest, :default, nil, [{:comment, {:c, comment}} | doc])
  end

  # Double-quoted string: opening quote.
  defp do_tokenize(<<"\"", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :dquote_string, [], doc)
  end

  # Double-quoted string: closing quote (escaped quotes are handled below and
  # always start with a backslash, so they never reach this clause).
  defp do_tokenize(<<"\"", rest::binary>>, :dquote_string, acc, doc) do
    string = IO.iodata_to_binary(Enum.reverse(acc))
    do_tokenize(rest, :default, nil, [{:dquote_string, string} | doc])
  end

  # Input ended before the closing quote.
  defp do_tokenize(<<>>, :dquote_string, acc, _doc) do
    {:error, {:unterminated_dquote_string, Enum.reverse(acc)}}
  end

  # Unicode escape: backslash, "u{", hexadecimal digits, "}".
  defp do_tokenize(<<"\\u{", rest::binary>>, :dquote_string, acc, doc) do
    case decode_unicode_escape(rest) do
      {:ok, char, rest} ->
        do_tokenize(rest, :dquote_string, [char | acc], doc)

      {:error, _reason} = error ->
        error
    end
  end

  defp do_tokenize(<<"\\\"", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["\"" | acc], doc)
  end

  defp do_tokenize(<<"\\r", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["\r" | acc], doc)
  end

  defp do_tokenize(<<"\\n", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["\n" | acc], doc)
  end

  defp do_tokenize(<<"\\b", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["\b" | acc], doc)
  end

  defp do_tokenize(<<"\\f", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["\f" | acc], doc)
  end

  defp do_tokenize(<<"\\s", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["\s" | acc], doc)
  end

  defp do_tokenize(<<"\\t", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["\t" | acc], doc)
  end

  defp do_tokenize(<<"\\\\", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["\\" | acc], doc)
  end

  defp do_tokenize(<<"\\/", rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, ["/" | acc], doc)
  end

  # Any other character (including a backslash that starts no known escape)
  # is copied into the string verbatim.
  defp do_tokenize(<<c::utf8, rest::binary>>, :dquote_string, acc, doc) do
    do_tokenize(rest, :dquote_string, [<<c::utf8>> | acc], doc)
  end

  # Raw string without hashes: terminated by the next double quote.
  defp do_tokenize(<<"r\"", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, {:raw_string, "\""}, [], doc)
  end

  # Raw string with hashes: terminated by a double quote followed by the same
  # number of hashes that preceded the opening quote.
  defp do_tokenize(<<"r#", rest::binary>>, :default, nil, doc) do
    trimmed = String.trim_leading(rest, "#")

    case trimmed do
      <<"\"", body::binary>> ->
        # +1 accounts for the "#" consumed by the function head.
        hash_count = byte_size(rest) - byte_size(trimmed) + 1
        terminator = "\"" <> String.duplicate("#", hash_count)
        do_tokenize(body, {:raw_string, terminator}, [], doc)

      _ ->
        # The hashes are not followed by a quote, so this is not a raw string;
        # tokenize it as a bare term that happens to start with "r#".
        do_tokenize(rest, :term, ["r#"], doc)
    end
  end

  defp do_tokenize(<<>>, {:raw_string, _terminator}, acc, _doc) do
    {:error, {:unterminated_raw_string, Enum.reverse(acc)}}
  end

  defp do_tokenize(<<"\"", rest::binary>> = str, {:raw_string, terminator} = state, acc, doc) do
    size = byte_size(terminator)

    case str do
      # Strip the terminator exactly once, so that whatever follows the raw
      # string (even another quote or hash) is left for the next token.
      <<head::binary-size(size), rest::binary>> when head == terminator ->
        string = IO.iodata_to_binary(Enum.reverse(acc))
        do_tokenize(rest, :default, nil, [{:raw_string, string} | doc])

      _ ->
        do_tokenize(rest, state, ["\"" | acc], doc)
    end
  end

  defp do_tokenize(<<c::utf8, rest::binary>>, {:raw_string, _} = state, acc, doc) do
    do_tokenize(rest, state, [<<c::utf8>> | acc], doc)
  end

  # A run of ASCII spaces collapses into one token carrying the run length.
  defp do_tokenize(<<"\s", rest::binary>>, :default, nil, doc) do
    len = byte_size(rest)
    rest = String.trim_leading(rest, "\s")
    len = len - byte_size(rest) + 1
    do_tokenize(rest, :default, nil, [{:space, len} | doc])
  end

  defp do_tokenize(<<"\t", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :default, nil, [{:space, "\t"} | doc])
  end

  # Every other Unicode space is emitted one character at a time.
  defp do_tokenize(<<c::utf8, rest::binary>>, :default, nil, doc)
       when c in @unicode_space_characters do
    do_tokenize(rest, :default, nil, [{:space, 1} | doc])
  end

  # CRLF must be matched before the lone CR so the pair yields one newline.
  defp do_tokenize(<<"\r\n", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :default, nil, [{:nl, 1} | doc])
  end

  defp do_tokenize(<<c::utf8, rest::binary>>, :default, nil, doc)
       when c in @newline_characters do
    do_tokenize(rest, :default, nil, [{:nl, 1} | doc])
  end

  defp do_tokenize(<<"=", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :default, nil, [{:=, 0} | doc])
  end

  defp do_tokenize(<<";", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :default, nil, [{:sc, 0} | doc])
  end

  defp do_tokenize(<<"\\", rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :default, nil, [{:fold, 0} | doc])
  end

  # A non-identifier character that no clause above consumed cannot start a
  # token: stop here and hand the remainder back to the caller.
  defp do_tokenize(<<c::utf8, _rest::binary>> = rest, :default, nil, doc)
       when c in @non_identifier_characters do
    {:ok, Enum.reverse(doc), rest}
  end

  # Anything else starts a bare term.
  defp do_tokenize(<<c::utf8, rest::binary>>, :default, nil, doc) do
    do_tokenize(rest, :term, [<<c::utf8>>], doc)
  end

  # End of input ends the term; the :default state then finishes the document.
  defp do_tokenize(<<>> = rest, :term, acc, doc) do
    value = IO.iodata_to_binary(Enum.reverse(acc))
    do_tokenize(rest, :default, nil, [{:term, value} | doc])
  end

  # A terminator ends the term without being consumed, so the :default state
  # decides what token (if any) it produces.
  defp do_tokenize(<<c::utf8, _rest::binary>> = rest, :term, acc, doc)
       when c in @term_terminators or
              c >= 0x10FFFF do
    value = IO.iodata_to_binary(Enum.reverse(acc))
    do_tokenize(rest, :default, nil, [{:term, value} | doc])
  end

  defp do_tokenize(<<c::utf8, rest::binary>>, :term, acc, doc) do
    do_tokenize(rest, :term, [<<c::utf8>> | acc], doc)
  end

  # Only reachable when the next bytes are not a valid UTF-8 sequence: every
  # state above handles both the empty input and any valid code point.
  defp do_tokenize(rest, _state, _acc, _doc) do
    {:error, {:invalid_utf8, rest}}
  end

  # Decodes the remainder of a unicode escape, i.e. the input right after the
  # opening "u{". Returns the UTF-8 encoded character and the input following
  # the closing "}", or an error when the brace is missing, the digits are not
  # hexadecimal, or the value is not an encodable code point.
  defp decode_unicode_escape(blob) do
    with [hex, rest] <- String.split(blob, "}", parts: 2),
         {codepoint, ""} when is_unicode_scalar(codepoint) <- Integer.parse(hex, 16) do
      {:ok, <<codepoint::utf8>>, rest}
    else
      _ -> {:error, :invalid_unicode_escape}
    end
  end
end