lib/kuddle/decoder.ex

defmodule Kuddle.Decoder do
  @moduledoc """
  Tokenizes and parses KDL documents into kuddle documents.
  """
  alias Kuddle.Value
  alias Kuddle.Node

  import Kuddle.Tokenizer
  import Kuddle.Utils

  @typedoc """
  Parsed tokens from the Tokenizer, these will be processed and converted into the final nodes for
  the document.
  """
  @type tokens :: Kuddle.Tokenizer.tokens()

  @typedoc """
  A single node in the Kuddle document
  """
  @type document_node :: Node.t()

  @typedoc """
  A kuddle document is a list of Kuddle Nodes
  """
  @type document :: [document_node()]

  @doc """
  Tokenize and parse a given KDL document.

  If successful, it will return `{:ok, document, tokens}`, where document is the list of nodes that
  were parsed and tokens are any unparsed tokens.
  """
  @spec decode(String.t()) ::
          {:ok, document(), tokens()}
          | {:error, term()}
  def decode(blob) when is_binary(blob) do
    case tokenize(blob) do
      {:ok, tokens, ""} ->
        decode(tokens)

      {:error, _} = err ->
        err
    end
  end

  def decode(tokens) when is_list(tokens) do
    parse(tokens, {:default, 0}, [], [])
  end

  defp parse([], {:default, 0}, [], doc) do
    handle_parse_exit([], doc)
  end

  defp parse([{:annotation, _value} = annotation | tokens], {:default, _} = state, acc, doc) do
    parse(tokens, state, [annotation | acc], doc)
  end

  defp parse([{:slashdash, _} | tokens], {:default, _} = state, acc, doc) do
    # add the slashdash to the document accumulator
    # when the next parse is done, the slashdash will cause the next item in the accumulator to
    # be dropped
    parse(tokens, state, acc, [:slashdash | doc])
  end

  defp parse([{:comment, _} | tokens], {:default, _} = state, acc, doc) do
    parse(tokens, state, acc, doc)
  end

  defp parse([{:fold, _} | tokens], {:default, _} = state, acc, doc) do
    parse(fold_leading_tokens(tokens), state, acc, doc)
  end

  defp parse([{:sc, _} | tokens], {:default, _} = state, acc, doc) do
    # loose semi-colon
    parse(tokens, state, acc, doc)
  end

  defp parse([{:nl, _} | tokens], {:default, _} = state, acc, doc) do
    # trim leading newlines
    parse(tokens, state, acc, doc)
  end

  defp parse([{:space, _} | tokens], {:default, _} = state, acc, doc) do
    # trim leading space
    parse(tokens, state, acc, doc)
  end

  defp parse([{:term, name} | tokens], {:default, depth}, acc, doc) do
    # node
    annotations = extract_annotations(acc)
    parse(tokens, {:node, depth}, {name, annotations, []}, doc)
  end

  defp parse([{:dquote_string, name} | tokens], {:default, depth}, acc, doc) do
    # double quote initiated node
    annotations = extract_annotations(acc)
    parse(tokens, {:node, depth}, {name, annotations, []}, doc)
  end

  defp parse([{:raw_string, name} | tokens], {:default, depth}, acc, doc) do
    # raw string node
    annotations = extract_annotations(acc)
    parse(tokens, {:node, depth}, {name, annotations, []}, doc)
  end

  defp parse([{:slashdash, _} | tokens], {:node, _} = state, {name, annotations, attrs}, doc) do
    parse(tokens, state, {name, annotations, [:slashdash | attrs]}, doc)
  end

  defp parse([{:comment, _} | tokens], {:node, _} = state, acc, doc) do
    # trim comments
    parse(tokens, state, acc, doc)
  end

  defp parse([{:space, _} | tokens], {:node, _} = state, acc, doc) do
    # trim leading spaces in node
    parse(tokens, state, acc, doc)
  end

  defp parse([{:fold, _} | tokens], {:node, _} = state, acc, doc) do
    parse(fold_leading_tokens(tokens), state, acc, doc)
  end

  defp parse([{token_type, _} | tokens], {:node, depth}, {name, node_annotations, attrs}, doc) when token_type in [:nl, :sc] do
    node = %Node{
      name: name,
      annotations: node_annotations,
      attributes: resolve_node_attributes(attrs),
      children: nil,
    }
    parse(tokens, {:default, depth}, [], [node | doc])
  end

  defp parse([{:open_block, _} | tokens], {:node, depth}, {name, node_annotations, attrs}, doc) do
    case parse(tokens, {:default, depth + 1}, [], []) do
      {:ok, children, tokens} ->
        case trim_leading_space(tokens) do
          [{:close_block, _} | tokens] ->
            node =
              case attrs do
                [:slashdash | attrs] ->
                  # discard the children
                  %Node{
                    name: name,
                    annotations: node_annotations,
                    attributes: resolve_node_attributes(attrs),
                    children: nil,
                  }

                attrs ->
                  %Node{
                    name: name,
                    annotations: node_annotations,
                    attributes: resolve_node_attributes(attrs),
                    children: children,
                  }
              end

            parse(tokens, {:default, depth}, [], [node | doc])
        end

      {:error, _} = err ->
        err
    end
  end

  defp parse([{:annotation, _} = annotation | tokens], {:node, _} = state, {name, node_annotations, attrs}, doc) do
    attrs = [annotation | attrs]
    parse(tokens, state, {name, node_annotations, attrs}, doc)
  end

  defp parse([token | tokens], {:node, _} = state, {name, node_annotations, attrs}, doc) do
    case token_to_value(token) do
      {:ok, %Value{} = key} ->
        {key_annotations, attrs} =
          case attrs do
            [{:annotation, annotation} | attrs] ->
              {[annotation], attrs}

            attrs ->
              {[], attrs}
          end

        key = %{key | annotations: key.annotations ++ key_annotations}

        case trim_leading_space(tokens) do
          [{:=, _} | tokens] ->
            tokens = trim_leading_space(tokens)
            {value_annotations, tokens} =
              case tokens do
                [{:annotation, annotation} | tokens] ->
                  {[annotation], tokens}

                tokens ->
                  {[], tokens}
              end

            [token | tokens] = tokens
            case token_to_value(token) do
              {:ok, %Value{} = value} ->
                value = %{value | annotations: value.annotations ++ value_annotations}
                parse(tokens, state, {name, node_annotations, [{key, value} | attrs]}, doc)

              {:error, _} = err ->
                err
            end

          tokens ->
            case key do
              %{type: :id} ->
                {:error, {:bare_identifier, key}}

              _ ->
                parse(tokens, state, {name, node_annotations, [key | attrs]}, doc)
            end
        end

      {:error, _} = err ->
        err
    end
  end

  defp parse([], {:node, depth}, {name, node_annotations, attrs}, doc) do
    node = %Node{
      name: name,
      annotations: node_annotations,
      attributes: resolve_node_attributes(attrs),
      children: nil,
    }
    parse([], {:default, depth}, [], [node | doc])
  end

  defp parse([{:close_block, _} | _tokens] = tokens, {:default, _depth}, [], doc) do
    handle_parse_exit(tokens, doc)
  end

  defp extract_annotations(items, acc \\ [])

  defp extract_annotations([], acc) do
    Enum.reverse(acc)
  end

  defp extract_annotations([{:annotation, value} | rest], acc) do
    extract_annotations(rest, [value | acc])
  end

  defp extract_annotations([_ | rest], acc) do
    extract_annotations(rest, acc)
  end

  defp handle_parse_exit(rest, doc) do
    doc = Enum.reverse(doc)

    {:ok, handle_slashdashes(doc, []), rest}
  end

  defp resolve_node_attributes(acc) do
    acc
    |> Enum.reverse()
    |> handle_slashdashes([])
    |> Enum.reduce([], fn
      {key, value}, acc ->
        # deduplicate attributes
        acc =
          Enum.reject(acc, fn
            {key2, _value} -> key2.value == key.value
            _ -> false
          end)

        [{key, value} | acc]

      value, acc ->
        [value | acc]
    end)
    |> Enum.reverse()
  end

  defp fold_leading_tokens([{:space, _} | tokens]) do
    fold_leading_tokens(tokens)
  end

  defp fold_leading_tokens([{:nl, _} | tokens]) do
    fold_leading_tokens(tokens)
  end

  defp fold_leading_tokens(tokens) do
    tokens
  end

  defp trim_leading_space([{:space, _} | tokens]) do
    trim_leading_space(tokens)
  end

  defp trim_leading_space(tokens) do
    tokens
  end

  defp token_to_value({:term, value}) do
    decode_term(value)
  end

  defp token_to_value({:dquote_string, value}) do
    {:ok, %Value{value: value, type: :string}}
  end

  defp token_to_value({:raw_string, value}) do
    {:ok, %Value{value: value, type: :string}}
  end

  defp decode_term("true") do
    {:ok, %Value{value: true, type: :boolean}}
  end

  defp decode_term("false") do
    {:ok, %Value{value: false, type: :boolean}}
  end

  defp decode_term("null") do
    {:ok, %Value{type: :null, value: nil}}
  end

  defp decode_term(<<"0b", rest::binary>>) do
    decode_bin_integer(rest)
  end

  defp decode_term(<<"0o", rest::binary>>) do
    decode_oct_integer(rest)
  end

  defp decode_term(<<"0x", rest::binary>>) do
    decode_hex_integer(rest)
  end

  defp decode_term("") do
    {:error, :no_term}
  end

  defp decode_term(term) do
    case decode_dec_integer(term) do
      {:ok, value} ->
        {:ok, value}

      {:error, _} ->
        case decode_float(term) do
          {:ok, value} ->
            {:ok, value}

          {:error, _} ->
            {:ok, %Value{value: term, type: :id}}
        end
    end
  end

  defp decode_bin_integer(bin, state \\ :start, acc \\ [])

  defp decode_bin_integer(<<>>, :start, _acc) do
    {:error, :invalid_bin_integer_format}
  end

  defp decode_bin_integer(<<"_", rest::binary>>, :body, acc) do
    decode_bin_integer(rest, :body, acc)
  end

  defp decode_bin_integer(<<c::utf8, rest::binary>>, _, acc) when c in [?0, ?1] do
    decode_bin_integer(rest, :body, [<<c::utf8>> | acc])
  end

  defp decode_bin_integer(<<_::utf8, _rest::binary>>, _, _acc) do
    {:error, :invalid_bin_integer_format}
  end

  defp decode_bin_integer(<<>>, :body, acc) do
    case decode_integer(acc, 2) do
      {:ok, value} ->
        {:ok, %{value | format: :bin}}

      {:error, _} = err ->
        err
    end
  end

  defp decode_oct_integer(bin, state \\ :start, acc \\ [])

  defp decode_oct_integer(<<>>, :start, _acc) do
    {:error, :invalid_oct_integer_format}
  end

  defp decode_oct_integer(<<"_", rest::binary>>, :body, acc) do
    decode_oct_integer(rest, :body, acc)
  end

  defp decode_oct_integer(<<c::utf8, rest::binary>>, _, acc) when c in ?0..?7 do
    decode_oct_integer(rest, :body, [<<c::utf8>> | acc])
  end

  defp decode_oct_integer(<<_::utf8, _rest::binary>>, _, _acc) do
    {:error, :invalid_oct_integer_format}
  end

  defp decode_oct_integer(<<>>, :body, acc) do
    case decode_integer(acc, 8) do
      {:ok, value} ->
        {:ok, %{value | format: :oct}}

      {:error, _} = err ->
        err
    end
  end

  defp decode_dec_integer(bin, state \\ :start, acc \\ [])

  defp decode_dec_integer(<<>>, :start, _acc) do
    {:error, :invalid_dec_integer_format}
  end

  defp decode_dec_integer(<<"_", rest::binary>>, :body, acc) do
    decode_dec_integer(rest, :body, acc)
  end

  defp decode_dec_integer(<<c::utf8, rest::binary>>, :start, acc) when c in [?+, ?-] do
    decode_dec_integer(rest, :start, [<<c::utf8>> | acc])
  end

  defp decode_dec_integer(<<c::utf8, rest::binary>>, _, acc) when c in ?0..?9 do
    decode_dec_integer(rest, :body, [<<c::utf8>> | acc])
  end

  defp decode_dec_integer(<<_::utf8, _rest::binary>>, _, _acc) do
    {:error, :invalid_dec_integer_format}
  end

  defp decode_dec_integer(<<>>, :body, acc) do
    case decode_integer(acc, 10) do
      {:ok, value} ->
        {:ok, %{value | format: :dec}}

      {:error, _} = err ->
        err
    end
  end

  defp decode_hex_integer(bin, state \\ :start, acc \\ [])

  defp decode_hex_integer(<<>>, :start, _acc) do
    {:error, :invalid_hex_integer_format}
  end

  defp decode_hex_integer(<<"_", rest::binary>>, :body, acc) do
    decode_hex_integer(rest, :body, acc)
  end

  defp decode_hex_integer(<<c::utf8, rest::binary>>, _, acc) when c in ?0..?9 or
                                                                  c in ?A..?F or
                                                                  c in ?a..?f do
    decode_hex_integer(rest, :body, [<<c::utf8>> | acc])
  end

  defp decode_hex_integer(<<_::utf8, _rest::binary>>, _, _acc) do
    {:error, :invalid_hex_integer_format}
  end

  defp decode_hex_integer(<<>>, :body, acc) do
    case decode_integer(acc, 16) do
      {:ok, value} ->
        {:ok, %{value | format: :hex}}

      {:error, _} = err ->
        err
    end
  end

  defp decode_integer(acc, radix) do
    case Integer.parse(IO.iodata_to_binary(Enum.reverse(acc)), radix) do
      {int, ""} ->
        {:ok, %Value{value: int, type: :integer}}

      {_int, _} ->
        {:error, :invalid_integer_format}

      :error ->
        {:error, :invalid_integer_format}
    end
  end

  defp decode_float(value) do
    case parse_float_string(value) do
      {:ok, value} ->
        case Decimal.parse(value) do
          {:ok, %Decimal{} = decimal} ->
            {:ok, %Value{value: decimal, type: :float}}

          {%Decimal{} = decimal, ""} ->
            {:ok, %Value{value: decimal, type: :float}}

          {%Decimal{}, _} ->
            {:error, :invalid_float_format}

          :error ->
            {:error, :invalid_float_format}
        end

      {:error, _} = err ->
        err
    end
  end

  defp handle_slashdashes([:slashdash, _term | tokens], acc) do
    handle_slashdashes(tokens, acc)
  end

  defp handle_slashdashes([:slashdash], acc) do
    handle_slashdashes([], acc)
  end

  defp handle_slashdashes([term | tokens], acc) do
    handle_slashdashes(tokens, [term | acc])
  end

  defp handle_slashdashes([], acc) do
    Enum.reverse(acc)
  end
end