lib/parser.ex

defmodule Xpeg.Parser do
  @moduledoc false

  @inline_max_len 30

  # Emit a choice/commit pair around pattern p; off_back and off_commit are the
  # offsets to the backtrack and commit targets, relative to the commit
  # instruction
  defp choice_commit(p, off_commit, off_back) do
    [{:choice, off_back, off_commit}] ++ p ++ [{:commit}]
  end

  # Generic ordered choice
  defp mk_choice(p1, p2) do
    choice_commit(p1, length(p1) + length(p2) + 2, length(p1) + 2) ++ p2
  end

  # kleene-star operator
  defp mk_star(p) do
    case p do
      [{:set, cs}] -> [{:span, cs}]
      _ -> choice_commit(p, 0, length(p) + 2)
    end
  end

  # Generic ! 'not' predicate
  defp mk_not(p) do
    choice_commit(p, length(p) + 2, length(p) + 3) ++ [{:fail}]
  end

  # Generic optional
  defp mk_opt(p) do
    choice_commit(p, length(p) + 2, length(p) + 2)
  end

  # minus, !p2 * p1, optimized for :set
  defp mk_minus(p1, p2) do
    case {p1, p2} do
      {[{:set, cs1}], [{:set, cs2}]} -> [{:set, cs1 -- cs2}]
      {[{:set, cs1}], [{:chr, c2}]} -> [{:set, cs1 -- [c2]}]
      {_, _} -> mk_not(p2) ++ p1
    end
  end

  # Charset
  defp mk_set(ps) do
    cs =
      Enum.reduce(ps, [], fn p, set ->
        case p do
          [v] ->
            [v | set]

          {:.., _, [lo, hi]} ->
            # Recursively parse lo and hi to handle nested sigils
            lo_chars = mk_set([lo]) |> Enum.flat_map(fn {:set, chars} -> chars end)
            hi_chars = mk_set([hi]) |> Enum.flat_map(fn {:set, chars} -> chars end)
            Enum.uniq(Enum.to_list(Enum.min(lo_chars)..Enum.max(hi_chars)) ++ set)

          {:sigil_c, _, [{:<<>>, _, [chars]}, _]} when is_binary(chars) ->
            # Handle ~c sigil format where chars is wrapped in a tuple
            chars_list = to_charlist(chars)
            Enum.uniq(chars_list ++ set)

          {:sigil_c, _, [chars, _]} when is_binary(chars) ->
            # Handle ~c sigil format where chars is directly a binary
            chars_list = to_charlist(chars)
            Enum.uniq(chars_list ++ set)

          v when is_binary(v) ->
            # Handle string literals directly
            chars_list = to_charlist(v)
            Enum.uniq(chars_list ++ set)

          v when is_list(v) and length(v) == 1 ->
            # Handle single character charlist
            [hd(v) | set]
        end
      end)

    [{:set, cs}]
  end

  # Small rules that are already in the grammar get inlined instead of
  # called
  def call_or_inline(grammar, v) do
    if grammar[v] != nil and Enum.count(grammar[v]) < @inline_max_len do
      grammar[v]
    else
      [{:call, v}]
    end
  end

  def parse(node) do
    parse(%{}, node)
  end

  # Parse a pattern
  def parse(grammar, node) do
    # IO.inspect (node)

    case node do
      # Parse a grammar consisting of a list of named rules
      {:__block__, _, ps} ->
        Enum.reduce(ps, grammar, fn rule, grammar -> parse(grammar, rule) end)

      # Parse one named rule
      {:<-, _, [name, patt]} ->
        Map.put(grammar, Xpeg.unalias(name), parse(grammar, patt))

      # infix: '*' Concatenation
      {:*, _, [p1, p2]} ->
        parse(grammar, p1) ++ parse(grammar, p2)

      # infix '|': Ordered choice
      {:|, _, [p1, p2]} ->
        mk_choice(parse(grammar, p1), parse(grammar, p2))

      # prefix '*': zero-or-more operator
      {:star, _, [p]} ->
        mk_star(parse(grammar, p))

      # prefix '?': one-or-zero operator
      {:opt, _, [p]} ->
        mk_opt(parse(grammar, p))

      # prefix '+': one-or-more operator
      {:+, _, [p]} ->
        p = parse(grammar, p)
        p ++ mk_star(p)

      # Infix '-': difference
      {:-, _, [p1, p2]} ->
        mk_minus(parse(grammar, p1), parse(grammar, p2))

      # prefix '!': 'not' operator
      {:!, _, [p]} ->
        mk_not(parse(grammar, p))

      # prefix '&': 'and-predicate' operator
      {:&, _, [p]} ->
        mk_not(mk_not(parse(grammar, p)))

      # prefix '@': 'search' operator, *(1 - P) * P
      {:@, _, [p]} ->
        p = parse(grammar, p)
        mk_star(mk_minus({:any, 1}, p)) ++ p

      # Charset
      {:{}, _, ps} ->
        # Preprocess each element in the set to handle string literals
        processed_ps =
          Enum.map(ps, fn
            {:<<>>, _meta, [string]} when is_binary(string) -> string
            other -> other
          end)

        mk_set(processed_ps)

      # Repetition count [low..hi]
      {{:., _, [Access, :get]}, _, [p, {:.., _, [n1, n2]}]} ->
        p = parse(grammar, p)
        (List.duplicate(p, n1) ++ List.duplicate(mk_opt(p), n2 - n1)) |> List.flatten()

      # Repetition count [n]
      {{:., _, [Access, :get]}, _, [p, n]} ->
        List.duplicate(parse(grammar, p), n) |> List.flatten()

      # Capture
      {captype, _, [p]} when captype in [:str, :int, :float] ->
        [{:capopen}] ++ parse(grammar, p) ++ [{:capclose, captype}]

      # Code block
      {:fn, meta, [code]} ->
        [{:code, {:fn, meta, [code]}}]

      # Aliased atoms, for Capital names instead of :colon names
      {:__aliases__, _, [id]} ->
        parse(grammar, id)

      # Delegate two-tuple :{} set to the above parse function
      {p1, p2} ->
        parse(grammar, {:{}, 0, [p1, p2]})

      # Label: call or inline a named rule
      v when is_atom(v) ->
        call_or_inline(grammar, v)

      # Number 0, :any operator with 0 count, matches always
      0 ->
        [{:nop}]

      # Number, :any operator with non-zero count
      v when is_number(v) ->
        [{:any, v}]

      # String literal
      v when is_binary(v) ->
        parse(grammar, to_charlist(v))

      # Charlist
      v when is_list(v) ->
        for c <- v do
          {:chr, c}
        end

      # Add support for string literals in AST format
      {:<<>>, _meta, [string]} when is_binary(string) ->
        {:chr, to_charlist(string)}

      # Add support for sigil format
      {:sigil_c, _meta, [chars, _]} ->
        for c <- to_charlist(chars) do
          {:chr, c}
        end

      {_, meta, e} ->
        raise(
          "XPeg: #{inspect(meta)}: Syntax error at '#{Macro.to_string(e)}' \n\n   #{inspect(e)}\n"
        )

      e ->
        raise("XPeg: unhandled literal #{inspect(e)}")
    end
  end
end