lib/spf/lexer.ex

defmodule Spf.Lexer do
  @moduledoc """
  Lexer for SPF strings and explain-strings.

  See [the collected ABNF](https://www.rfc-editor.org/rfc/rfc7208.html#section-12).
  """

  @typedoc """
  qualifier = "+" / "-" / "~" / "?"

  A qualifier is represented by its codepoint, e.g. `?+` is `43`.

  """
  @type q :: ?+ | ?- | ?~ | ??

  @typedoc """
  `range` denotes a token's `start..stop`-slice in the input string.

  """
  @type range :: Range.t()

  @typedoc """
  The token's `type`.

  There are several classes of tokens:
  - the version: `:version`
  - a mechanism: `:a`, `:all`, `:exists`, `:include`, `:ip4`, `:ip6`, `:mx`, `:ptr`
  - a modifier: `:exp`, `:redirect`
  - an explain string: `:exp_str`
  - an unknown modifier: `:unknown`
  - a syntax error: `:error`
  - whitespace: `:whitespace`
  - a subtoken: `:expand`, `:literal`, `:cidr`

  Subtokens may appear as part of another token's value.

  `:exp_str` is produced by tokenizing the explain string.  After expanding the
  domain-spec of modifier `exp=domain-spec` into a domain name, that domain's
  TXT RR is retrieved and tokenized for later expansion into an explanation
  string.  This only happens when the SPF verdict is `:fail` and the `exp`
  modifier is present and has a valid domain name.

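  For example, tokenizing the explain-string `"timestamp %{t}"` with
  `tokenize_exp/1` yields a single `:exp_str` token wrapping the subtokens:

      {:exp_str,
       [{:literal, ["timestamp"], 0..8},
        {:whitespace, [" "], 9..9},
        {:expand, [116, -1, false, ["."]], 10..13}
       ], 0..13}
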
  The `:whitespace` token matches both spaces and tab characters, so the
  parser can warn about repeated whitespace and/or the use of tabs.  Using
  a tab character is technically a syntax error, but this library only
  warns about its use.

  The `:error` token is tried as a last resort and matches any non-space
  sequence.  When matched, it means the SPF string has a syntax error.

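  For instance, lexing a nonsense term such as `"!!"` yields a single token
  like `{:error, ["!!"], 0..1}`.
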
  """
  @type type ::
          :a
          | :all
          | :exists
          | :exp
          | :include
          | :ip4
          | :ip6
          | :mx
          | :ptr
          | :redirect
          | :unknown
          | :version
          | :whitespace
          # catch all
          | :error
          # explain-string
          | :exp_str
          # subtokens
          | :cidr
          | :expand
          | :literal

  @typedoc """
  A token represented as a tuple: `{type, list, range}`.

  Where:
  - `type` is an atom which denotes the token `t:type/0`
  - `list` may be empty or contain one or more values (including subtokens)
  - `range` is the `start..stop`-slice in the input string

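  For example, lexing `"a:%{d}"` with `tokenize_spf/1` yields:

      {:a, [43, {:expand, [100, -1, false, ["."]], 2..5}, {:cidr, [32, 128], 1..0//-1}], 0..5}
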
  """
  @type token :: {type, list(), range}

  @typedoc """
  An ok/error tuple produced by lexing some input:
  `{:ok, tokens, rest, context}` or `{:error, reason, rest, context}`.
  """
  @type result :: {:ok, [token], binary, map} | {:error, atom, binary, map}

  @typedoc """
  A lexer is a function that takes a binary and a lexer context, and returns a `t:result/0`.
  """
  @type lexer :: (binary, map -> result)

  # whitespace characters; -1 is used as the pseudo-codepoint for end-of-string
  @wspace [?\s, ?\t]
  @eoterm @wspace ++ [-1]
  # macro delimiters and macro letters (see RFC 7208, section 7.1)
  @mdelimiters [?., ?-, ?+, ?,, ?/, ?_, ?=]
  @mletters [?s, ?l, ?o, ?d, ?i, ?p, ?h, ?c, ?r, ?t, ?v] ++
              [?S, ?L, ?O, ?D, ?I, ?P, ?H, ?C, ?R, ?T, ?V]

  # when used to slice a string -> yields ""
  @null_slice 1..0//-1

  @doc """
  Returns a lexer `t:result/0` after consuming an SPF string.

  The SPF string can be a full SPF TXT string or a partial string.
  The lexer produces a list of tokens found, including a catch-all
  `:error` token for character sequences that were not recognized.

  ## Example

      iex> {:ok, tokens, _rest, _map} = Spf.Lexer.tokenize_spf("a:%{d}")
      iex> tokens
      [{:a, [43, {:expand, [100, -1, false, ["."]], 2..5}, {:cidr, [32, 128], 1..0//-1}], 0..5}]

  """
  @spec tokenize_spf(binary) :: result
  def tokenize_spf(input) when is_binary(input),
    do: spf_tokenize().(input, %{offset: 0, input: input})

  @doc """
  Returns a lexer `t:result/0` after consuming an explain-string.

  An explain-string is the TXT RR value of the domain named by the
  domain specification of the `exp`-modifier and is basically a series
  of macro-strings and spaces.  This is the only place where the `c`, `r`
  and `t`-macros may be used.

  The lexer produces an `:error` token for character sequences it does not
  recognize.

  ## Example

      iex> {:ok, tokens, _rest, _map} = Spf.Lexer.tokenize_exp("timestamp %{t}")
      iex> tokens
      [
        {:exp_str,
         [{:literal, ["timestamp"], 0..8},
          {:whitespace, [" "], 9..9},
          {:expand, [116, -1, false, ["."]], 10..13}
         ], 0..13}
      ]

  """
  @spec tokenize_exp(binary) :: result
  def tokenize_exp(input) when is_binary(input),
    do: exp_tokenize().(input, %{offset: 0, input: input})

  # Context Helpers

  @spec del(map, atom) :: map
  defp del(map, name),
    do: Map.delete(map, name)

  @spec range(map, atom) :: range
  defp range(map, name) do
    # note: if stop < start -> a token is being generated out of nothing
    start = Map.get(map, name, 0)
    stop = Map.get(map, :offset) - 1

    cond do
      start > stop -> @null_slice
      true -> start..stop
    end
  end

  @spec set(map, atom) :: map
  defp set(map, name) do
    # record current offset for given `name`
    Map.put(map, name, map.offset)
  end

  @spec upd(map, binary) :: map
  defp upd(map, rest) do
    # update offset after some input was accepted by some parser
    offset = map.offset + String.length(map.input) - String.length(rest)

    Map.put(map, :offset, offset)
    |> Map.put(:input, rest)
  end

  # Tokenizers

  @spec spf_tokenize :: lexer
  defp spf_tokenize() do
    choice([
      whitespace(),
      # mechanisms
      mechanism(:a),
      mechanism(:mx),
      mechanism(:ip4),
      mechanism(:ip6),
      mechanism(:include),
      mechanism(:exists),
      mechanism(:ptr),
      mechanism(:all),
      # modifiers
      modifier(:redirect),
      modifier(:v),
      modifier(:exp),
      modifier(:unknown),
      # catch all
      error()
    ])
    |> many()
  end

  @spec exp_tokenize :: lexer
  defp exp_tokenize() do
    choice([whitespace(), mstring() |> satisfy(fn l -> l != [] end), error()])
    |> many()
    |> map(fn l -> List.flatten(l) end)
    |> mark(:exp_str)
    |> map(fn token -> [token] end)
  end

  # Token parsers

  @spec error :: lexer
  defp error() do
    until(fn c -> c in @eoterm end)
    |> map(fn chars -> [to_string(chars)] end)
    |> mark(:error)
  end

  @spec mechanism(atom) :: lexer
  defp mechanism(key) when key in [:a, :mx] do
    choice([
      sequence([qualifier(), keyword(key, eot())])
      |> map(fn [q, _key] -> [q, default_cidr([])] end),
      sequence([qualifier(), keyword(key, char1(?/)), cidr()])
      |> map(fn [q, _key, cidr] -> [q, cidr] end),
      sequence([
        qualifier(),
        keyword(key, char1(?:)),
        char1(?:),
        choice([expand(), literals(), merror()])
        |> until(eot())
        |> satisfy(fn x -> x != [] end)
      ])
      |> map(fn [q, _key, _skip, terms] -> cidr_check(key, [q | terms]) end)
    ])
    |> mark(key)
  end

  defp mechanism(:ptr) do
    choice([
      sequence([qualifier(), keyword(:ptr, eot())])
      |> map(fn [q, _key] -> [q] end),
      sequence([
        qualifier(),
        keyword(:ptr, char1(?:)),
        char1(?:),
        choice([expand(), literals(), merror()])
        |> until(eot())
        |> satisfy(fn x -> x != [] end)
      ])
      |> map(fn [q, _key, _skip, terms] -> [q | terms] end)
    ])
    |> mark(:ptr)
  end

  defp mechanism(:all) do
    sequence([qualifier(), keyword(:all, eot())])
    |> map(fn [q, _key] -> [q] end)
    |> mark(:all)
  end

  defp mechanism(key) do
    # mechanisms :ip4, :ip6, :include, :exists
    sequence([
      qualifier(),
      keyword(key, char1(?:)),
      char1(?:),
      choice([expand(), literals(), merror()])
      |> until(eot())
      |> satisfy(fn x -> x != [] end)
    ])
    |> map(fn [q, _key, _skip, terms] -> cidr_check(key, [q | terms]) end)
    |> mark(key)
  end

  @spec modifier(atom) :: lexer
  defp modifier(:v) do
    sequence([
      keyword(:v, char1(?=)),
      char1(?=),
      keyword(:spf, number()),
      number(),
      eot()
    ])
    |> map(fn [_v, _is, _spf, n, _eot] -> [n] end)
    |> mark(:version)
  end

  defp modifier(:unknown) do
    # unknown-modifier = name "=" macro-string
    sequence([name(), char1(?=), mstring()])
    |> map(fn [name, _, macro_string] -> List.flatten([name, macro_string]) end)
    |> mark(:unknown)
  end

  defp modifier(key) do
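    # modifiers :redirect and :exp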
    sequence([
      keyword(key, char1(?=)),
      char1(?=),
      choice([expand(), literals(), merror()])
      |> until(eot())
      |> map(fn l -> if l == [], do: [{:error, [""], @null_slice}], else: l end)
      # |> satisfy(fn x -> x != [] end)
    ])
    |> map(fn [_key, _skip, terms] -> cidr_check(key, terms) end)
    |> mark(key)
  end

  @spec whitespace() :: lexer
  defp whitespace() do
    until(fn c -> c not in @wspace end)
    |> map(fn chars -> [to_string(chars)] end)
    |> mark(:whitespace)
  end

  # TokenParser Helpers

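  # an a/mx mechanism may end its domain-spec with a cidr: if the trailing
  # :literal contains one, split it off into a :cidr subtoken; otherwise
  # append the default {:cidr, [32, 128], ..} subtoken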
  @spec cidr_check(atom, [token]) :: [token]
  defp cidr_check(key, tokens) when key in [:a, :mx] do
    with {:literal, [str], range} <- List.last(tokens),
         context <- %{offset: range.first, input: str},
         {:ok, [literal, cidr], "", _} <- cidr_lex().(str, context) do
      case literal do
        {:literal, [""], @null_slice} -> List.replace_at(tokens, -1, cidr)
        _ -> List.replace_at(tokens, -1, literal) |> List.insert_at(-1, cidr)
      end
    else
      _ -> List.insert_at(tokens, -1, {:cidr, [32, 128], @null_slice})
    end
  end

  defp cidr_check(_key, tokens),
    do: tokens

  @spec cidr_lex() :: lexer
  defp cidr_lex() do
    # parse the last cidr from a literal string and return [lead_literal, cidr_or_default]
    sequence([
      char()
      |> until(choice([cidr(), eot()]))
      |> map(fn l -> [to_string(l)] end)
      |> mark(:literal),
      optional(cidr()) |> map(&default_cidr/1)
    ])
  end

  # returns the default {:cidr, [32, 128], ..} subtoken when none was parsed
  @spec default_cidr(list | token) :: token
  defp default_cidr(l),
    do: if(l == [], do: {:cidr, [32, 128], @null_slice}, else: l)

  # returns the default macro delimiter ["."], or the delimiters seen as strings
  @spec default_mdelims(list) :: list
  defp default_mdelims(l),
    do: if(l == [], do: ["."], else: Enum.map(l, fn n -> to_string([n]) end))

  defp expand() do
    # note: keep param defaults to -1, since keep==0 is actually valid (albeit useless)
    choice([
      sequence([
        keyword("%{"),
        any(@mletters),
        optional(number()) |> map(fn x -> if x == [], do: -1, else: x end),
        optional(keyword("r")) |> map(fn x -> if x == [], do: false, else: true end),
        optional(any(@mdelimiters) |> many()) |> map(&default_mdelims/1),
        keyword("}")
      ])
      |> map(fn [_, ltr, keep, reverse, delims, _] -> [ltr, keep, reverse, delims] end)
      |> mark(:expand),
      keyword("%%") |> map(fn _ -> ["%"] end) |> mark(:expand),
      keyword("%-") |> map(fn _ -> ["-"] end) |> mark(:expand),
      keyword("%_") |> map(fn _ -> ["_"] end) |> mark(:expand)
    ])
  end

  # macro-literal = any visible character (%x21-%x7E), except "%" (%x25)
  defp mliteral?(c),
    do: 0x21 <= c and c <= 0x7E and c != 0x25

  defp mstring(),
    do: choice([expand(), literals(), merror()]) |> until(eot())

  defp merror() do
    # catch-all within a macro-string: consumes a single char as an :error token
    # (satisfy/2 always accepts here and commits the consumed char to the context)
    char()
    |> satisfy(fn _ -> true end)
    |> map(fn l -> [to_string([l])] end)
    |> mark(:error)
  end

  defp literals() do
    until(fn c -> not mliteral?(c) end)
    |> map(fn l -> [List.flatten(l) |> to_string()] end)
    |> mark(:literal)
  end

  # SPF parsers
  # token = {:type, list, range}
  # subtokens may have null_slice as range (i.e. they're defaults, not present in input)

  defp cidr() do
    # cidr can only match if it ends a term
    choice([
      sequence([cidr4(), cidr6()]),
      cidr4() |> map(fn n -> [n, 128] end),
      cidr6() |> map(fn n -> [32, n] end)
    ])
    |> eot()
    |> mark(:cidr)
  end

  defp cidr4() do
    # just lex the number, parser will check validity (incl. leading zeros)
    sequence([char1(?/), number()])
    |> map(fn [_, number] -> number end)
  end

  defp cidr6() do
    # just lex the number, parser will check validity
    sequence([char1(?/), char1(?/), number()])
    |> map(fn [_, _, number] -> number end)
  end

  defp qualifier() do
    # when used always yields a qualifier
    any([?+, ?-, ?~, ??])
    |> optional()
    |> map(fn x -> if x == [], do: ?+, else: x end)
  end

  # GENERIC Parsers > return parser results

  defp any(codepoints),
    do: char() |> satisfy(fn c -> c in codepoints end)

  defp alpha(),
    do: char() |> satisfy(fn x -> (?a <= x and x <= ?z) or (?A <= x and x <= ?Z) end)

  defp alnum(),
    do: choice([alpha(), digit()])

  defp char1(expected),
    do: char() |> satisfy(fn x -> x == expected end)

  defp digit(),
    do: char() |> satisfy(fn x -> ?0 <= x and x <= ?9 end)

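  # matches `expected` (an atom or string) case-insensitively; keyword/2
  # additionally requires the `accept` parser to succeed on what follows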
  defp keyword(expected) do
    to_string(expected)
    |> anycase(empty())
    |> map(fn _ -> expected end)
  end

  defp keyword(expected, accept) do
    to_string(expected)
    |> anycase(accept)
    |> map(fn _ -> expected end)
  end

  defp name() do
    sequence([alpha(), choice([alnum(), any([?., ?_, ?-])]) |> many()])
    |> satisfy(fn list -> list != [] end)
    |> map(fn chars -> to_string(chars) end)
  end

  defp number() do
    sequence([digit(), digit() |> many()])
    |> map(fn digits -> List.flatten(digits) |> List.to_integer() end)
  end

  # COMBINATORS > return a parser function:
  # (binary, ctx) -> {:ok, term(s), rest, upd(ctx, rest)} | {:error, reason, input, ctx}
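  # matches `str` case-insensitively, provided the `accept` parser succeeds
  # on the input that follows the match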
  defp anycase(str, accept) do
    fn input, ctx ->
      want = String.upcase(str)
      {have, rest} = String.split_at(input, String.length(want))

      with {:ok, _, _, _} <- accept.(rest, ctx) do
        if want == String.upcase(have),
          do: {:ok, str, rest, upd(ctx, rest)},
          else: {:error, :anycase, input, ctx}
      end
    end
  end

  defp empty(),
    do: fn input, ctx -> {:ok, [], input, ctx} end

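  # consumes a single byte; note: char/0 does not advance ctx.offset, callers
  # commit consumed input later via e.g. satisfy/2 or until/1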
  defp char() do
    fn input, ctx ->
      case input do
        "" -> {:error, :eos, input, ctx}
        <<byte::8, rest::binary>> -> {:ok, byte, rest, ctx}
      end
    end
  end

  defp choice(parsers) do
    fn input, ctx ->
      case parsers do
        [] ->
          {:error, :choice, input, ctx}

        [first | others] ->
          with {:error, _, _, ctx} <- first.(input, ctx),
               do: choice(others).(input, ctx)
      end
    end
  end

  defp eot() do
    # end of term means wspace is next or end of input
    fn input, ctx ->
      c =
        case input do
          "" -> -1
          <<c::8, _rest::binary>> -> c
        end

      if c in @eoterm,
        do: {:ok, [], input, ctx},
        else: {:error, :eot, input, ctx}
    end
  end

  defp eot(parser) do
    # usage: parser() |> eot()
    fn input, ctx ->
      with {:ok, term, rest, ctx} <- parser.(input, ctx),
           {:ok, _, _, _} <- eot().(rest, ctx),
           do: {:ok, term, rest, upd(ctx, rest)}
    end
  end

  defp many(parser) do
    # note: applies `parser` zero or more times
    # - if `parser` *never* fails, this will loop forever!
    # - nesting one many/1 inside another will also loop forever
    fn input, ctx ->
      case parser.(input, ctx) do
        {:error, _, _, ctx} ->
          {:ok, [], input, ctx}

        {:ok, term, rest, ctx} ->
          {:ok, terms, rest, ctx} = many(parser).(rest, ctx)
          {:ok, [term | terms], rest, upd(ctx, rest)}
      end
    end
  end

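  # wraps a parser's result in a token {name, value, range}, using the
  # offset recorded in the context when the parser started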
  defp mark(parser, name) do
    fn input, ctx ->
      with {:ok, term, rest, ctx} <- parser.(input, set(ctx, name)) do
        {:ok, {name, term, range(ctx, name)}, rest, del(ctx, name)}
      end
    end
  end

  defp map(parser, mapper) do
    fn input, ctx ->
      with {:ok, term, rest, ctx} <- parser.(input, ctx),
           do: {:ok, mapper.(term), rest, ctx}
    end
  end

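  # accepts a parser's result only if `accept` approves it; on success the
  # consumed input is committed to the context via upd/2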
  defp satisfy(parser, accept) do
    fn input, ctx ->
      with {:ok, term, rest, ctx} <- parser.(input, ctx) do
        if accept.(term),
          do: {:ok, term, rest, upd(ctx, rest)},
          else: {:error, :reject, rest, ctx}
      end
    end
  end

  defp optional(parser) do
    fn input, ctx ->
      with {:error, _, _, ctx} <- parser.(input, ctx),
           do: {:ok, [], input, ctx}
    end
  end

  defp sequence(parsers) do
    fn input, ctx ->
      case parsers do
        [] ->
          {:ok, [], input, ctx}

        [first | others] ->
          with {:ok, term, rest, ctx_s} <- first.(input, ctx),
               {:ok, terms, rest, ctx_s} <- sequence(others).(rest, ctx_s) do
            {:ok, [term | terms], rest, ctx_s}
          else
            _ -> {:error, :sequence, input, ctx}
          end
      end
    end
  end

  defp until(stop) when is_function(stop, 1) do
    # gobble up chars until stop function says so or till eos
    fn input, ctx ->
      {char, rest} =
        case input do
          "" -> {-1, ""}
          <<c::8, rest::binary>> -> {c, rest}
        end

      if stop.(char) do
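        # an empty match is an error: until/1 must consume at least one char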
        if input == ctx.input,
          do: {:error, :until, input, ctx},
          else: {:ok, [], input, ctx}
      else
        with {:ok, chars, rest, ctx} <- until(stop).(rest, ctx),
             do: {:ok, [char | chars], rest, upd(ctx, rest)}
      end
    end
  end

  defp until(parser, stop) when is_function(stop, 2) do
    # apply parser 0 or more times, until stop parser says so or fail
    # - note: if parser fails before stop says to stop, nothing is parsed
    fn input, ctx ->
      case stop.(input, ctx) do
        {:ok, _, _, _} ->
          {:ok, [], input, ctx}

        _ ->
          with {:ok, term, rest, ctx} <- parser.(input, ctx),
               {:ok, terms, rest, ctx} <- until(parser, stop).(rest, ctx) do
            {:ok, [term | terms], rest, upd(ctx, rest)}
          end
      end
    end
  end
end