# lib/mail/parsers/rfc_2822.ex

defmodule Mail.Parsers.RFC2822 do
  @moduledoc """
  RFC2822 Parser

  Will attempt to parse a valid RFC2822 message back into
  a `%Mail.Message{}` data model.

      Mail.Parsers.RFC2822.parse(message)
      %Mail.Message{body: "Some message", headers: %{"to" => ["user@example.com"], "from" => "other@example.com", "subject" => "Read this!"}}
  """

  @months ~w(jan feb mar apr may jun jul aug sep oct nov dec)

  @doc """
  Parses an RFC2822 message into a `%Mail.Message{}`.

  Accepts either the raw message as a binary, which is split on CRLF with
  trailing whitespace trimmed from every line, or a non-empty list of lines.
  """
  @spec parse(binary() | nonempty_maybe_improper_list()) :: Mail.Message.t()
  def parse(content)

  def parse([_ | _] = lines) do
    [header_lines, body_lines] = extract_headers(lines)
    message = parse_headers(%Mail.Message{}, header_lines)

    message
    |> mark_multipart()
    |> parse_body(body_lines)
  end

  def parse(content) do
    lines =
      content
      |> String.split("\r\n")
      |> Enum.map(&String.trim_trailing/1)

    parse(lines)
  end

  # Splits the message lines into `[header_lines, body_lines]`.
  #
  # Header lines are collected up to the first empty line. A line starting with
  # a space or a tab is a folded continuation and is appended to the header
  # collected just before it. `headers` is the accumulator (most recent first).
  defp extract_headers(list, headers \\ [])

  # Input ended without the blank separator line (headers only, no body):
  # return what was collected with an empty body instead of raising.
  defp extract_headers([], headers),
    do: [Enum.reverse(headers), []]

  defp extract_headers(["" | tail], headers),
    do: [Enum.reverse(headers), tail]

  defp extract_headers([<<char, _::binary>> = folded_body | tail], [previous_header | headers])
       when char in [?\s, ?\t],
       do: extract_headers(tail, [previous_header <> folded_body | headers])

  defp extract_headers([header | tail], headers),
    do: extract_headers(tail, [header | headers])

  @doc """
  Parses a RFC2822 timestamp to a DateTime with timezone

  [RFC2822 3.3 - Date and Time Specification](https://tools.ietf.org/html/rfc2822#section-3.3)

  The clauses below progressively rewrite tolerated-but-invalid layouts
  (leading whitespace, day name, one-digit day or hour, dashes, two-digit
  year, missing seconds, numeric month, milliseconds, missing zone) into the
  canonical `DD Mon YYYY HH:MM:SS ZONE` form, which is then converted through
  `DateTime.from_iso8601/1`.

  Raises `FunctionClauseError` when the value matches none of the supported
  layouts, and `MatchError` when the normalised value is rejected by
  `DateTime.from_iso8601/1`.
  """
  @spec to_datetime(binary()) :: DateTime.t()
  def to_datetime(<<" ", rest::binary>>), do: to_datetime(rest)
  def to_datetime(<<"\t", rest::binary>>), do: to_datetime(rest)
  def to_datetime(<<_day::binary-size(3), ", ", rest::binary>>), do: to_datetime(rest)

  # Single-digit day of month: left-pad with a zero
  def to_datetime(<<date::binary-size(1), " ", rest::binary>>),
    do: to_datetime("0" <> date <> " " <> rest)

  # This caters for an invalid date with no 0 before the hour, e.g. 5:21:43 instead of 05:21:43
  def to_datetime(<<date::binary-size(11), " ", hour::binary-size(1), ":", rest::binary>>) do
    to_datetime("#{date} 0#{hour}:#{rest}")
  end

  # This caters for an invalid date with dashes between the date/month/year parts
  def to_datetime(
        <<date::binary-size(2), "-", month::binary-size(3), "-", year::binary-size(4),
          rest::binary>>
      ) do
    to_datetime("#{date} #{month} #{year}#{rest}")
  end

  # This caters for an invalid two-digit year
  def to_datetime(
        <<date::binary-size(2), " ", month::binary-size(3), " ", year::binary-size(2), " ",
          rest::binary>>
      ) do
    year = year |> String.to_integer() |> to_four_digit_year()
    to_datetime("#{date} #{month} #{year} #{rest}")
  end

  # This caters for missing seconds
  def to_datetime(
        <<date::binary-size(11), " ", hour::binary-size(2), ":", minute::binary-size(2), " ",
          rest::binary>>
      ) do
    to_datetime("#{date} #{hour}:#{minute}:00 #{rest}")
  end

  # Fixes invalid value: Wed, 14 10 2015 12:34:17
  def to_datetime(
        <<date::binary-size(2), " ", month_digits::binary-size(2), " ", year::binary-size(4), " ",
          hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2),
          rest::binary>>
      ) do
    month_name = get_month_name(month_digits)
    to_datetime("#{date} #{month_name} #{year} #{hour}:#{minute}:#{second}#{rest}")
  end

  # Canonical form: every other clause eventually funnels into this one
  def to_datetime(
        <<date::binary-size(2), " ", month::binary-size(3), " ", year::binary-size(4), " ",
          hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2), " ",
          time_zone::binary>>
      ) do
    year = year |> String.to_integer()
    month = get_month(String.downcase(month))
    date = date |> String.to_integer()

    hour = hour |> String.to_integer()
    minute = minute |> String.to_integer()
    second = second |> String.to_integer()

    time_zone = parse_time_zone(time_zone)

    date_string =
      "#{year}-#{date_pad(month)}-#{date_pad(date)}T#{date_pad(hour)}:#{date_pad(minute)}:#{date_pad(second)}#{time_zone}"

    {:ok, datetime, _offset} = DateTime.from_iso8601(date_string)
    datetime
  end

  # This adds support for a now obsolete format
  # https://tools.ietf.org/html/rfc2822#section-4.3
  # NOTE(review): every binary this pattern accepts is already accepted by the
  # canonical clause above (same prefix, then " " followed by any binary), so
  # this clause is never selected; named zones are resolved by parse_time_zone/1.
  def to_datetime(
        <<date::binary-size(2), " ", month::binary-size(3), " ", year::binary-size(4), " ",
          hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2), " ",
          timezone::binary-size(3), _rest::binary>>
      ) do
    to_datetime("#{date} #{month} #{year} #{hour}:#{minute}:#{second} (#{timezone})")
  end

  # Fixes invalid value: Tue Aug 8 12:05:31 CAT 2017
  # The pattern requires a two-character day of month. The zone abbreviation is
  # discarded, so the rewritten value goes through the "missing time zone"
  # clause below and is read as +0000.
  def to_datetime(
        <<_day::binary-size(3), " ", month::binary-size(3), " ", date::binary-size(2), " ",
          hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2), " ",
          _tz::binary-size(3), " ", year::binary-size(4), _rest::binary>>
      ) do
    to_datetime("#{date} #{month} #{year} #{hour}:#{minute}:#{second}")
  end

  # Fixes invalid value with milliseconds Tue, 20 Jun 2017 09:44:58.568 +0000 (UTC)
  # The milliseconds are dropped; whatever follows them is kept verbatim.
  def to_datetime(
        <<date::binary-size(2), " ", month::binary-size(3), " ", year::binary-size(4), " ",
          hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2), ".",
          _milliseconds::binary-size(3), rest::binary>>
      ) do
    to_datetime("#{date} #{month} #{year} #{hour}:#{minute}:#{second}#{rest}")
  end

  # Fixes invalid value: Tue May 30 15:29:15 2017
  def to_datetime(
        <<_day::binary-size(3), " ", month::binary-size(3), " ", date::binary-size(2), " ",
          hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2), " ",
          year::binary-size(4), _rest::binary>>
      ) do
    to_datetime("#{date} #{month} #{year} #{hour}:#{minute}:#{second} +0000")
  end

  # Fixes invalid value: Tue Aug 8 12:05:31 2017
  def to_datetime(
        <<_day::binary-size(3), " ", month::binary-size(3), " ", date::binary-size(1), " ",
          hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2), " ",
          year::binary-size(4), _rest::binary>>
      ) do
    to_datetime("#{date} #{month} #{year} #{hour}:#{minute}:#{second} +0000")
  end

  # Fixes missing time zone
  def to_datetime(
        <<date::binary-size(2), " ", month::binary-size(3), " ", year::binary-size(4), " ",
          hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2)>>
      ) do
    to_datetime("#{date} #{month} #{year} #{hour}:#{minute}:#{second} +0000")
  end

  # # Fixes invalid value: Wed, 14 10 2015 12:34:17
  # def to_datetime(
  #       <<date::binary-size(2), " ", month_digits::binary-size(2), " ", year::binary-size(4), " ",
  #         hour::binary-size(2), ":", minute::binary-size(2), ":", second::binary-size(2),
  #         rest::binary>>
  #     ) do
  #   month_name = get_month_name(month_digits)
  #   to_datetime("#{date} #{month_name} #{year} #{hour}:#{minute}:#{second}#{rest}")
  # end

  # Expands a two-digit year: 50..99 belong to the 1900s, 0..49 to the 2000s.
  # Any other value matches no clause.
  defp to_four_digit_year(year) when year >= 50 and year < 100, do: year + 1900
  defp to_four_digit_year(year) when year >= 0 and year < 50, do: year + 2000

  # Renders an integer as a string, prefixing values below 10 with a "0".
  defp date_pad(number) when number < 10, do: <<"0", Integer.to_string(number)::binary>>
  defp date_pad(number), do: Integer.to_string(number)

  # Month lookups generated at compile time from @months:
  #   get_month("jan")     -> 1      (lower-case abbreviation to number)
  #   get_month_name("01") -> "jan"  (two-digit number to abbreviation)
  for {month_name, month_number} <- Enum.with_index(@months, 1) do
    month_digits = month_number |> Integer.to_string() |> String.pad_leading(2, "0")

    defp get_month(unquote(month_name)), do: unquote(month_number)
    defp get_month_name(unquote(month_digits)), do: unquote(month_name)
  end

  # Converts the zone part of a timestamp into the offset text that is appended
  # to the ISO 8601 string built by to_datetime/1.

  # Zone wrapped in a comment, e.g. "(UTC)": drop the parentheses and retry
  defp parse_time_zone(<<"(", time_zone::binary>>),
    do: parse_time_zone(String.trim_trailing(time_zone, ")"))

  for {zone, offset} <- [
        # Greenwich Mean Time
        {"GMT", "+0000"},
        # Universal Time
        {"UTC", "+0000"},
        {"UT", "+0000"},
        # US
        {"EDT", "-0400"},
        {"EST", "-0500"},
        {"CDT", "-0500"},
        {"CST", "-0600"},
        {"MDT", "-0600"},
        {"MST", "-0700"},
        {"PDT", "-0700"},
        {"PST", "-0800"}
      ] do
    defp parse_time_zone(unquote(zone)), do: unquote(offset)
  end

  # Military A-Z
  defp parse_time_zone(<<_zone_letter::binary-size(1)>>), do: "-0000"

  # Numeric offset: keep the sign and the four characters after it, drop the rest
  defp parse_time_zone(<<sign, offset::binary-size(4), _rest::binary>>) when sign in [?+, ?-],
    do: <<sign, offset::binary>>

  # Anything else is returned as-is apart from surrounding parentheses
  defp parse_time_zone(time_zone) do
    without_open = String.trim_leading(time_zone, "(")
    String.trim_trailing(without_open, ")")
  end

  @doc """
  Splits a recipient header value (To, CC, etc.) into its recipients.

  A recipient is written either as a bare address or as a display name
  followed by the address in angle brackets:

      Full Name <fullname@company.tld>, another@company.tld

  Here `Full Name` is the "name" part of the first recipient and
  `fullname@company.tld` its "address" part; the second recipient only has an
  "address" part.

  Returns one list element per recipient:
  - a string holding the "address" when the recipient has no "name" part
  - a `{name, address}` tuple of strings otherwise
  """
  @spec parse_recipient_value(value :: String.t()) ::
          [{String.t(), String.t()} | String.t()]
  def parse_recipient_value(value) do
    ~r/\s*("?)(.*?)\1\s*?<?([^<\s]+@[^\s>,]+)>?,?/
    |> Regex.scan(value)
    |> Enum.map(fn [_match, _quote, name, address] ->
      if name == "", do: address, else: {name, address}
    end)
  end

  # Folds every "Name: body" header line into `message.headers`.
  #
  # The header is stored under its lower-cased name; the body has its encoded
  # words decoded first and is then parsed according to the header name.
  defp parse_headers(message, headers) do
    Enum.reduce(headers, message, fn header, acc ->
      [name, body] = String.split(header, ":", parts: 2)
      value = parse_header_value(name, parse_encoded_word(body))

      %{acc | headers: put_header(acc.headers, String.downcase(name), value)}
    end)
  end

  # Stores a parsed header value. "received" may occur several times, so its
  # values are kept in a list with the most recently added one first; any other
  # key simply replaces what was stored before.
  defp put_header(headers, "received" = key, value) do
    earlier = Map.get(headers, key, [])
    Map.put(headers, key, [value | earlier])
  end

  defp put_header(headers, key, value), do: Map.put(headers, key, value)

  # Sets the message's :multipart flag from what its headers say.
  defp mark_multipart(message) do
    multipart = multipart?(message.headers)
    Map.put(message, :multipart, multipart)
  end

  # Parses a header body according to the header's name.
  #
  # `key` is the header name as written in the message and `value` its body.
  # Leading spaces, tabs, CRs and LFs are stripped from the body first. Header
  # names are then matched case-insensitively, so "Cc", "CC" and "cc" (or
  # "Content-Type" and "Content-type") are all parsed the same way.
  defp parse_header_value(key, <<char, value::binary>>) when char in [?\s, ?\r, ?\n, ?\t],
    do: parse_header_value(key, value)

  defp parse_header_value(key, value),
    do: parse_normalized_header_value(String.downcase(key), value)

  # Dispatches on the lower-cased header name; unknown headers are returned
  # unchanged as a string.
  defp parse_normalized_header_value("to", value),
    do: parse_recipient_value(value)

  defp parse_normalized_header_value("cc", value),
    do: parse_recipient_value(value)

  # Only the first recipient is kept for single-valued address headers
  defp parse_normalized_header_value("from", value),
    do: value |> parse_recipient_value() |> List.first()

  defp parse_normalized_header_value("reply-to", value),
    do: value |> parse_recipient_value() |> List.first()

  defp parse_normalized_header_value("date", timestamp),
    do: to_datetime(timestamp)

  defp parse_normalized_header_value("received", value),
    do: parse_received_value(value)

  defp parse_normalized_header_value("content-type", value) do
    case parse_structured_header_value(value) do
      [_ | _] = header -> header
      # A bare type without parameters gets the us-ascii charset
      <<value::binary>> -> [value, {"charset", "us-ascii"}]
    end
  end

  defp parse_normalized_header_value("content-disposition", value),
    do: parse_structured_header_value(value)

  defp parse_normalized_header_value(_key, value),
    do: value

  # Decodes RFC 2047 encoded words ("=?charset?encoding?text?=") found anywhere
  # in a header body and returns the body with them replaced by the decoded
  # bytes. The declared charset is ignored: the decoded bytes are inserted as-is.
  #
  # See https://tools.ietf.org/html/rfc2047
  defp parse_encoded_word(""), do: ""

  defp parse_encoded_word(<<"=?", value::binary>>) do
    with [_charset, encoding, encoded_string, <<"=", remainder::binary>>] <-
           String.split(value, "?", parts: 4),
         {:ok, decoded_string} <- decode_encoded_text(String.upcase(encoding), encoded_string) do
      decoded_string <> parse_encoded_word(remainder)
    else
      # Not an encoded word (or an encoding we do not know), moving on
      _ -> "=?" <> parse_encoded_word(value)
    end
  end

  # Copy byte by byte so that a header containing bytes that are not valid
  # UTF-8 is passed through instead of raising. "=?" is pure ASCII, so it
  # cannot start in the middle of a multi-byte character.
  defp parse_encoded_word(<<byte, rest::binary>>),
    do: <<byte, parse_encoded_word(rest)::binary>>

  # Decodes the text of one encoded word; `encoding` is the upper-cased
  # encoding letter. Returns `:error` for anything other than "Q" or "B".
  defp decode_encoded_text("Q", text) do
    # RFC 2047 section 4.2: in the "Q" encoding "_" always stands for a space.
    # It is rewritten to the equivalent "=20" escape before decoding, so an
    # escaped literal underscore ("=5F") is left intact.
    {:ok, text |> String.replace("_", "=20") |> Mail.Encoders.QuotedPrintable.decode()}
  end

  defp decode_encoded_text("B", text), do: {:ok, Mail.Encoders.Base64.decode(text)}
  defp decode_encoded_text(_encoding, _text), do: :error

  # Parses a structured header body such as `text/plain; charset="utf-8"`.
  #
  # The string is consumed one character at a time with three accumulators:
  #   * `value`     - the main value (the text before the first ";"), `nil`
  #                   until that ";" has been seen
  #   * `sub_types` - parameters collected so far, most recent first; the head
  #                   is `{key, nil}` while that parameter's value is being read
  #   * `acc`       - the text collected since the last delimiter
  #
  # Returns the plain string when the input contains neither ";" nor "=",
  # otherwise a list whose head is the main value followed by the parameters
  # in their original order as `{key, value}` tuples (keys normalised by
  # key_to_atom/1) and by any bare token that had no "=".
  defp parse_structured_header_value(string, value \\ nil, sub_types \\ [], acc \\ "")

  # End of input while reading a parameter value: `acc` is that value
  defp parse_structured_header_value("", value, [{key, nil} | sub_types], acc),
    do: [value | Enum.reverse([{key, acc} | sub_types])]

  # End of input without any delimiter: the whole input is a plain string
  defp parse_structured_header_value("", nil, [], acc),
    do: acc

  # End of input with nothing pending
  defp parse_structured_header_value("", value, sub_types, ""),
    do: [value | Enum.reverse(sub_types)]

  # End of input with text after the main value but no parameters
  defp parse_structured_header_value("", value, [], acc),
    do: [value, String.trim(acc)]

  # End of input with leftover text after completed parameters. Whitespace-only
  # leftovers are dropped; a bare token (e.g. the "foo" in "a; b=c; foo") is
  # appended as a string. The result is computed directly rather than by
  # retrying with the trimmed text, which never terminated for a bare token.
  defp parse_structured_header_value("", value, sub_types, acc) do
    case String.trim(acc) do
      "" -> [value | Enum.reverse(sub_types)]
      token -> [value | Enum.reverse([token | sub_types])]
    end
  end

  # Quoted string: its unescaped content is appended to `acc` in one go, so
  # delimiters inside the quotes are not interpreted
  defp parse_structured_header_value(<<"\"", rest::binary>>, value, sub_types, acc) do
    {string, rest} = parse_quoted_string(rest)
    parse_structured_header_value(rest, value, sub_types, <<acc::binary, string::binary>>)
  end

  # First ";": what was collected so far is the main value
  defp parse_structured_header_value(<<";", rest::binary>>, nil, sub_types, acc),
    do: parse_structured_header_value(rest, acc, sub_types, "")

  # ";" while reading a parameter value: the parameter is complete
  defp parse_structured_header_value(<<";", rest::binary>>, value, [{key, nil} | sub_types], acc),
    do: parse_structured_header_value(rest, value, [{key, acc} | sub_types], "")

  # "=": what was collected so far is a parameter name, its value follows
  defp parse_structured_header_value(<<"=", rest::binary>>, value, sub_types, acc),
    do: parse_structured_header_value(rest, value, [{key_to_atom(acc), nil} | sub_types], "")

  # Any other character is appended to `acc`
  defp parse_structured_header_value(<<char::utf8, rest::binary>>, value, sub_types, acc),
    do: parse_structured_header_value(rest, value, sub_types, <<acc::binary, char::utf8>>)

  # Reads the content of a quoted string whose opening quote has already been
  # consumed. Returns `{content, rest}` where `content` has its backslash
  # escapes resolved and `rest` is the input after the closing quote.
  defp parse_quoted_string(string, acc \\ "")

  # Input ended before the closing quote: treat everything read so far as the
  # content instead of raising on the malformed header.
  defp parse_quoted_string("", acc), do: {acc, ""}

  # Backslash escape: keep the escaped character, drop the backslash
  defp parse_quoted_string(<<"\\", char::utf8, rest::binary>>, acc),
    do: parse_quoted_string(rest, <<acc::binary, char::utf8>>)

  defp parse_quoted_string(<<"\"", rest::binary>>, acc), do: {acc, rest}

  defp parse_quoted_string(<<char::utf8, rest::binary>>, acc),
    do: parse_quoted_string(rest, <<acc::binary, char::utf8>>)

  # Parses the body of a "Received" header, which is split on ";".
  #
  # Returns `[info]` when nothing follows the ";", and
  # `[info, {"date", datetime}]` when a date follows it; a comment found in the
  # date part (other than a trailing three-letter zone comment, which is
  # removed) is moved to the end of `info`. Any other number of segments is
  # returned as the list produced by the split.
  defp parse_received_value(value) do
    case String.split(value, ";") do
      [info, ""] ->
        [info]

      [info, raw_date] ->
        {date, comment} = raw_date |> remove_timezone_comment() |> extract_comment()
        info = if is_nil(comment), do: info, else: "#{info} #{comment}"

        [info, {"date", date |> remove_excess_whitespace() |> to_datetime()}]

      segments ->
        segments
    end
  end

  # Drops a trailing three-character zone comment such as " (UTC)", together
  # with any whitespace after it. The string is returned untouched when it does
  # not end that way or is not longer than the comment itself.
  defp remove_timezone_comment(date_string) do
    content_size = byte_size(String.trim_trailing(date_string))
    kept_size = content_size - 6

    with true <- content_size > 6,
         <<" (", _zone::binary-size(3), ")">> <- binary_part(date_string, kept_size, 6) do
      binary_part(date_string, 0, kept_size)
    else
      _ -> date_string
    end
  end

  # Separates the first parenthesised comment from the rest of the string.
  #
  # Returns `{value, comment}`: `value` is the input without that comment and
  # `comment` is the comment including its parentheses, or `nil` when there is
  # none. `state` tracks whether the current character belongs to the value or
  # to the comment. Once a comment has been captured, a later "(" is treated as
  # ordinary value text.
  defp extract_comment(string, state \\ :value, value \\ "", comment \\ nil)

  defp extract_comment("", _state, value, comment), do: {value, comment}

  defp extract_comment(<<char::utf8, rest::binary>>, state, value, comment) do
    case {char, state, comment} do
      {?(, :value, nil} ->
        extract_comment(rest, :comment, value, "(")

      {?), :comment, _} ->
        extract_comment(rest, :value, value, comment <> ")")

      {_, :value, _} ->
        extract_comment(rest, :value, <<value::binary, char::utf8>>, comment)

      {_, :comment, _} ->
        extract_comment(rest, :comment, value, <<comment::binary, char::utf8>>)
    end
  end

  # Collapses every run of spaces and tabs into a single space. This includes a
  # space followed by a tab, which would otherwise survive as two spaces and
  # stop the date from matching any to_datetime/1 layout.
  defp remove_excess_whitespace(string),
    do: String.replace(string, ~r/[ \t]+/, " ")

  # Fills in the message content from the lines that follow the headers.
  #
  # Multipart messages are split on the boundary declared in their content
  # type, and every part is parsed as a message of its own and stored under
  # :parts. Otherwise the lines are joined, decoded according to the message's
  # transfer encoding and stored under :body; with no lines at all the message
  # is returned unchanged.
  defp parse_body(%Mail.Message{multipart: true} = message, lines) do
    boundary = Mail.Proplist.get(message.headers["content-type"], "boundary")
    parts = for part <- extract_parts(lines, boundary), do: parse(part)

    Map.put(message, :parts, parts)
  end

  defp parse_body(%Mail.Message{} = message, []), do: message

  defp parse_body(%Mail.Message{} = message, lines) do
    body = decode(join_body(lines), message)
    Map.put(message, :body, body)
  end

  # Joins body lines with CRLF, leaving out one empty line at the very end of
  # `lines` if there is one. `acc` holds already-collected lines, most recent
  # first, which are placed before `lines`.
  defp join_body(lines, acc \\ []) do
    kept = if List.last(lines) == "", do: Enum.drop(lines, -1), else: lines

    Enum.join(Enum.reverse(acc) ++ kept, "\r\n")
  end

  # Splits the body lines of a multipart message into its parts, returning a
  # list with one list of lines per part, in message order.
  #
  # `acc` collects the lines of the part being read (most recent first) and
  # `parts` the completed parts (most recent first). `parts` stays `nil` until
  # the first boundary line has been seen. The clauses rely on their order.
  defp extract_parts(lines, boundary, acc \\ [], parts \\ nil)

  # End of input. Lines still held in `acc` are not turned into a part, so a
  # part that is never closed by a boundary line is left out.
  defp extract_parts([], _boundary, _acc, parts),
    do: Enum.reverse(List.wrap(parts))

  # First "--boundary" line: start collecting parts
  defp extract_parts(["--" <> boundary | tail], boundary, acc, nil),
    do: extract_parts(tail, boundary, acc, [])

  # Following "--boundary" line: the part read so far is complete
  defp extract_parts(["--" <> boundary | tail], boundary, acc, parts),
    do: extract_parts(tail, boundary, [], [Enum.reverse(acc) | parts])

  # Any other line starting with "--": the closing delimiter "--boundary--"
  # completes the last part and stops reading (whatever follows is ignored);
  # everything else is ordinary content.
  # NOTE(review): this clause comes before the "skip until first boundary"
  # clause below, so a "--" line ahead of the first boundary is added to `acc`
  # and ends up in the first part, and a closing delimiter seen while `parts`
  # is still nil builds `[... | nil]` - confirm whether either can occur.
  defp extract_parts([<<"--" <> rest>> = line | tail], boundary, acc, parts) do
    if rest == boundary <> "--" do
      extract_parts([], boundary, [], [Enum.reverse(acc) | parts])
    else
      extract_parts(tail, boundary, [line | acc], parts)
    end
  end

  # Before the first boundary: skip the line
  defp extract_parts([_line | tail], boundary, acc, nil),
    do: extract_parts(tail, boundary, acc, nil)

  # Inside a part: collect the line
  defp extract_parts([head | tail], boundary, acc, parts),
    do: extract_parts(tail, boundary, [head | acc], parts)

  # Normalises a parameter name: surrounding whitespace removed, lower-cased and
  # with "-" replaced by "_". Despite the name, the result is a string.
  defp key_to_atom(key) do
    normalized = String.downcase(String.trim(key))
    String.replace(normalized, "-", "_")
  end

  # Tells whether the headers describe a multipart message, i.e. whether the
  # parsed "content-type" header carries a "boundary" parameter. A missing
  # content type, or one that is a plain string, is never multipart.
  defp multipart?(headers) do
    case headers["content-type"] do
      nil -> false
      type when is_binary(type) -> false
      proplist -> Mail.Proplist.get(proplist, "boundary") not in [nil, false]
    end
  end

  # Decodes the body, with trailing whitespace removed, using the encoding named
  # by the message's "content-transfer-encoding" header.
  defp decode(body, message) do
    Mail.Encoder.decode(
      String.trim_trailing(body),
      Mail.Message.get_header(message, "content-transfer-encoding")
    )
  end
end