lib/makeup/lexers/html_lexer.ex

defmodule Makeup.Lexers.HTMLLexer do
  @moduledoc """
  A `Makeup.Lexer` for the HTML language, for use with the
  Makeup syntax highlighting library.
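
  ## Example

      Makeup.Lexers.HTMLLexer.lex("<p>Hello</p>")

  returns a list of `{token_type, metadata, value}` tuples that any Makeup
  formatter can consume.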
  """
  @behaviour Makeup.Lexer

  import NimbleParsec
  import Makeup.Lexer.Combinators
  import Makeup.Lexer.Groups
  import Makeup.Lexers.HTMLLexer.Combinators
  alias Makeup.Lexers.HTMLLexer.HTMLElements
  alias Makeup.Lexers.HTMLLexer.HTMLAttributes

  # All element and attribute names, sorted longest-first so that, inside a
  # choice/1, longer names such as "header" are tried before their prefixes
  # such as "head"
  @keywords (HTMLElements.get_elements() ++
               HTMLAttributes.get_attributes() ++ HTMLAttributes.get_event_handler_attributes())
            |> Enum.sort_by(&String.length/1, :desc)

  @attributes (HTMLAttributes.get_attributes() ++ HTMLAttributes.get_event_handler_attributes())
              |> MapSet.new()
              |> MapSet.difference(MapSet.new(HTMLElements.get_elements()))
              |> MapSet.to_list()
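
  # @attributes holds the attribute names that are not also element names;
  # attributify/3 below uses it to decide whether a bare keyword should be
  # re-tagged as an attribute.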

  ###################################################################
  # Step #1: tokenize the input (into a list of tokens)
  ###################################################################

  # ASCII whitespace per the HTML spec: TAB, LF, FF, CR and SPACE
  wspace = ascii_string([?\t, ?\r, ?\s, ?\n, ?\f], min: 1)

  whitespace =
    wspace
    |> token(:whitespace)

  # Doctype
  # Legacy doctype string: `SYSTEM "about:legacy-compat"`, with the URL in
  # single or double quotes
  legacy_doctype_string =
    optional(wspace)
    |> concat(anycase_string("SYSTEM"))
    |> optional(wspace)
    |> concat(
      choice([
        string("\"about:legacy-compat\""),
        string("'about:legacy-compat'")
      ])
    )

  doctype =
    "<!"
    |> string()
    |> concat(anycase_string("DOCTYPE"))
    |> optional(wspace)
    |> concat(anycase_string("html"))
    |> optional(legacy_doctype_string)
    |> optional(wspace)
    |> concat(string(">"))
    |> token(:keyword)
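
  # Illustrative inputs accepted by doctype (matched case-insensitively):
  #
  #     <!DOCTYPE html>
  #     <!doctype html SYSTEM "about:legacy-compat">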

  # Operators: the only one recognized is the "=" between an attribute name
  # and its value
  operators =
    "="
    |> string()
    |> token(:operator)

  # Combinator that highlights expressions surrounded by the HTML comment delimiters.
  comment_tag = many_surrounded_by(parsec(:root_element), "<!--", "-->")

  # Punctuation symbols that open and close tags
  open_tag =
    "<"
    |> string()
    |> token(:punctuation)

  close_tag =
    ">"
    |> string()
    |> token(:punctuation)

  close_self_tag =
    "/>"
    |> string()
    |> token(:punctuation)

  open_closing_tag =
    "</"
    |> string()
    |> token(:punctuation)

  # Keywords
  keywords = Enum.map(@keywords, &keyword/1)

  # Catch-all: any single codepoint not matched above, tagged :char so that
  # postprocessing can merge runs of them into strings
  insensitive_char = utf8_char([]) |> token(:char)

  # Tag the tokens with the language name.
  # This makes it easier to postprocess files with multiple languages.
  @doc false
  def __as_html_language__({ttype, meta, value}) do
    {ttype, Map.put(meta, :language, :html), value}
  end
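
  # For example (illustrative):
  #
  #     __as_html_language__({:keyword, %{}, "html"})
  #     #=> {:keyword, %{language: :html}, "html"}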

  root_element_combinator =
    choice(
      [
        # Doctype
        doctype,
        # Operators
        operators,
        # Delimiters
        comment_tag,
        open_closing_tag,
        open_tag,
        close_self_tag,
        close_tag,
        # Whitespaces
        whitespace
      ] ++
        keywords ++
        [
          # Unmatched
          insensitive_char
        ]
    )
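
  # The order of the alternatives matters: doctype and comment_tag (which
  # begin with "<!") and the two-character "</" are tried before the bare
  # "<", and the catch-all insensitive_char is tried last.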

  ##############################################################################
  # Semi-public API: these two functions can be used by someone who wants to
  # embed this lexer into another lexer, but other than that, they are not
  # meant to be used by end-users
  ##############################################################################
  # Controls NimbleParsec's :inline option for the parsers below
  # (defaults to false)
  @inline Application.compile_env(:makeup_html, :inline, false)

  # @impl Makeup.Lexer
  defparsec(
    :root_element,
    root_element_combinator |> map({__MODULE__, :__as_html_language__, []}),
    inline: @inline
  )

  # @impl Makeup.Lexer
  defparsec(
    :root,
    repeat(parsec(:root_element)),
    inline: @inline
  )
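
  # A lexer embedding HTML can reuse these parsecs with, e.g.,
  # parsec({Makeup.Lexers.HTMLLexer, :root_element}).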

  ###################################################################
  # Step #2: postprocess the list of tokens
  ###################################################################

  ###
  # Merges a list of tokens into a single :string token
  ###
  defp merge_string([{_, _, string} | tokens], result) when is_list(string),
    do: merge_string(tokens, result ++ string)

  defp merge_string([{_, _, string} | tokens], result) when is_binary(string),
    do: merge_string(tokens, result ++ [string])

  defp merge_string([{_, _, string} | tokens], result) when is_integer(string),
    do: merge_string(tokens, result ++ [string])

  defp merge_string([], []), do: []
  defp merge_string([], result), do: [{:string, %{language: :html}, result}]

  defp merge_string(stringlist), do: stringlist |> merge_string([])
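
  # For example (metas elided):
  #
  #     merge_string([{:char, _, ?a}, {:char, _, "b"}])
  #     #=> [{:string, %{language: :html}, [?a, "b"]}]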

  ###
  # Converts traces of the form [char]+ into a single string
  ###
  defp char_stringify(tokens), do: tokens |> char_stringify([], [])

  defp char_stringify([{:char, _attr, _value} = token | tokens], charlist, result),
    do: char_stringify(tokens, charlist ++ [token], result)

  defp char_stringify([token | tokens], charlist, result),
    do: char_stringify(tokens, [], result ++ merge_string(charlist) ++ [token])

  defp char_stringify([], charlist, result), do: result ++ merge_string(charlist)
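
  # In other words, every maximal run of consecutive :char tokens is
  # collapsed into a single :string token; all other tokens pass through
  # unchanged.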

  ###
  # Converts the proper keywords into attributes
  ###
  defp attributify(tokens),
    do: tokens |> attributify(false, [])

  defp attributify(
         [
           {:keyword, attr, value},
           {:operator, _, _} = operator,
           {_, attr2, value2} | tokens
         ],
         flag,
         result
       ),
       do:
         attributify(
           tokens,
           flag,
           result ++ [{:name_attribute, attr, value}, operator, {:string, attr2, value2}]
         )

  defp attributify(
         [
           {:punctuation, _, "<"} = punctuation,
           {:keyword, _, _} = keyword,
           {:whitespace, _, _} = whitespace | tokens
         ],
         _,
         result
       ),
       do:
         attributify(
           tokens,
           true,
           result ++
             [punctuation, keyword, whitespace]
         )

  defp attributify([{:punctuation, _, ">"} = punctuation | tokens], true, result),
    do: attributify(tokens, false, result ++ [punctuation])

  defp attributify([{:keyword, attr, value} | tokens], true, result),
    do: attributify(tokens, true, result ++ [{:name_attribute, attr, value}])

  defp attributify([{:keyword, attr, value} | tokens], flag, result) do
    attribute =
      if Enum.member?(@attributes, value),
        do: {:name_attribute, attr, value},
        else: {:keyword, attr, value}

    attributify(
      tokens,
      flag,
      result ++
        [attribute]
    )
  end

  defp attributify([token | tokens], flag, result),
    do: attributify(tokens, flag, result ++ [token])

  defp attributify([], _, result), do: result
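
  # For example, in `<div class="x">` the element name "div" stays a
  # :keyword, while "class" (followed by the "=" operator) is re-tagged as a
  # :name_attribute and its quoted value is tagged as a :string.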

  ###
  # Converts traces of the forms
  # string[keyword]+
  # keyword[keyword]+
  # [keyword]+string
  # into a single string
  ###
  defp keyword_stringify(tokens), do: tokens |> keyword_stringify([], [])

  defp keyword_stringify(
         [{:string, _, _} = string, {:keyword, _, _} = keyword | tokens],
         queue,
         result
       ),
       do: keyword_stringify(tokens, queue ++ [string, keyword], result)

  defp keyword_stringify(
         [{:keyword, _, _} = keyword, {:string, _, _} = string | tokens],
         queue,
         result
       ),
       do: keyword_stringify(tokens, queue ++ [keyword, string], result)

  defp keyword_stringify(
         [{:keyword, _, _} = keyword1, {:keyword, _, _} = keyword2 | tokens],
         queue,
         result
       ),
       do: keyword_stringify(tokens, queue ++ [keyword1, keyword2], result)

  defp keyword_stringify([{:keyword, _, _} = token | tokens], [], result),
    do: keyword_stringify(tokens, [], result ++ [token])

  defp keyword_stringify([{:keyword, _, _} = token | tokens], queue, result),
    do: keyword_stringify(tokens, queue ++ [token], result)

  defp keyword_stringify([{:string, _, _} = token | tokens], queue, result),
    do: keyword_stringify(tokens, [], result ++ merge_string(queue ++ [token]))

  defp keyword_stringify([token | tokens], queue, result),
    do: keyword_stringify(tokens, [], result ++ merge_string(queue) ++ [token])

  defp keyword_stringify([], queue, result),
    do: result ++ merge_string(queue)
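
  # For example (illustrative): the text `heading` is tokenized as the
  # keyword "head" followed by the characters "ing"; since a keyword directly
  # followed by plain text cannot be a tag or attribute name, the pair is
  # merged back into the single string "heading".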

  ###
  # Converts traces of the form "<!--"[token]*"-->" into a comment
  ###
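  # Per the HTML spec, comment text must not start with ">" or "->", must not
  # contain "<!--", "-->" or "--!>", and must not end with "<!-"; "comments"
  # violating these rules are demoted to plain strings below.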
  defp commentify(tokens), do: tokens |> commentify({nil, []}, [])

  defp commentify([{:punctuation, group, "<!--"} = token | tokens], {nil, []}, result),
    do: commentify(tokens, {group, [token]}, result)

  defp commentify([{:punctuation, group, "-->"} = token | tokens], {group, queue}, result) do
    [{_type, _attr, string}] = merge_string(queue ++ [token])

    comment_content =
      string
      |> List.to_string()
      |> String.replace_prefix("<!--", "")
      |> String.replace_suffix("-->", "")

    if String.starts_with?(comment_content, [">", "->"]) or
         String.contains?(comment_content, ["<!--", "-->", "--!>"]) or
         String.ends_with?(comment_content, "<!-"),
       do:
         commentify(
           tokens,
           {nil, []},
           result ++ [{:string, %{language: :html}, string}]
         ),
       else:
         commentify(
           tokens,
           {nil, []},
           result ++ [{:comment, %{language: :html}, string}]
         )
  end

  defp commentify([], {_group, queue}, result),
    do: result ++ merge_string(queue)

  defp commentify([token | tokens], {nil, _}, result),
    do: commentify(tokens, {nil, []}, result ++ [token])

  defp commentify([token | tokens], {group, queue}, result),
    do: commentify(tokens, {group, queue ++ [token]}, result)

  ###
  # Converts the content of an element into a string
  ###
  defp element_stringify(tokens), do: tokens |> element_stringify(false, [], [])

  defp element_stringify(
         [{:punctuation, _, ">"} = punctuation | tokens],
         _,
         queue,
         result
       ),
       do: element_stringify(tokens, true, [], result ++ merge_string(queue) ++ [punctuation])

  # Comments are preserved as-is
  defp element_stringify(
         [{:comment, _, _} = comment | tokens],
         _,
         queue,
         result
       ),
       do: element_stringify(tokens, true, [], result ++ merge_string(queue) ++ [comment])

  defp element_stringify(
         [{:punctuation, _, "</"} = punctuation | tokens],
         true,
         queue,
         result
       ),
       do: element_stringify(tokens, false, [], result ++ merge_string(queue) ++ [punctuation])

  defp element_stringify(
         [{:punctuation, _, "<"} = punctuation | tokens],
         true,
         queue,
         result
       ),
       do: element_stringify(tokens, false, [], result ++ merge_string(queue) ++ [punctuation])

  defp element_stringify([token | tokens], false, _, result),
    do: element_stringify(tokens, false, [], result ++ [token])

  defp element_stringify([token | tokens], true, queue, result),
    do: element_stringify(tokens, true, queue ++ [token], result)

  defp element_stringify([], _, queue, result),
    do: result ++ queue
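
  # For example, in `<p>a=b</p>` the content tokens between ">" and "</"
  # (two strings and the "=" operator) are merged into the single string
  # "a=b".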

  @impl Makeup.Lexer
  def postprocess(tokens, _opts \\ []) do
    tokens
    |> char_stringify()
    |> commentify()
    |> keyword_stringify()
    |> attributify()
    |> element_stringify()
  end
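
  # The order of these passes matters: for instance, keyword_stringify/1 must
  # run before attributify/1 so that only structural keywords remain when
  # attribute names are decided.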

  #######################################################################
  # Step #3: highlight matching delimiters
  #######################################################################
  @impl Makeup.Lexer
  defgroupmatcher(:match_groups,
    comment_tag: [
      open: [[{:punctuation, _, "<!--"}]],
      close: [[{:punctuation, _, "-->"}]]
    ],
    start_closing_tag: [
      open: [[{:punctuation, _, "</"}]],
      close: [[{:punctuation, _, ">"}]]
    ],
    start_tag: [
      open: [[{:punctuation, _, "<"}]],
      close: [[{:punctuation, _, ">"}], [{:punctuation, _, "/>"}]]
    ]
  )
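
  # For example, the "<" that opens a start tag is paired with the ">" or
  # "/>" that closes it, so matching delimiters can be highlighted together.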

  # Finally, the public API for the lexer
  @impl Makeup.Lexer
  def lex(text, opts \\ []) do
    group_prefix = Keyword.get(opts, :group_prefix, random_prefix(10))
    {:ok, tokens, "", _, _, _} = root(text)

    tokens
    |> postprocess()
    |> match_groups(group_prefix)
  end
end