lib/xml_query.ex

defmodule XmlQuery do
  # @related [tests](test/xml_query_test.exs)

  @moduledoc """
  A concise API for querying XML. XML parsing is handled by Erlang/OTP’s built-in
  [xmerl](https://www.erlang.org/doc/man/xmerl) library.

  We created a related library called [HtmlQuery](https://hexdocs.pm/html_query/readme.html) which has the same API but
  is used for querying HTML. You can read more about them in
  [Querying HTML and XML in Elixir with HtmlQuery and XmlQuery](https://eahanson.com/articles/html-query-xml-query).

  ## Data types

  All functions accept XML in the form of a string, an `Xmerl.xml_attribute`, an `Xmerl.xml_document`, an
  `Xmerl.xml_element`, an `Xmerl.xml_text`, an `XmlQuery.Element`, or anything that implements the `String.Chars`
  protocol.

  ## Query functions

  | `all/2`   | return all elements matching the selector                   |
  | `find/2`  | return the first element that matches the selector          |
  | `find!/2` | return the only element that matches the selector, or raise |

  ## Extraction functions

  | `attr/2` | returns the attribute value as a string      |
  | `text/1` | returns the text contents as a single string |

  ## Parsing & utility functions

  | `parse/1` | parses XML into an `XmlQuery.Element`, `XmlQuery.Attribute`, or `XmlQuery.Text` |
  | `pretty/1` | prettifies XML |

  ## Alias

  If you use XmlQuery a lot, you may want to alias it to the recommended shortcut "Xq":

  ```elixir
  alias XmlQuery, as: Xq
  ```
  """

  import Record
  alias XmlQuery.QueryError
  alias XmlQuery.Xmerl
  require XmlQuery.Xmerl

  # Everything the public functions take as their `xml` argument. Attribute and text
  # nodes are listed explicitly because `parse/1` wraps xmerl attribute/text tuples and
  # passes `XmlQuery.Attribute` / `XmlQuery.Text` structs through unchanged.
  @type xml() ::
          xml_binary()
          | xml_attribute()
          | xml_document()
          | xml_element()
          | xml_text()
          | XmlQuery.Attribute.t()
          | XmlQuery.Element.t()
          | XmlQuery.Text.t()
          | String.Chars.t()
  @type xml_attribute() :: Xmerl.xml_attribute()
  @type xml_binary() :: binary()
  @type xml_document() :: Xmerl.xml_document()
  @type xml_element() :: Xmerl.xml_element()
  @type xml_text() :: Xmerl.xml_text()
  # An XPath expression; binaries are converted to charlists before being handed to xmerl.
  @type xpath() :: binary() | charlist()

  # The module's name as a string, used when building hints in error messages.
  @module_name inspect(__MODULE__)

  @doc """
  Guard that is true when `struct` is an `XmlQuery.Attribute`, `XmlQuery.Element`, or `XmlQuery.Text` struct.
  """
  defguard is_xml_struct(struct)
           when is_struct(struct, XmlQuery.Attribute) or
                  is_struct(struct, XmlQuery.Element) or
                  is_struct(struct, XmlQuery.Text)

  @doc """
  Finds all nodes in an XML document that match `xpath`, returning a list of structs.

  Depending on the given xpath, the matches are wrapped as `XmlQuery.Element`, `XmlQuery.Attribute`, or
  `XmlQuery.Text` structs.

  `xpath` may be a binary or a charlist. `xml` may be an XML string, an `:xmerl` tuple, one of the `XmlQuery`
  structs, or any other struct that implements `String.Chars` (it is converted with `to_string/1` and parsed).

  ``` elixir
  iex> xml = ~s|<cart id="123"> <fruit name="apple" color="red"/> <fruit name="banana" color="yellow"/> </cart>|
  iex> XmlQuery.all(xml, "//fruit") |> Enum.map(&to_string/1)
  ["<fruit name=\\"apple\\" color=\\"red\\"/>",
   "<fruit name=\\"banana\\" color=\\"yellow\\"/>"]
  ```
  """
  @spec all(xml(), xpath()) :: [XmlQuery.Element.t() | XmlQuery.Attribute.t() | XmlQuery.Text.t()]
  # :xmerl_xpath works on charlists, so binary xpaths are converted first.
  def all(xml, xpath) when is_binary(xpath),
    do: all(xml, String.to_charlist(xpath))

  # Already-parsed nodes are queried directly through the xmerl record they wrap.
  def all(xml, xpath) when is_xml_struct(xml),
    do: xpath |> :xmerl_xpath.string(xml.shadows) |> Enum.map(&into/1)

  # Strings, xmerl tuples, and `String.Chars` structs are parsed first; `parse/1` returns
  # an XmlQuery struct for each of these, so the recursion ends in the clause above.
  def all(xml, xpath) when is_binary(xml) or is_tuple(xml) or is_struct(xml),
    do: xml |> parse() |> all(xpath)

  @doc """
  Returns the value of `attr` from the outermost element of `xml` as a string, or `nil` when the element has
  no such attribute.

  `attr` may be an atom or a string; it is interpolated into an `@attr` XPath query. Raises
  `XmlQuery.QueryError` when `xml` is a list containing more than one node.

  ``` elixir
  iex> xml = ~s|<cart id="123"> <fruit name="apple" color="red"/> <fruit name="banana" color="yellow"/> </cart>|
  iex> XmlQuery.attr(xml, :id)
  "123"
  ```
  """
  @spec attr(xml(), atom() | String.t()) :: String.t() | nil
  def attr(xml, attr) do
    element =
      xml
      |> parse()
      |> first!("Consider using Enum.map(xml, &#{@module_name}.attr(&1, #{inspect(attr)}))")

    case find(element, "@#{attr}") do
      # The attribute's value is converted with `to_string/1` so callers always get a binary.
      %XmlQuery.Attribute{value: value} -> to_string(value)
      nil -> nil
    end
  end

  @doc """
  Returns the first node (element, attribute, or element text) in `xml` that matches `xpath`, or `nil` when
  nothing matches.

  ``` elixir
  iex> alias XmlQuery, as: Xq
  iex> xml = \"""
  ...> <?xml version="1.0"?>
  ...> <root><child property="oldest" /><child property="youngest" /></root>
  ...> \"""
  iex> %Xq.Element{name: :child, attributes: [%Xq.Attribute{value: ~c"oldest"}]} = Xq.find(xml, "//child")

  ```
  """
  @spec find(xml(), xpath()) :: XmlQuery.Element.t() | XmlQuery.Attribute.t() | XmlQuery.Text.t() | nil
  def find(xml, xpath) do
    case all(xml, xpath) do
      [first_match | _rest] -> first_match
      [] -> nil
    end
  end

  @doc """
  Returns the single node in `xml` that matches `xpath`.

  Works like `find/2`, except that an `XmlQuery.QueryError` is raised when the number of matching nodes is
  anything other than one.
  """
  @spec find!(xml(), xpath()) :: XmlQuery.Element.t() | XmlQuery.Attribute.t() | XmlQuery.Text.t()
  def find!(xml, xpath) do
    matches = all(xml, xpath)
    first!(matches, "XPath: #{xpath}")
  end

  @doc """
  Parses an XML document using `:xmerl_scan.string/2`, returning an `XmlQuery.Element` struct.

  Given an xml tuple that has already been created by `:xmerl`, wraps the tuple in an
  `XmlQuery`-specific struct.

  Other inputs are handled according to their shape:

    * an `XmlQuery.Element`, `XmlQuery.Attribute`, or `XmlQuery.Text` struct is returned unchanged;
    * a non-empty list whose first item is one of those structs is returned unchanged;
    * any other struct must implement `String.Chars`; it is converted with `to_string/1` and then parsed.
      A `RuntimeError` is raised when it does not implement the protocol.

  ``` elixir
  iex> xml = \"""
  ...> <?xml version="1.0"?>
  ...> <root />
  ...> \"""
  iex> %Xq.Element{name: :root} = XmlQuery.parse(xml)

  iex> xml = \"""
  ...> <?xml version="1.0"?>
  ...> <root property="root-value" />
  ...> \"""
  iex> %Xq.Attribute{name: :property, value: ~c"root-value"} = XmlQuery.find(xml, "//root/@property") |> XmlQuery.parse()

  ```
  """
  @spec parse(xml() | [XmlQuery.Element.t() | XmlQuery.Attribute.t() | XmlQuery.Text.t()]) ::
          XmlQuery.Element.t()
          | XmlQuery.Attribute.t()
          | XmlQuery.Text.t()
          | [XmlQuery.Element.t() | XmlQuery.Attribute.t() | XmlQuery.Text.t()]
  def parse(node) when is_xml_struct(node),
    do: node

  # Only the head of the list is checked; the list itself is handed back as-is.
  def parse([node | _] = list) when is_xml_struct(node),
    do: list

  def parse(xml) when is_tuple(xml),
    do: into(xml)

  def parse(xml) when is_binary(xml) do
    # NOTE(review): element and attribute names come back as atoms (see the examples above), so
    # input with many distinct names could grow the atom table — confirm callers only parse trusted XML.
    #
    # The second element of the scanner's result is asserted to be `[]`; anything else
    # fails the match here rather than being silently dropped.
    {doc, []} =
      xml
      |> String.to_charlist()
      |> :xmerl_scan.string(acc_fun: &accumulate_xml/3, quiet: true, space: :normalize, xmlbase: ~c"/")

    into(doc)
  end

  # Any other struct is parsed via its string representation.
  def parse(%_{} = xml),
    do: xml |> implements!(String.Chars) |> to_string() |> parse()

  @doc """
  Returns `xml` as a prettified string.

  Elements and text nodes are sorted and indented relative to parent elements.

  `xml` may be one of the `XmlQuery` structs, an XML string, an `:xmerl` tuple, or any other struct that
  implements `String.Chars`; everything except the `XmlQuery` structs is run through `parse/1` first.
  """
  @spec pretty(xml()) :: binary()
  # Each XmlQuery struct module supplies its own `pretty/1`.
  def pretty(node) when is_xml_struct(node),
    do: node.__struct__.pretty(node)

  # `parse/1` returns an XmlQuery struct for these inputs, so the recursion ends in the clause above.
  def pretty(xml) when is_binary(xml) or is_tuple(xml) or is_struct(xml),
    do: xml |> parse() |> pretty()

  @doc """
  Returns the text contents of `xml` as a single string.

  The non-empty text nodes found beneath the element are joined with single spaces.

  ``` elixir
  iex> xml = "<name><first>Alice</first><middle>A.</middle><last>Aliceston</last></name>"
  iex> XmlQuery.text(xml)
  "Alice A. Aliceston"
  iex> xml |> XmlQuery.find("//name/first") |> XmlQuery.text()
  "Alice"
  ```
  """
  @spec text(xml()) :: binary()
  def text(xml) do
    node =
      xml
      |> parse()
      |> first!("Consider using Enum.map(xml, &#{@module_name}.text/1)")

    case node do
      %XmlQuery.Element{shadows: doc} ->
        ~c"//text()"
        |> :xmerl_xpath.string(doc)
        |> Enum.map(&XmlQuery.Text.to_string/1)
        |> Enum.reject(&(&1 == ""))
        # Trimming after every append keeps the running result free of leading/trailing whitespace.
        |> Enum.reduce("", fn fragment, joined -> String.trim(joined <> " " <> fragment) end)
    end
  end

  # # #

  @doc false
  # Wraps a raw xmerl record in the matching XmlQuery struct, dispatching on the record tag.
  # `nil` is passed through untouched.
  def into(nil) do
    nil
  end

  def into(record) when is_record(record, :xmlAttribute) do
    XmlQuery.Attribute.new(record)
  end

  def into(record) when is_record(record, :xmlElement) do
    XmlQuery.Element.new(record)
  end

  def into(record) when is_record(record, :xmlText) do
    XmlQuery.Text.new(record)
  end

  # # #

  # Accumulator callback handed to `:xmerl_scan.string/2` as `acc_fun` by `parse/1`.
  #
  # A text node whose value is exactly one space is left out of the accumulator; the
  # three-element return carries that node's `:pos` field alongside the unchanged
  # accumulator. Every other node is prepended to the accumulator.
  defp accumulate_xml({:xmlText, _, _, _, ~c" ", _} = blank, nodes, state),
    do: {nodes, Xmerl.xmlText(blank, :pos), state}

  defp accumulate_xml(node, nodes, state) do
    {[node | nodes], state}
  end

  # Unwraps a result that is expected to hold exactly one node.
  #
  # A one-element list yields its element and a bare XmlQuery struct is returned as-is.
  # An empty list, or anything else, raises `QueryError` with `hint` appended.
  defp first!(xml, hint) do
    case xml do
      [] ->
        raise QueryError, """
        Expected a single XML element but found none.

        #{hint}
        """

      [only] ->
        only

      node when is_xml_struct(node) ->
        node

      _other ->
        raise QueryError, """
        Expected a single XML node but found multiple:

        #{hint}
        """
    end
  end

  # Returns `x` unchanged when `protocol` has an implementation for it; raises a
  # `RuntimeError` naming the value and the protocol otherwise.
  defp implements!(x, protocol) do
    case protocol.impl_for(x) do
      nil -> raise "Expected #{inspect(x)} to implement protocol #{inspect(protocol)}"
      _impl -> x
    end
  end
end