# Copyright(c) 2015-2023 ACCESS CO., LTD. All rights reserved.
use Croma
defmodule Antikythera.Xml do
el = inspect(__MODULE__.Element)
@moduledoc """
Convenient XML parser module wrapping [fast_xml](https://github.com/processone/fast_xml).
`decode/2` can parse XML into `#{el}.t`, and `encode/2` can serialize `#{el}.t` back to XML string.
`#{el}.t` is XML element data structure, and it is JSON-convertible struct.
You can safely convert them to JSON using `Poison.encode/2` while keeping order of appearance of children,
and also convert them back to `#{el}.t` with `Poison.decode/2` and `#{el}.new/1`.
Note that order of attributes will not be preserved, since it is not significant.
See [here](https://www.w3.org/TR/xml/#sec-starttags)
Namespace of tags (e.g. "ns" in `<ns:tag>`) are kept as is in `:name` of elements.
Namespace definitions (e.g. `xmlns:ns='http://example.com/ns'`) are treated as plain attributes,
and kept as is in `:attributes` of elements.
## `Access` behaviour
`#{el}` implements `Access` behaviour for convenient lookups and updates.
Following access patterns are available:
- `element[:name]`, `element[:attributes]`, `element[:children]`
- Fetch values of fields in dynamic lookup style.
- `element["@some_attr"]`
- Fetch value of "some_attr" in `:attributes` map.
- `element[:texts]`
- Fetch text (character data) children. It always returns list.
- `element["some_name"]`
- Fetch child elements with `name: "some_name"`. It always returns list.
You can also use these patterns in `Kernel.get_in/2` and its variants.
iex> xml = "<a>foo<b>bar</b>baz</a>"
iex> element = #{inspect(__MODULE__)}.decode!(xml)
%#{el}{name: "a", attributes: %{}, children: [
"foo",
%#{el}{name: "b", attributes: %{}, children: ["bar"]},
"baz",
]}
iex> get_in(element, [:texts])
["foo", "baz"]
iex> get_in(element, ["b", Access.at(0), :texts])
["bar"]
iex> get_and_update_in(element, [:children, Access.at(0)], fn _ -> :pop end)
{"foo",
%#{el}{name: "a", attributes: %{}, children: [
%#{el}{name: "b", attributes: %{}, children: ["bar"]},
"baz",
]}}
iex> update_in(element, [:children, Access.all()], fn
...> text when is_binary(text) -> %#{el}{name: "b", attributes: %{}, children: [text]}
...> e -> e
...> end)
%#{el}{name: "a", attributes: %{}, children: [
%#{el}{name: "b", attributes: %{}, children: ["foo"]},
%#{el}{name: "b", attributes: %{}, children: ["bar"]},
%#{el}{name: "b", attributes: %{}, children: ["baz"]},
]}
iex> update_in(element, ["@id"], fn _ -> "001" end)
%#{el}{name: "a", attributes: %{"id" => "001"}, children: [
"foo",
%#{el}{name: "b", attributes: %{}, children: ["bar"]},
"baz",
]}
Notes on updating with `Kernel.get_and_update_in/3` and its variants:
- Struct fields are static and cannot be popped.
- Custom access keys except "@some_attr" cannot be used in updating.
Use `:children` instead, in order to update children while preserving order of appearance.
"""
alias Croma.Result, as: R
defmodule Content do
alias Antikythera.Xml.Element
@type t :: String.t() | Element.t()
defun valid?(v :: term) :: boolean do
is_binary(v) or Element.valid?(v)
end
defun new(v :: term) :: R.t(t) do
s when is_binary(s) -> {:ok, s}
m when is_map(m) -> Element.new(m)
_ -> {:error, {:invalid_value, [__MODULE__]}}
end
end
defmodule Element do
use Croma.Struct,
recursive_new?: true,
fields: [
name: Croma.String,
attributes: Croma.Map,
children: Croma.TypeGen.list_of(Content)
]
@behaviour Access
# Access behaviour implementations
@impl true
def fetch(%__MODULE__{name: n}, :name), do: {:ok, n}
def fetch(%__MODULE__{attributes: a}, :attributes), do: {:ok, a}
def fetch(%__MODULE__{children: c}, :children), do: {:ok, c}
def fetch(%__MODULE__{attributes: a}, "@" <> attribute), do: Map.fetch(a, attribute)
def fetch(%__MODULE__{children: c}, :texts), do: {:ok, Enum.filter(c, &is_binary/1)}
def fetch(%__MODULE__{children: c}, key) when is_binary(key),
do: {:ok, Enum.filter(c, &has_name?(&1, key))}
def fetch(%__MODULE__{}, _), do: :error
defp has_name?(%__MODULE__{name: n}, n), do: true
defp has_name?(_, _), do: false
@impl true
def get_and_update(%__MODULE__{} = e, key, f) when key in [:name, :attributes, :children] do
case e |> Map.fetch!(key) |> f.() do
{get_value, new_value} -> {get_value, update_struct_field(e, key, new_value)}
:pop -> raise "Cannot pop struct field!"
end
end
def get_and_update(%__MODULE__{attributes: as} = e, "@" <> attribute, f) do
current_value = Map.get(as, attribute)
case f.(current_value) do
{get_value, new_attr} when is_binary(new_attr) ->
{get_value, %__MODULE__{e | attributes: Map.put(as, attribute, new_attr)}}
:pop ->
{current_value, %__MODULE__{e | attributes: Map.delete(as, attribute)}}
end
end
def get_and_update(_e, key, _f) do
raise ~s[#{inspect(__MODULE__)}.get_and_update/3 only accepts :name, :attributes, :children or "@attribute" as key for updating, got: #{inspect(key)}]
end
defp update_struct_field(%__MODULE__{} = e, :name, new_name) when is_binary(new_name),
do: %__MODULE__{e | name: new_name}
defp update_struct_field(%__MODULE__{} = e, :attributes, new_attrs) when is_map(new_attrs),
do: %__MODULE__{e | attributes: new_attrs}
defp update_struct_field(%__MODULE__{} = e, :children, new_children)
when is_list(new_children),
do: %__MODULE__{e | children: new_children}
@impl true
def pop(element, key) do
get_and_update(element, key, fn _ -> :pop end)
end
end
@type decode_option :: {:trim, boolean}
@doc """
Reads an XML string and parses it into `#{el}.t`.
Comments and header will be discarded.
It can read XHTML document as long as they are well-formatted,
though it does not understand Document Type Definition (DTD, header line with "<!DOCTYPE html PUBLIC ..."),
so you must remove them.
It tries to read a document with UTF-8 encoding, regardless of "encoding" attribute in the header.
Options:
- `:trim` - Drop whitespace-only texts. Default `false`.
- There are no universal way to distinguish significant and insignificant whitespaces,
so this option may alter the meaning of original document. Use with caution.
- In [W3C recommendation](https://www.w3.org/TR/REC-xml/#sec-white-space),
it is stated that whitespace texts (character data) are basically significant and must be preserved.
"""
defun decode(xml_string :: v[String.t()], opts :: v[[decode_option]] \\ []) :: R.t(Element.t()) do
case :fxml_stream.parse_element(xml_string) do
{:error, _} = e -> e
record -> from_record(record, Keyword.get(opts, :trim, false)) |> R.wrap_if_valid(Element)
end
end
defunp from_record({:xmlel, name, attrs, children} :: :fxml.xmlel(), trim :: v[boolean]) ::
Element.t() do
%Element{
name: name,
attributes: Map.new(attrs),
children: children(children, trim, [])
}
end
defp children([], _, acc), do: Enum.reverse(acc)
defp children([{:xmlcdata, text} | tail], true, acc),
do: children(tail, true, cons_trimmed(text, acc))
defp children([{:xmlcdata, text} | tail], _, acc), do: children(tail, false, [text | acc])
defp children([{:xmlel, _, _, _} = el | tail], trim, acc),
do: children(tail, trim, [from_record(el, trim) | acc])
defp cons_trimmed(text, acc) do
case String.trim(text) do
# Nothing other than whitespaces; must be indents
"" -> acc
# Otherwise, keep leading/trailing whitespaces since they may have meanings
_ -> [text | acc]
end
end
@xml_header ~S(<?xml version='1.0' encoding='UTF-8'?>)
@type encode_option :: {:pretty | :with_header, boolean}
@doc """
Serializes `#{el}.t` into XML string.
Specifications:
- Trailing newline will not be generated.
- All single- and double-quotations in attribute values or entity values are escaped to
`'` and `"` respectively.
- All attribute values are SINGLE-quoted.
- Does not insert a whitespace before "/>" in element without children.
Options:
- `:pretty` - Pretty print with 2-space indents. Default `false`.
- Similar to `:trim` option in `decode/2`, inserted whitespaces may be significant,
thus it can alter meaning of original document. Use with caution.
- It does not insert whitespaces to elements with [mixed-content](https://www.w3.org/TR/REC-xml/#sec-mixed-content)
and their descendants, in order to reduce probability to alter the meaning of original document.
- `:with_header` - Prepend `#{@xml_header}\\n`. Default `false`.
"""
defun encode(xml_element :: v[Element.t()], opts :: v[[encode_option]] \\ []) :: String.t() do
body =
xml_element |> to_record(Keyword.get(opts, :pretty, false), 0) |> :fxml.element_to_binary()
case opts[:with_header] do
true -> "#{@xml_header}\n" <> body
_ -> body
end
end
defunp to_record(content :: Content.t(), pretty? :: boolean, level :: non_neg_integer) ::
:fxml.xmlel() do
%Element{name: n, attributes: a, children: c}, true, level ->
{:xmlel, n, Map.to_list(a), prettified_children(c, level)}
%Element{name: n, attributes: a, children: c}, _, _ ->
{:xmlel, n, Map.to_list(a), Enum.map(c, &to_record(&1, false, 0))}
text, _, _ when is_binary(text) ->
{:xmlcdata, text}
end
defp prettified_children([], _level), do: []
# If there is only a single text child, directly produce non-prettified record
defp prettified_children([text], _level) when is_binary(text), do: [{:xmlcdata, text}]
defp prettified_children(children, level),
do: map_to_record_and_interleave_whitespaces(children, level)
@indent_unit " "
defp map_to_record_and_interleave_whitespaces(children, level) do
{children, mixed?} = map_to_record(children, level)
interleave_whitespaces(children, level, mixed?)
end
defp map_to_record(children, level) do
Enum.map_reduce(children, false, fn
text, _mixed? when is_binary(text) -> {{:xmlcdata, text}, true}
%Element{} = e, mixed? -> {to_record(e, !mixed?, level + 1), mixed?}
end)
end
defp interleave_whitespaces(children, _level, true), do: children
defp interleave_whitespaces(children, level, false) do
child_indent = {:xmlcdata, "\n" <> String.duplicate(@indent_unit, level + 1)}
close_tag_indent = {:xmlcdata, "\n" <> String.duplicate(@indent_unit, level)}
Enum.flat_map(children, &[child_indent, &1]) ++ [close_tag_indent]
end
R.define_bang_version_of(decode: 1, decode: 2)
end