# lib/unicode/grapheme_break.ex

defmodule Unicode.GraphemeClusterBreak do
  @moduledoc """
  Functions to introspect Unicode
  grapheme cluster breaks for binaries
  (Strings) and codepoints.

  """

  @behaviour Unicode.Property.Behaviour

  alias Unicode.Utils

  # Built once at compile time: the data returned by
  # `Utils.grapheme_breaks/0`, passed through `Utils.remove_annotations/1`.
  @grapheme_breaks Utils.remove_annotations(Utils.grapheme_breaks())

  @doc """
  Returns the map of Unicode grapheme cluster breaks.

  Each key is a grapheme cluster break name and each value
  is the list of codepoint ranges, expressed as tuples, that
  belong to that break.

  """
  def grapheme_breaks, do: @grapheme_breaks

  # The keys of the grapheme cluster break map, captured at compile time.
  @known_grapheme_breaks Map.keys(@grapheme_breaks)

  @doc """
  Returns the list of known Unicode grapheme cluster break names.

  Only the names that are keys of the map returned by
  `grapheme_breaks/0` are included. Alias names are
  not part of the result.

  """
  def known_grapheme_breaks, do: @known_grapheme_breaks

  # Alias table derived at compile time from the "gcb" entry of the
  # property value aliases. Each step is a `Unicode.Utils` helper; the
  # order of the steps is significant and is kept exactly as written.
  @grapheme_break_alias Utils.property_value_alias()
                        |> Map.get("gcb")
                        |> Utils.invert_map()
                        |> Utils.atomize_values()
                        |> Utils.downcase_keys_and_remove_whitespace()
                        |> Utils.add_canonical_alias()

  @doc """
  Returns the map of aliases for Unicode grapheme cluster breaks.

  An alias is an alternative name that can be used to refer
  to a grapheme cluster break. The `fetch/1` and `get/1`
  functions consult this map when they are given a name
  that is not an atom.

  """
  @impl Unicode.Property.Behaviour
  def aliases, do: @grapheme_break_alias

  @doc """
  Looks up the Unicode ranges for a grapheme cluster break.

  The ranges are returned as a list of 2-tuples.

  An atom argument is looked up directly in the map returned
  by `grapheme_breaks/0`. Any other argument is first normalised
  with `Unicode.Utils.downcase_and_remove_whitespace/1` and then
  resolved through `aliases/0` before the lookup, so aliases are
  only resolved for non-atom names.

  Returns either `{:ok, range_list}` or `:error`.

  """
  @impl Unicode.Property.Behaviour
  def fetch(grapheme_break) when is_atom(grapheme_break),
    do: Map.fetch(grapheme_breaks(), grapheme_break)

  def fetch(grapheme_break) do
    normalized = Utils.downcase_and_remove_whitespace(grapheme_break)

    # Fall back to the normalised name itself when no alias is registered.
    resolved = Map.get(aliases(), normalized, normalized)

    Map.fetch(grapheme_breaks(), resolved)
  end

  @doc """
  Returns the Unicode ranges for a grapheme cluster break.

  The ranges are returned as a list of 2-tuples.

  The lookup is delegated to `fetch/1`, so names are resolved
  in the same way as they are there.

  Returns either `range_list` or `nil` when `fetch/1`
  does not find the grapheme cluster break.

  """
  @impl Unicode.Property.Behaviour
  def get(grapheme_break) do
    grapheme_break
    |> fetch()
    |> case do
      {:ok, ranges} -> ranges
      _not_found -> nil
    end
  end

  @doc """
  Returns the number of characters that belong to
  a given grapheme cluster break.

  The count is the sum of the sizes of all the (inclusive)
  codepoint ranges returned by `fetch/1`. When `fetch/1`
  does not return `{:ok, ranges}`, its result (`:error`)
  is returned unchanged.

  ## Example

      iex> Unicode.GraphemeClusterBreak.count(:prepend)
      27

  """
  @impl Unicode.Property.Behaviour
  def count(grapheme_break) do
    case fetch(grapheme_break) do
      {:ok, ranges} ->
        ranges
        # Ranges are inclusive on both ends, hence the `+ 1`.
        |> Enum.map(fn {first, last} -> last - first + 1 end)
        |> Enum.sum()

      not_found ->
        not_found
    end
  end

  @doc """
  Returns the grapheme cluster break name(s) for the
  given binary or codepoint.

  For an integer codepoint, a single grapheme cluster
  break name is returned. A codepoint in `0..0x10FFFF` that
  is not covered by any known break is reported as `:other`.

  For a binary, the string is converted to its codepoints
  with `String.to_charlist/1` and the list of distinct
  grapheme cluster break names of those codepoints is
  returned.

  """
  def grapheme_break(string) when is_binary(string) do
    codepoints = String.to_charlist(string)
    break_names = Enum.map(codepoints, &grapheme_break/1)

    Enum.uniq(break_names)
  end

  # One clause is generated per grapheme cluster break at compile time.
  # NOTE(review): the parameter is presumably required to be named
  # `codepoint` because the guard built by `Utils.ranges_to_guard_clause/1`
  # refers to it -- confirm against that helper before renaming.
  for {grapheme_break, ranges} <- @grapheme_breaks do
    def grapheme_break(codepoint) when unquote(Utils.ranges_to_guard_clause(ranges)) do
      unquote(grapheme_break)
    end
  end

  # Fallback for valid codepoints that no generated clause matched.
  def grapheme_break(codepoint) when is_integer(codepoint) and codepoint in 0..0x10FFFF do
    :other
  end
end