lib/unicode/case/mapping.ex

Select File:
lib/unicode/case/mapping.ex

defmodule Unicode.String.Case.Mapping do
  @moduledoc """
  The [Unicode Case Mapping](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm
  defines the process and data to transform text into upper case, lower case or title case.

  Since most languages are not bicameral, characters which have no appropriate mapping remain unchanged.

  Three case mapping functions are provided as a public API which have their implementations in this module:

  * `Unicode.String.upcase/2` which will convert text to upper case characters.
  * `Unicode.String.downcase/2` which will convert text to lower case characters.
  * `Unicode.String.titlecase/2` which will convert text to title case.  Title case means
    that the first character of each word is set to upper case and all other characters in
    the word are set to lower case. `Unicode.String.split/2` is used to split the string
    into words before title casing.

  Each function operates in a locale-aware manner implementing some basic capabilities:

  * Casing rules for the Turkish dotted capital `I` and dotless small `i`.
  * Casing rules for the retention of dots over `i` for Lithuanian letters with additional accents.
  * Titlecasing of IJ at the start of words in Dutch.
  * Removal of accents when upper casing letters in Greek.

  There are other casing rules that are not currently implemented such as:

  * Titlecasing of second or subsequent letters in words in orthographies that include
    caseless letters such as apostrophes.
  * Uppercasing of U+00DF `ß` latin small letter sharp `s` to U+1E9E `ẞ` latin capital letter
    sharp `s`.

  ### Examples

      # Basic case transformation
      iex> Unicode.String.Case.Mapping.upcase("the quick brown fox")
      "THE QUICK BROWN FOX"

      # Dotted-I in Turkish and Azeri
      iex> Unicode.String.Case.Mapping.upcase("Diyarbakır", :tr)
      "DİYARBAKIR"

      # Upper case in Greek removes diacritics
      iex> Unicode.String.Case.Mapping.upcase("Πατάτα, Αέρας, Μυστήριο", :el)
      "ΠΑΤΑΤΑ, ΑΕΡΑΣ, ΜΥΣΤΗΡΙΟ"

      # Lower case Greek with a final sigma
      iex> Unicode.String.Case.Mapping.downcase("ὈΔΥΣΣΕΎΣ", :el)
      "ὀδυσσεύς"

      # Title case Dutch with leading digraph
      iex> Unicode.String.Case.Mapping.titlecase("ijsselmeer", :nl)
      "IJsselmeer"

  """

  alias Unicode.Utils

  # Greek capital sigma, its ordinary lower case form, and the number of
  # bytes the capital occupies when encoded as UTF-8.
  @sigma 0x03A3
  @lower_sigma <<0x03C3::utf8>>
  @sigma_byte_size byte_size(<<@sigma::utf8>>)

  # See Table 3-17 of https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
  # for details of the contexts.
  #
  # Every context in that table describes the text immediately adjacent to
  # the character being cased, so each expression is anchored. Expressions
  # describing what comes before the character must match up to the very end
  # of the preceding text (`\z`); expressions describing what comes after the
  # character must match from the very start of the following text (`\A`).
  # Without the anchors a match anywhere in the string would satisfy the
  # context, e.g. a cased letter in a later word would stop a word-final
  # sigma from being recognised.

  # These regexes can probably be converted to another form
  # which may further enable binary optimization.
  @final_sigma_before Unicode.Regex.compile!("\\p{cased}(\\p{Case_Ignorable})*\\z")
  @final_sigma_after Unicode.Regex.compile!("\\A(\\p{Case_Ignorable})*\\p{cased}")

  @after_soft_dotted Unicode.Regex.compile!("[\\p{Soft_Dotted}]([^\\p{ccc=230}\\p{ccc=0}])*\\z")
  @more_above Unicode.Regex.compile!("\\A[^\\p{ccc=230}\\p{ccc=0}]*[\\p{ccc=230}]")
  @before_dot Unicode.Regex.compile!("\\A([^\\p{ccc=230}\\p{ccc=0}])*[\u0307]")
  @after_i Unicode.Regex.compile!("[I]([^\\p{ccc=230}\\p{ccc=0}])*\\z")

  # Compile-time helper: the number of bytes a codepoint occupies once it is
  # encoded as UTF-8.
  utf8_bytes_for_codepoint = fn cp ->
    <<cp::utf8>> |> byte_size()
  end

  # Compile-time helper that defines one private `casing/6` clause for a
  # single entry of the casing data. Its arguments are, in order:
  #
  #   * the casing operation the clause implements,
  #   * the codepoint the clause matches,
  #   * the replacement, converted with `:unicode.characters_to_binary/1`,
  #   * the language the clause applies to,
  #   * the context: `nil`, or one of the context names matched below.
  #
  # Generated clauses receive the whole input (`string`), the part still to
  # be processed, the operation, the language, the number of bytes already
  # consumed and the accumulated output (in reverse order). The text before
  # the current codepoint is recovered from `string` and `bytes_so_far`; the
  # text after it is `rest`.
  #
  # When a language-specific context does not hold, the codepoint is cased on
  # its own with the `:any` rules. That nested call must start from an empty
  # accumulator: it returns a finished binary, so seeding it with `acc` would
  # copy everything produced so far into the output a second time.
  define_casing_function = fn
    # Unconditional mapping.
    casing, codepoint, replace, language, nil ->
      codepoint_bytes = utf8_bytes_for_codepoint.(codepoint)
      replacement = :unicode.characters_to_binary(replace)

      defp casing(string, <<unquote(codepoint)::utf8, rest::binary>>, unquote(casing), unquote(language), bytes_so_far, acc) do
        bytes_so_far = bytes_so_far + unquote(codepoint_bytes)
        casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [unquote(replacement) | acc])
      end

    # Capital sigma: the replacement is used only when the preceding text
    # matches the "before" expression and the following text does not match
    # the "after" expression; otherwise the ordinary small sigma is emitted.
    casing, codepoint, replace, language, "final_sigma" ->
      codepoint_bytes = utf8_bytes_for_codepoint.(codepoint)
      replacement = :unicode.characters_to_binary(replace)

      defp casing(string, <<@sigma::utf8, rest::binary>>, unquote(casing), unquote(language), bytes_so_far, acc) do
        <<prior::binary-size(bytes_so_far), _remaining::binary>> = string
        bytes_so_far = bytes_so_far + unquote(codepoint_bytes)

        if Regex.match?(@final_sigma_before, prior) && !Regex.match?(@final_sigma_after, rest) do
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [unquote(replacement) | acc])
        else
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [@lower_sigma | acc])
        end
      end

    # The replacement applies unless the codepoint is followed by a combining
    # dot above. Before_Dot describes the text *after* the codepoint, so it
    # is tested against `rest`.
    casing, codepoint, replace, language, "not_before_dot" ->
      codepoint_bytes = utf8_bytes_for_codepoint.(codepoint)
      replacement = :unicode.characters_to_binary(replace)

      defp casing(string, <<unquote(codepoint)::utf8, rest::binary>>, unquote(casing), unquote(language), bytes_so_far, acc) do
        bytes_so_far = bytes_so_far + unquote(codepoint_bytes)

        if Regex.match?(@before_dot, rest) do
          this = casing(<<unquote(codepoint)::utf8>>, <<unquote(codepoint)::utf8>>, unquote(casing), :any, 0, [])
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc])
        else
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [unquote(replacement) | acc])
        end
      end

    # The replacement applies when the following text matches the
    # "more above" expression.
    casing, codepoint, replace, language, "more_above" ->
      codepoint_bytes = utf8_bytes_for_codepoint.(codepoint)
      replacement = :unicode.characters_to_binary(replace)

      defp casing(string, <<unquote(codepoint)::utf8, rest::binary>>, unquote(casing), unquote(language), bytes_so_far, acc) do
        bytes_so_far = bytes_so_far + unquote(codepoint_bytes)

        if Regex.match?(@more_above, rest) do
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [unquote(replacement) | acc])
        else
          this = casing(<<unquote(codepoint)::utf8>>, <<unquote(codepoint)::utf8>>, unquote(casing), :any, 0, [])
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc])
        end
      end

    # The replacement applies when the preceding text matches the
    # "after soft dotted" expression.
    casing, codepoint, replace, language, "after_soft_dotted" ->
      codepoint_bytes = utf8_bytes_for_codepoint.(codepoint)
      replacement = :unicode.characters_to_binary(replace)

      defp casing(string, <<unquote(codepoint)::utf8, rest::binary>>, unquote(casing), unquote(language), bytes_so_far, acc) do
        <<prior::binary-size(bytes_so_far), _remaining::binary>> = string
        bytes_so_far = bytes_so_far + unquote(codepoint_bytes)

        if Regex.match?(@after_soft_dotted, prior) do
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [unquote(replacement) | acc])
        else
          this = casing(<<unquote(codepoint)::utf8>>, <<unquote(codepoint)::utf8>>, unquote(casing), :any, 0, [])
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc])
        end
      end

    # The replacement applies when the preceding text matches the
    # "after I" expression.
    casing, codepoint, replace, language, "after_i" ->
      codepoint_bytes = utf8_bytes_for_codepoint.(codepoint)
      replacement = :unicode.characters_to_binary(replace)

      defp casing(string, <<unquote(codepoint)::utf8, rest::binary>>, unquote(casing), unquote(language), bytes_so_far, acc) do
        <<prior::binary-size(bytes_so_far), _remaining::binary>> = string
        bytes_so_far = bytes_so_far + unquote(codepoint_bytes)

        if Regex.match?(@after_i, prior) do
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [unquote(replacement) | acc])
        else
          this = casing(<<unquote(codepoint)::utf8>>, <<unquote(codepoint)::utf8>>, unquote(casing), :any, 0, [])
          casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc])
        end
      end
  end

  @doc """
  Converts the lower case characters of `string` to
  their upper case equivalents.

  Characters that are not lower case are left as
  they are.

  `language` defaults to `:any`. When it is `:el`
  (Greek) the work is handed to
  `Unicode.String.Case.Mapping.Greek.upcase/1`, since
  accents are removed prior to capitalization as is
  the normal practise for this language.

  """
  def upcase(string, language \\ :any)

  def upcase(string, :el), do: Unicode.String.Case.Mapping.Greek.upcase(string)

  def upcase(string, language) when is_atom(language),
    do: casing(string, string, :upcase, language, 0, [])

  @doc """
  Converts the upper case characters of `string` to
  their lower case equivalents.

  `language` must be an atom and defaults to `:any`.

  """
  def downcase(string, language \\ :any)

  def downcase(string, language) when is_atom(language),
    do: casing(string, string, :downcase, language, 0, [])

  @doc """
  Apply the Unicode title case algorithm.

  The first codepoint of `string` is converted to title
  case and the remainder is converted to lower case. An
  empty string is returned unchanged.

  When `language` is `:nl` (Dutch) and the string starts
  with `ij` in any mix of cases, both letters are
  capitalised to `IJ`.

  `language` must be an atom and defaults to `:any`.

  """
  @spec titlecase(String.t(), atom()) :: String.t()
  def titlecase(string, language \\ :any)

  # There is no first codepoint to title case, so without this clause the
  # empty string would match no clause at all and raise.
  def titlecase("", language) when is_atom(language) do
    ""
  end

  def titlecase(<<i::size(8), j::size(8), rest::binary>>, :nl)
      when i in [?i, ?I] and j in [?j, ?J] do
    "IJ" <> casing(rest, rest, :downcase, :any, 0, [])
  end

  def titlecase(<<first::utf8, rest::binary>>, language) when is_atom(language) do
    casing(<<first::utf8>>, <<first::utf8>>, :titlecase, language, 0, []) <> downcase(rest, language)
  end

  # These next four function clauses optimize for ASCII characters by
  # working on single bytes. The letter `i` is left out of every one of
  # them (`I` when lower casing, `i` when upper or title casing) because in
  # Turkish and Azeri it does not map to the plain ASCII letter, so it has to
  # reach the clauses that know about the language.

  # Lower casing an ASCII capital other than `I`.
  defp casing(string, <<char::size(8), tail::binary>>, :downcase, language, bytes_so_far, acc)
       when char in ?A..?Z and char != ?I do
    casing(string, tail, :downcase, language, bytes_so_far + 1, [char + 32 | acc])
  end

  # Upper or title casing an ASCII small letter other than `i`.
  defp casing(string, <<char::size(8), tail::binary>>, casing, language, bytes_so_far, acc)
       when (casing == :upcase or casing == :titlecase) and char in ?a..?z and char != ?i do
    casing(string, tail, casing, language, bytes_so_far + 1, [char - 32 | acc])
  end

  # Upper or title casing any other byte up to `~` (except `i`): unchanged.
  defp casing(string, <<char::size(8), tail::binary>>, casing, language, bytes_so_far, acc)
       when (casing == :upcase or casing == :titlecase) and char != ?i and char <= ?~ do
    casing(string, tail, casing, language, bytes_so_far + 1, [char | acc])
  end

  # Lower casing any other byte up to `~` (except `I`): unchanged.
  defp casing(string, <<char::size(8), tail::binary>>, :downcase, language, bytes_so_far, acc)
       when char != ?I and char <= ?~ do
    casing(string, tail, :downcase, language, bytes_so_far + 1, [char | acc])
  end

  # Generate the mapping functions

  # Upper casing. An entry produces a clause only when it has an upper case
  # mapping that differs from the codepoint itself and the codepoint is
  # either `i` or lies above `~`.
  for %{codepoint: codepoint, upper: upper} = entry <- Utils.casing_in_order(),
      upper,
      upper != codepoint,
      codepoint == ?i or codepoint > ?~ do
    %{context: context, language: language} = entry

    define_casing_function.(:upcase, codepoint, upper, language, context)
  end

  # Lower casing. An entry produces a clause only when it has a lower case
  # mapping that differs from the codepoint itself and the codepoint is
  # either `I` or lies above `~`.
  for %{codepoint: codepoint, lower: lower} = entry <- Utils.casing_in_order(),
      lower,
      lower != codepoint,
      codepoint == ?I or codepoint > ?~ do
    %{language: language, context: context} = entry

    # Capital sigma without a context is not generated here; it is covered
    # by a hand-written default clause of casing/6 at the end of this file.
    # Every other entry, including capital sigma with a context, gets a
    # generated clause.
    if codepoint != @sigma or not is_nil(context) do
      define_casing_function.(:downcase, codepoint, lower, language, context)
    end
  end

  # Title casing. An entry produces a clause only when it has a title case
  # mapping that differs from the codepoint itself.
  #
  # ASCII `i` has to be included alongside the codepoints above `~`: the
  # ASCII clauses above deliberately skip `i` for `:titlecase`, so without a
  # generated clause it would reach the pass-through clauses and be left in
  # lower case.
  for %{codepoint: codepoint, title: title} = casing <- Utils.casing_in_order(),
      title && title != codepoint && (codepoint == ?i or codepoint > ?~) do
    %{context: context, language: language} = casing

    define_casing_function.(:titlecase, codepoint, title, language, context)
  end

  # Nothing left to process. The accumulator holds the output pieces newest
  # first, so put them back in order and flatten them into one binary.
  defp casing(_string, "", _casing, _language, _bytes_so_far, acc) do
    IO.iodata_to_binary(:lists.reverse(acc))
  end

  # Greek capital sigma with no context, for the language :any. Sigma is the
  # only codepoint with two lower case entries for :any: one carrying the
  # "final_sigma" context and one carrying none. The entry with the context
  # is generated in the normal flow; the entry without one is written by
  # hand here, after the generated clauses, and emits the ordinary small
  # sigma.
  defp casing(string, <<@sigma::utf8, rest::binary>>, :downcase, :any, bytes_so_far, acc) do
    casing(string, rest, :downcase, :any, bytes_so_far + @sigma_byte_size, [@lower_sigma | acc])
  end

  # No casing data for this character, so copy it to the output unchanged.
  # This clause is the single-byte shortcut for bytes up to `~`.
  defp casing(string, <<byte::size(8), rest::binary>>, casing, :any, bytes_so_far, acc)
       when byte <= ?~ do
    casing(string, rest, casing, :any, bytes_so_far + 1, [byte | acc])
  end

  # No casing data for this codepoint either: re-encode it and copy it to
  # the output unchanged, advancing by its encoded size.
  defp casing(string, <<codepoint::utf8, rest::binary>>, casing, :any, bytes_so_far, acc) do
    encoded = <<codepoint::utf8>>

    casing(string, rest, casing, :any, bytes_so_far + byte_size(encoded), [encoded | acc])
  end

  # If the language has no casing rule of its own for the next codepoint,
  # use the default casing (the :any language) for that codepoint only and
  # then carry on with the requested language. Handing the whole remainder
  # to :any would drop the language for the rest of the string, so a later
  # character that does have a language-specific rule would be cased with
  # the default rule instead.
  #
  # Both clauses are restricted to languages other than :any. When the
  # language is already :any and nothing above matched (which happens when
  # the remaining bytes are not valid UTF-8) there is nothing to fall back
  # to; retrying with :any would recurse forever, so no clause matches and
  # the call raises.

  # Lower casing capital sigma is the one default rule that depends on its
  # surroundings, so it cannot be cased in isolation. The final-sigma
  # context is evaluated here against the real text before and after it.
  defp casing(string, <<@sigma::utf8, rest::binary>>, :downcase = casing, language, bytes_so_far, acc)
       when language != :any do
    <<prior::binary-size(bytes_so_far), _remaining::binary>> = string

    lower =
      if Regex.match?(@final_sigma_before, prior) and not Regex.match?(@final_sigma_after, rest) do
        # U+03C2 GREEK SMALL LETTER FINAL SIGMA
        <<0x03C2::utf8>>
      else
        @lower_sigma
      end

    casing(string, rest, casing, language, bytes_so_far + @sigma_byte_size, [lower | acc])
  end

  # Any other codepoint: case it on its own with the default rules, starting
  # from an empty accumulator so that only this codepoint's output comes back.
  defp casing(string, <<next::utf8, rest::binary>>, casing, language, bytes_so_far, acc)
       when language != :any do
    char = <<next::utf8>>
    mapped = casing(char, char, casing, :any, 0, [])

    casing(string, rest, casing, language, bytes_so_far + byte_size(char), [mapped | acc])
  end

  # Builds the message used to report a locale that is not recognised. The
  # locale is rendered with `inspect/1`.
  @doc false
  def unknown_locale_error(locale), do: "Unknown locale " <> inspect(locale)
end