# lib/unicode.ex

defmodule Unicode do
  @moduledoc """
  Functions to introspect the Unicode character database and
  to provide fast codepoint lookups for scripts, blocks,
  categories and properties.

  """
  alias Unicode.Utils

  @typedoc "A codepoint is an integer representing a Unicode character"
  @type codepoint :: non_neg_integer

  @typedoc "A codepoint or a string"
  @type codepoint_or_string :: codepoint | String.t()

  @typedoc "Unicode UTF encodings"
  @type encoding :: :utf8 | :utf16 | :utf16be | :utf16le | :utf32 | :utf32be | :utf32le

  # Every alternative must be joined with `|`. Separating groups of
  # alternatives with commas passes several arguments to `@type` instead
  # of a single union type.
  @typedoc "The valid scripts as of Unicode 15"
  @type script ::
          :tangsa | :runic | :greek | :myanmar | :cherokee | :palmyrene | :elymaic | :latin
          | :kannada | :deseret | :old_hungarian | :psalter_pahlavi | :tagbanwa | :wancho
          | :khmer | :bengali | :soyombo | :chakma | :inscriptional_pahlavi | :carian
          | :tai_viet | :georgian | :oriya | :meroitic_cursive | :meroitic_hieroglyphs
          | :braille | :nandinagari | :vai | :adlam | :mahajani | :tirhuta | :mro
          | :zanabazar_square | :cuneiform | :vithkuqi | :newa | :yezidi | :osage | :linear_a
          | :hiragana | :mende_kikakui | :cyrillic | :hatran | :anatolian_hieroglyphs | :limbu
          | :balinese | :ethiopic | :new_tai_lue | :dives_akuru | :old_uyghur | :saurashtra
          | :linear_b | :mandaic | :tibetan | :caucasian_albanian | :avestan | :tangut
          | :siddham | :duployan | :kawi | :common | :thai | :shavian | :tamil | :old_persian
          | :nag_mundari | :ol_chiki | :samaritan | :tagalog | :grantha | :gujarati | :ugaritic
          | :khitan_small_script | :nyiakeng_puachue_hmong | :buhid | :syriac | :old_sogdian
          | :khudawadi | :lepcha | :lycian | :phags_pa | :bopomofo | :old_permic | :phoenician
          | :katakana | :dogra | :javanese | :glagolitic | :tai_le | :old_turkic
          | :old_south_arabian | :takri | :inscriptional_parthian | :signwriting | :osmanya
          | :syloti_nagri | :sogdian | :egyptian_hieroglyphs | :gunjala_gondi | :sora_sompeng
          | :arabic | :modi | :inherited | :chorasmian | :manichaean | :medefaidrin
          | :imperial_aramaic | :nko | :cypriot | :bamum | :han | :masaram_gondi | :ahom
          | :hanifi_rohingya | :coptic | :lao | :cham | :malayalam | :lisu | :yi | :old_italic
          | :gothic | :cypro_minoan | :pau_cin_hau | :canadian_aboriginal | :mongolian
          | :sharada | :tai_tham | :hanunoo | :old_north_arabian | :lydian | :rejang
          | :warang_citi | :kharoshthi | :brahmi | :sinhala | :batak | :telugu | :gurmukhi
          | :kayah_li | :marchen | :pahawh_hmong | :armenian | :bassa_vah | :multani
          | :nabataean | :toto | :hangul | :devanagari | :khojki | :kaithi | :thaana | :nushu
          | :sundanese | :bhaiksuki | :ogham | :makasar | :elbasan | :miao | :meetei_mayek
          | :hebrew | :buginese | :tifinagh

  # Absolute path of the `data` directory that is a sibling of the
  # directory containing this source file. Resolved once, at compile time.
  @data_dir Path.expand("../data", __DIR__)

  @doc false
  def data_dir, do: @data_dir

  # The Unicode version is parsed at compile time from the first line of
  # the blocks data file. Given a header such as `# Blocks-15.0.0.txt`
  # the pipeline below yields `{15, 0, 0}`.
  #
  # The file is located through `@data_dir` rather than through a path
  # relative to the current working directory, so compilation does not
  # depend on where the compiler is invoked from. It is also registered
  # as an external resource so the module is recompiled when it changes.
  @blocks_path Path.join(@data_dir, "blocks.txt")
  @external_resource @blocks_path

  @version @blocks_path
           |> File.read!()
           |> String.split("\n", parts: 2)
           |> hd()
           |> String.trim()
           |> String.replace("# Blocks-", "")
           |> String.replace(".txt", "")
           |> String.split(".")
           |> Enum.map(&String.to_integer/1)
           |> List.to_tuple()

  @doc """
  Returns the version of Unicode in use.

  ## Returns

  * A tuple of integers, one for each dot-separated component
    of the version found in the header of the blocks data file
    (a header of `# Blocks-15.0.0.txt` gives `{15, 0, 0}`).

  """
  @spec version :: tuple()
  def version do
    @version
  end

  @doc """
  Ensures that a binary is valid UTF encoded.

  The binary is validated by replacing any invalid UTF
  bytes or incomplete sequences with a replacement string.

  ### Arguments

  * `string` is any binary, that is, any sequence of bytes.

  * `encoding` is any UTF encoding being one of
    `:utf8`, `:utf16`, `:utf16be`, `:utf16le`, `:utf32`, `:utf32be` or
    `:utf32le`. The default is `:utf8`.

  * `replacement` is any string that will be used to replace
    invalid UTF bytes or incomplete sequences. The default
    is `"�"`.

  ### Returns

  * A valid UTF binary that may or may not include
    replacements for invalid UTF. If `encoding` is `:utf8`
    then the return type is a `t:String.t/0`.

  ### Example

      iex> Unicode.replace_invalid(<<"foo", 0b11111111, "bar">>, :utf8)
      "foo�bar"

  """
  @doc since: "1.18.0"
  @spec replace_invalid(binary :: binary(), encoding :: encoding(), replacement :: String.t()) :: binary()
  defdelegate replace_invalid(string, encoding \\ :utf8, replacement \\ "�"), to: Unicode.Validation

  @doc """
  Returns the map of property servers.

  Each key is a property name or alias and each value
  is the module that serves that property.

  """
  def property_servers, do: Unicode.Property.servers()

  # Looks up the module serving `property`. The name is normalised with
  # `Utils.downcase_and_remove_whitespace/1` before the lookup and the
  # result is whatever `Map.fetch/2` returns: `{:ok, module}` or `:error`.
  @doc false
  def fetch_property(property) when is_binary(property) do
    servers = property_servers()
    normalized = Utils.downcase_and_remove_whitespace(property)
    Map.fetch(servers, normalized)
  end

  # Same lookup as `fetch_property/1` but through `Map.get/2`, so an
  # unknown property name yields `nil` instead of `:error`.
  @doc false
  def get_property(property) when is_binary(property) do
    servers = property_servers()
    normalized = Utils.downcase_and_remove_whitespace(property)
    Map.get(servers, normalized)
  end

  @doc """
  Returns the Unicode category for a codepoint or a list of
  categories for a string.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * in the case of a single codepoint, an atom representing
    one of the categories listed below

  * in the case of a string, a list representing the
    category for each codepoint in the string

  ## Notes

  These categories match the names of the Unicode character
  classes used in various regular expression engines and in
  Unicode Sets. The full list of categories is:

  | Category  | Matches                 |
  | --------- | ----------------------- |
  | :C        | Other                   |
  | :Cc       | Control                 |
  | :Cf       | Format                  |
  | :Cn       | Unassigned              |
  | :Co       | Private use             |
  | :Cs       | Surrogate               |
  | :L        | Letter                  |
  | :Ll       | Lower case letter       |
  | :Lm       | Modifier letter         |
  | :Lo       | Other letter            |
  | :Lt       | Title case letter       |
  | :Lu       | Upper case letter       |
  | :M        | Mark                    |
  | :Mc       | Spacing mark            |
  | :Me       | Enclosing mark          |
  | :Mn       | Non-spacing mark        |
  | :N        | Number                  |
  | :Nd       | Decimal number          |
  | :Nl       | Letter number           |
  | :No       | Other number            |
  | :P        | Punctuation             |
  | :Pc       | Connector punctuation   |
  | :Pd       | Dash punctuation        |
  | :Pe       | Close punctuation       |
  | :Pf       | Final punctuation       |
  | :Pi       | Initial punctuation     |
  | :Po       | Other punctuation       |
  | :Ps       | Open punctuation        |
  | :S        | Symbol                  |
  | :Sc       | Currency symbol         |
  | :Sk       | Modifier symbol         |
  | :Sm       | Mathematical symbol     |
  | :So       | Other symbol            |
  | :Z        | Separator               |
  | :Zl       | Line separator          |
  | :Zp       | Paragraph separator     |
  | :Zs       | Space separator         |

  Note too that the group level categories like `:L`,
  `:M`, `:S` and so on are not assigned to any codepoint.
  They can only be identified by combining the results
  for each of the subsidiary categories.

  ## Examples

      iex> Unicode.category ?ä
      :Ll

      iex> Unicode.category ?A
      :Lu

      iex> Unicode.category ?🧐
      :So

      iex> Unicode.category ?+
      :Sm

      iex> Unicode.category ?1
      :Nd

      iex> Unicode.category "äA"
      [:Ll, :Lu]

  """
  @spec category(codepoint_or_string) :: atom | [atom, ...]
  defdelegate category(codepoint_or_string), to: Unicode.GeneralCategory

  @doc """
  Returns the script name of a codepoint
  or the list of script names for each codepoint
  in a string.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * in the case of a single codepoint, an atom
    script name (see `t:script/0`)

  * in the case of a string, a list of atom
    script names, one for each codepoint in the
    `codepoint_or_string`

  ## Examples

      iex> Unicode.script ?ä
      :latin

      iex> Unicode.script ?خ
      :arabic

      iex> Unicode.script ?अ
      :devanagari

      iex> Unicode.script ?א
      :hebrew

      iex> Unicode.script ?Ж
      :cyrillic

      iex> Unicode.script ?δ
      :greek

      iex> Unicode.script ?ก
      :thai

      iex> Unicode.script ?ယ
      :myanmar

  """
  # Script names are atoms (as the examples above show), not strings,
  # so the spec uses the module's own `script()` type.
  @spec script(codepoint_or_string) :: script() | [script(), ...]
  defdelegate script(codepoint_or_string), to: Unicode.Script

  @doc """
  Returns the block name of a codepoint
  or the list of block names for each codepoint
  in a string.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * in the case of a single codepoint, an atom
    block name

  * in the case of a string, a list of atom
    block names, one for each codepoint in the
    `codepoint_or_string`

  ## Examples

      iex> Unicode.block ?ä
      :latin_1_supplement

      iex> Unicode.block ?A
      :basic_latin

      iex> Unicode.block "äA"
      [:latin_1_supplement, :basic_latin]

  """
  @spec block(codepoint_or_string) :: atom | [atom, ...]
  defdelegate block(codepoint_or_string), to: Unicode.Block

  @doc """
  Returns the list of properties of a codepoint
  or, for a string, a list holding the list of
  properties of each codepoint in that string.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * in the case of a single codepoint, a list of
    atom property names

  * in the case of a string, a list with one list of
    atom property names for each codepoint in the
    `codepoint_or_string`

  ## Examples

      iex> Unicode.properties 0x1bf0
      [
        :alphabetic,
        :case_ignorable,
        :grapheme_extend,
        :id_continue,
        :other_alphabetic,
        :xid_continue
      ]

      iex> Unicode.properties ?A
      [
        :alphabetic,
        :ascii_hex_digit,
        :cased,
        :changes_when_casefolded,
        :changes_when_casemapped,
        :changes_when_lowercased,
        :grapheme_base,
        :hex_digit,
        :id_continue,
        :id_start,
        :uppercase,
        :xid_continue,
        :xid_start
      ]

      iex> Unicode.properties ?+
      [:grapheme_base, :math, :pattern_syntax]

      iex> Unicode.properties "a1+"
      [
        [
          :alphabetic,
          :ascii_hex_digit,
          :cased,
          :changes_when_casemapped,
          :changes_when_titlecased,
          :changes_when_uppercased,
          :grapheme_base,
          :hex_digit,
          :id_continue,
          :id_start,
          :lowercase,
          :xid_continue,
          :xid_start
        ],
        [
          :ascii_hex_digit,
          :emoji,
          :emoji_component,
          :grapheme_base,
          :hex_digit,
          :id_continue,
          :xid_continue
        ],
        [
          :grapheme_base,
          :math,
          :pattern_syntax
        ]
      ]

  """
  @spec properties(codepoint_or_string) :: [atom, ...] | [[atom, ...], ...]
  defdelegate properties(codepoint_or_string), to: Unicode.Property

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters in the
  given string) adhere to the Derived Core Property `Alphabetic`
  otherwise returns `false`.

  These are all characters that are usually used as representations
  of letters/syllables in words/sentences.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  ## Examples

      iex> Unicode.alphabetic?(?a)
      true

      iex> Unicode.alphabetic?("A")
      true

      iex> Unicode.alphabetic?("Elixir")
      true

      iex> Unicode.alphabetic?("الإكسير")
      true

      # comma and whitespace
      iex> Unicode.alphabetic?("foo, bar")
      false

      iex> Unicode.alphabetic?("42")
      false

      iex> Unicode.alphabetic?("龍王")
      true

      # Summation, \\u2211
      iex> Unicode.alphabetic?("∑")
      false

      # Greek capital letter sigma, \\u03a3
      iex> Unicode.alphabetic?("Σ")
      true

  """
  @spec alphabetic?(codepoint_or_string) :: boolean
  defdelegate alphabetic?(codepoint_or_string), to: Unicode.Property

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters
  in the given string) are either `alphabetic?/1` or
  `numeric?/1` otherwise returns `false`.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  ## Examples

      iex> Unicode.alphanumeric? "1234"
      true

      iex> Unicode.alphanumeric? "KeyserSöze1995"
      true

      iex> Unicode.alphanumeric? "3段"
      true

      iex> Unicode.alphanumeric? "dragon@example.com"
      false

  """
  @spec alphanumeric?(codepoint_or_string) :: boolean
  defdelegate alphanumeric?(codepoint_or_string), to: Unicode.Property

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters
  in the given string) adhere to Unicode category `:Nd`
  otherwise returns `false`.

  This group of characters represents the decimal digits zero
  through nine (0..9) and the equivalents in non-Latin scripts.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  """
  # Note the naming: `Unicode.digits?/1` is served by
  # `Unicode.Property.numeric?/1`, whereas `Unicode.numeric?/1` is
  # served by `Unicode.Property.extended_numeric?/1`.
  @spec digits?(codepoint_or_string) :: boolean
  defdelegate digits?(codepoint_or_string), to: Unicode.Property, as: :numeric?

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters
  in the given string) adhere to one of the Unicode categories
  `:Nd`, `:Nl` or `:No` otherwise returns `false`.

  This group of characters covers decimal numbers (`:Nd`),
  letter numbers (`:Nl`) and other numbers (`:No`). Use
  `digits?/1` to test for decimal digits only.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  ## Examples

      iex> Unicode.numeric?("65535")
      true

      iex> Unicode.numeric?("42")
      true

      iex> Unicode.numeric?("lapis philosophorum")
      false

  """
  @spec numeric?(codepoint_or_string) :: boolean
  defdelegate numeric?(codepoint_or_string), to: Unicode.Property, as: :extended_numeric?

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters
  in the given string) are `emoji` otherwise returns `false`.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  ## Examples

      iex> Unicode.emoji? "🧐🤓🤩🤩️🤯"
      true

  """
  @spec emoji?(codepoint_or_string) :: boolean
  defdelegate emoji?(codepoint_or_string), to: Unicode.Emoji

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters
  in the given string) adhere to the category `:Sm` otherwise
  returns `false`.

  These are all characters whose primary usage is in mathematical
  concepts (and not in alphabets). Notice that the numerical digits
  are not part of this group.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  ## Examples

      iex> Unicode.math?(?=)
      true

      iex> Unicode.math?("=")
      true

      iex> Unicode.math?("1+1=2") # Digits do not have the `:math` property.
      false

      iex> Unicode.math?("परिस")
      false

      iex> Unicode.math?("∑") # Summation, \\u2211
      true

      iex> Unicode.math?("Σ") # Greek capital letter sigma, \\u03a3
      false

  """
  # NOTE(review): this delegates to `Unicode.Property.math?/1` and the
  # example above speaks of the `:math` property, so the check is
  # presumably on that property rather than on category `:Sm` - confirm
  # and align the summary if so.
  @spec math?(codepoint_or_string) :: boolean
  defdelegate math?(codepoint_or_string), to: Unicode.Property

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters
  in the given string) have the `:cased` property otherwise
  returns `false`.

  The `:cased` property means that this character has at least
  an upper and lower representation and possibly a titlecase
  representation too.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  ## Examples

      iex> Unicode.cased? ?ယ
      false

      iex> Unicode.cased? ?A
      true

  """
  @spec cased?(codepoint_or_string) :: boolean
  defdelegate cased?(codepoint_or_string), to: Unicode.Property

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters
  in the given string) adhere to the category `:Ll` otherwise
  returns `false`.

  Notice that there are many languages that do not have a distinction
  between cases. Their characters are not included in this group.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  ## Examples

      iex> Unicode.lowercase?(?a)
      true

      iex> Unicode.lowercase?("A")
      false

      iex> Unicode.lowercase?("Elixir")
      false

      iex> Unicode.lowercase?("léon")
      true

      iex> Unicode.lowercase?("foo, bar")
      false

      iex> Unicode.lowercase?("42")
      false

      iex> Unicode.lowercase?("Σ")
      false

      iex> Unicode.lowercase?("σ")
      true

  """
  @spec lowercase?(codepoint_or_string) :: boolean
  defdelegate lowercase?(codepoint_or_string), to: Unicode.Property

  @doc """
  Alias for `lowercase?/1`.

  Takes the same argument and returns the same result because
  both functions delegate to `Unicode.Property.lowercase?/1`.

  """
  @spec downcase?(codepoint_or_string) :: boolean
  defdelegate downcase?(codepoint_or_string), to: Unicode.Property, as: :lowercase?

  @doc """
  Returns `true` if a single Unicode codepoint (or all characters
  in the given string) adhere to the category `:Lu` otherwise
  returns `false`.

  Notice that there are many languages that do not have a distinction
  between cases. Their characters are not included in this group.

  ## Arguments

  * `codepoint_or_string` is a single integer codepoint
    or a `String.t`.

  ## Returns

  * `true` or `false`

  For the string-version, the result will be true only if _all_
  codepoints in the string adhere to the property.

  ## Examples

      iex> Unicode.uppercase?(?a)
      false

      iex> Unicode.uppercase?("A")
      true

      iex> Unicode.uppercase?("Elixir")
      false

      iex> Unicode.uppercase?("CAMEMBERT")
      true

      iex> Unicode.uppercase?("foo, bar")
      false

      iex> Unicode.uppercase?("42")
      false

      iex> Unicode.uppercase?("Σ")
      true

      iex> Unicode.uppercase?("σ")
      false

  """
  @spec uppercase?(codepoint_or_string) :: boolean
  defdelegate uppercase?(codepoint_or_string), to: Unicode.Property

  @doc """
  Alias for `uppercase?/1`.

  Takes the same argument and returns the same result because
  both functions delegate to `Unicode.Property.uppercase?/1`.

  """
  @spec upcase?(codepoint_or_string) :: boolean
  defdelegate upcase?(codepoint_or_string), to: Unicode.Property, as: :uppercase?

  @doc """
  Returns a list of tuples representing the
  assigned ranges of Unicode code points.

  This information is derived from the block
  ranges as defined by `Unicode.Block.blocks/0`.

  ## Returns

  * A list of `{first, last}` tuples of integer code points.

  """
  # Code points start at 0, so the bounds are `non_neg_integer`;
  # `pos_integer` would exclude a range beginning at U+0000.
  @spec assigned :: [{non_neg_integer, non_neg_integer}]
  defdelegate assigned, to: Unicode.Block

  # Deprecated name kept for callers that have not moved to `assigned/0`;
  # it returns exactly what `assigned/0` returns.
  @deprecated "Use Unicode.assigned/0"
  def ranges, do: assigned()

  # The single range that spans every code point, 0x0 through 0x10FFFF.
  @all [{0x0, 0x10FFFF}]

  @doc """
  Returns the full range of Unicode code points as a
  list holding a single `{first, last}` tuple.

  """
  @spec all :: [{0x0, 0x10FFFF}]
  def all, do: @all

  @doc """
  Removes accents (diacritical marks) from
  a string.

  ## Arguments

  * `string` is any `String.t`

  ## Returns

  * A string with all diacritical marks
    removed

  ## Notes

  The string is first normalised to `:nfd` form
  and then all characters in the block
  `:combining_diacritical_marks` are removed
  from the string.

  ## Example

      iex> Unicode.unaccent("Et Ça sera sa moitié.")
      "Et Ca sera sa moitie."

  """
  @spec unaccent(String.t()) :: String.t()
  def unaccent(string) do
    string
    |> normalize_nfd()
    |> String.to_charlist()
    |> remove_diacritical_marks([:combining_diacritical_marks])
    |> List.to_string()
  end

  # Drops every codepoint whose block is one of `blocks`,
  # keeping the remaining codepoints in their original order.
  defp remove_diacritical_marks(charlist, blocks) do
    Enum.reject(charlist, fn char -> Unicode.Block.block(char) in blocks end)
  end

  @doc """
  Returns the first index and grapheme count of each
  script detected in a string.

  ## Arguments

  * `string` is any `String.t`.

  ## Returns

  * A map where the key is a `t:script/0` and the value
    is a tuple where the first element is the index in the
    string where that script first appeared and the second
    element is the number of graphemes in that script.

  ## Notes

  Indexes count graphemes, starting at zero. A grapheme
  made up of more than one codepoint (for example a base
  letter followed by a combining mark) is attributed to the
  script of its first codepoint.

  ## Examples

      iex> Unicode.script_statistic "Tokyo is the capital of 日本"
      %{common: {5, 5}, han: {24, 2}, latin: {0, 19}}

      iex> Unicode.script_statistic "おはよう"
      %{hiragana: {0, 4}}

  """
  @doc since: "1.16.0"

  @spec script_statistic(String.t()) :: %{script() => {non_neg_integer, pos_integer}}
  def script_statistic(string) when is_binary(string) do
    string
    |> String.graphemes()
    |> Enum.with_index()
    |> Enum.reduce(%{}, fn {grapheme, index}, statistics ->
      # `Unicode.script/1` returns one script per codepoint, so a
      # multi-codepoint grapheme yields a list longer than one element.
      # Matching on `[script]` would raise `MatchError` for such graphemes.
      [script | _rest] = Unicode.script(grapheme)

      Map.update(statistics, script, {index, 1}, fn {first_index, count} ->
        {first_index, count + 1}
      end)
    end)
  end

  @doc """
  Returns a keyword list of the scripts found in a string,
  ordered by descending dominance.

  Scripts are ranked by comparing, in this order:

  * the index of the first occurrence of the script
  * the count of graphemes in the script
  * the script name, compared lexically (a final tie-breaker
    that keeps the result deterministic).

  ## Arguments

  * `string` is any `String.t`.

  ## Returns

  * A keyword list where the key is a `t:script/0` and the value
    is a tuple where the first element is the index in the
    string where that script first appeared and the second
    element is the number of graphemes in that script. The list
    is ordered by descending dominance.

  ## Example

      iex> Unicode.script_dominance "Tokyo is the capital of 日本"
      [latin: {0, 19}, common: {5, 5}, han: {24, 2}]

      iex> Unicode.script_dominance "おはよう"
      [hiragana: {0, 4}]

  """
  @doc since: "1.16.0"

  @spec script_dominance(String.t()) :: [{script(), {non_neg_integer, pos_integer}}]
  def script_dominance(string) do
    statistics = script_statistic(string)

    # Ascending order on the composite key gives the ranking described
    # above: first index, then count, then script name.
    Enum.sort_by(statistics, fn {script, {first_index, count}} ->
      {first_index, count, to_string(script)}
    end)
  end

  # Merges adjacent and overlapping `{first, last}` ranges in a list
  # that is expected to be ordered by `first`. Returns the merged list.
  @doc false
  def compact_ranges([]) do
    []
  end

  # The second range touches or overlaps the first: merge them. The
  # larger of the two ends is kept so that a range lying entirely inside
  # the first one (for example `{2, 5}` after `{1, 10}`) does not
  # truncate it.
  def compact_ranges([{as, ae}, {bs, be} | rest]) when ae >= bs - 1 and as <= be do
    compact_ranges([{as, max(ae, be)} | rest])
  end

  # Retained for compatibility: a second range that ends at or before
  # the end of the first one, but did not match above, is dropped.
  def compact_ranges([{as, ae}, {_bs, be} | rest]) when ae >= be do
    compact_ranges([{as, ae} | rest])
  end

  # Nothing to merge at the head: keep it and continue with the rest.
  def compact_ranges([first | rest]) do
    [first | compact_ranges(rest)]
  end

  # OTP 20 introduced the `:unicode` module. When that module can be
  # loaded at compile time it performs the NFD normalization; otherwise
  # `String.normalize/2` is used so that earlier versions stay supported.

  @doc false
  if Code.ensure_loaded?(:unicode) do
    def normalize_nfd(string), do: :unicode.characters_to_nfd_binary(string)
  else
    def normalize_nfd(string), do: String.normalize(string, :nfd)
  end
end