lib/unicode/string.ex

defmodule Unicode.String do
  @moduledoc """
  This module provides functions that implement somee
  of the [Unicode](https://unicode.org) stanards:

  * The [Unicode Case Folding](https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf) algorithm
    to provide case-independent equality checking irrespective of language or script.

  * The [Unicode Segmentation](https://unicode.org/reports/tr29/) algorithm to detect,
    break or splut strings into grapheme clusters, works and sentences.

  """

  alias Unicode.String.Segment
  alias Unicode.String.Break
  alias Unicode.Property

  defdelegate fold(string), to: Unicode.String.Case.Folding
  defdelegate fold(string, type), to: Unicode.String.Case.Folding

  @type string_interval :: {String.t, String.t}
  @type break_type :: :grapheme | :word | :line | :sentence
  @type error_return :: {:error, String.t}

  @type options :: [
    {:locale, String.t},
    {:break, break_type},
    {:suppressions, boolean}
  ]

  @type split_options :: [
    {:locale, String.t},
    {:break, break_type},
    {:suppressions, boolean},
    {:trim, boolean}
  ]

  @type break_or_no_break :: :break | :no_break

  @type break_match ::
    {break_or_no_break, {String.t, {String.t, String.t}}} |
    {break_or_no_break, {String.t, String.t}}

  @doc """
  Compares two strings in a case insensitive
  manner.

  Case folding is applied to the two string
  arguments which are then compared with the
  `==` operator.

  ## Arguments

  * `string_a` and `string_b` are two strings
    to be compared

  * `type` is the case folding type to be
    applied. The alternatives are `:full`,
    `:simple` and `:turkic`.  The default is
    `:full`.

  ## Returns

  * `true` or `false`

  ## Notes

  * This function applies the [Unicode Case Folding
    algorithm](https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf)

  * The algorithm does not apply any treatment to diacritical
    marks hence "compare strings without accents" is not
    part of this function.

  ## Examples

      iex> Unicode.String.equals_ignoring_case? "ABC", "abc"
      true

      iex> Unicode.String.equals_ignoring_case? "beißen", "beissen"
      true

      iex> Unicode.String.equals_ignoring_case? "grüßen", "grussen"
      false

  """
  @spec equals_ignoring_case?(String.t, String.t, atom()) :: boolean
  def equals_ignoring_case?(string_a, string_b, type \\ :full) do
    fold(string_a, type) == fold(string_b, type)
  end

  @default_locale "root"

  @doc """
  Returns a boolean indicating if the
  requested break is applicable
  at the point between the two string
  segments represented by `{string_before, string_after}`.

  ## Arguments

  * `string` is any `String.t`.

  * `options` is a keyword list of
    options.

  ## Returns

  * `true` or `false` or

  * raises an exception if there is an error

  ## Options

  * `:locale` is any locale returned by
    `Unicode.String.Segment.known_locales/0`.
    The default is "root" which corresponds
    to the break rules defined by the
    [Unicode Segmentation](https://unicode.org/reports/tr29/) rules.

  * `:break` is the type of break. It is one of
    `:grapheme`, `:word`, `:line` or `:sentence`. The
    default is `:word`.

  * `:suppressions` is a boolean which,
    if `true`, will suppress breaks for common
    abbreviations defined for the `locale`. The
    default is `true`.

  ## Examples

      iex> Unicode.String.break? {"This is ", "some words"}
      true

      iex> Unicode.String.break? {"This is ", "some words"}, break: :sentence
      false

      iex> Unicode.String.break? {"This is one. ", "This is some words."}, break: :sentence
      true

  """
  @spec break?(string_interval, options) :: boolean
  def break?({string_before, string_after}, options \\ []) do
    case break({string_before, string_after}, options) do
      {:break, _} -> true
      {:no_break, _} -> false
      {:error, reason} -> raise ArgumentError, reason
    end
  end

  @doc """
  Returns match data indicating if the
  requested break is applicable
  at the point between the two string
  segments represented by `{string_before, string_after}`.

  ## Arguments

  * `string` is any `String.t`.

  * `options` is a keyword list of
    options.

  ## Returns

  A tuple indicating if a break would
  be applicable at this point between
  `string_before` and `string_after`.

  * `{:break, {string_before, {matched_string, remaining_string}}}` or

  * `{:no_break, {string_before, {matched_string, remaining_string}}}` or

  * `{:error, reason}`

  ## Options

  * `:locale` is any locale returned by
    `Unicode.String.Segment.known_locales/0`.
    The default is "root" which corresponds
    to the break rules defined by the
    [Unicode Segmentation](https://unicode.org/reports/tr29/) rules.

  * `:break` is the type of break. It is one of
    `:grapheme`, `:word`, `:line` or `:sentence`. The
    default is `:word`.

  * `:suppressions` is a boolean which,
    if `true`, will suppress breaks for common
    abbreviations defined for the `locale`. The
    default is `true`.

  ## Examples

      iex> Unicode.String.break {"This is ", "some words"}
      {:break, {"This is ", {"s", "ome words"}}}

      iex> Unicode.String.break {"This is ", "some words"}, break: :sentence
      {:no_break, {"This is ", {"s", "ome words"}}}

      iex> Unicode.String.break {"This is one. ", "This is some words."}, break: :sentence
      {:break, {"This is one. ", {"T", "his is some words."}}}

  """
  @spec break(string_interval, options) :: break_match | error_return
  def break({string_before, string_after}, options \\ []) do
    locale = Keyword.get(options, :locale, @default_locale)
    break = Keyword.get(options, :break, :word)

    with {:ok, break} <- validate(:break, break),
         {:ok, locale} <- validate(:locale, locale) do
      Break.break({string_before, string_after}, locale, break, options)
    end
  end

  @doc """
  Returns an enumerable that splits a string on demand.

  ## Arguments

  * `string` is any `String.t`.

  * `options` is a keyword list of
    options.

  ## Returns

  * A function that implements the enumerable
    protocol or

  * `{:error, reason}`

  ## Options

  * `:locale` is any locale returned by
    `Unicode.String.Segment.known_locales/0`.
    The default is "root" which corresponds
    to the break rules defined by the
    [Unicode Segmentation](https://unicode.org/reports/tr29/) rules.

  * `:break` is the type of break. It is one of
    `:grapheme`, `:word`, `:line` or `:sentence`. The
    default is `:word`.

  * `:suppressions` is a boolean which,
    if `true`, will suppress breaks for common
    abbreviations defined for the `locale`. The
    default is `true`.

  * `:trim` is a boolean indicating if segments
    the are comprised of only white space are to be
    excluded from the returned list.  The default
    is `false`.

  ## Examples

      iex> enum = Unicode.String.splitter "This is a sentence. And another.", break: :word, trim: true
      iex> Enum.take enum, 3
      ["This", "is", "a"]

  """
  @spec splitter(String.t, split_options) :: function | error_return
  def splitter(string, options) when is_binary(string) do
    locale = Keyword.get(options, :locale, @default_locale)
    break = Keyword.get(options, :break, :word)

    with {:ok, break} <- validate(:break, break),
         {:ok, locale} <- validate(:locale, locale) do
      Stream.unfold(string, &Break.next(&1, locale, break, options))
    end
  end

  @doc """
  Returns next segment in a string.

  ## Arguments

  * `string` is any `String.t`.

  * `options` is a keyword list of
    options.

  ## Returns

  A tuple with the segment and the remainder of the string or `""`
  in case the String reached its end.

  * `{next_string, rest_of_the_string}` or

  * `{:error, reason}`

  ## Options

  * `:locale` is any locale returned by
    `Unicode.String.Segment.known_locales/0`.
    The default is "root" which corresponds
    to the break rules defined by the
    [Unicode Segmentation](https://unicode.org/reports/tr29/) rules.

  * `:break` is the type of break. It is one of
    `:grapheme`, `:word`, `:line` or `:sentence`. The
    default is `:word`.

  * `:suppressions` is a boolean which,
    if `true`, will suppress breaks for common
    abbreviations defined for the `locale`. The
    default is `true`.

  ## Examples

      iex> Unicode.String.next "This is a sentence. And another.", break: :word
      {"This", " is a sentence. And another."}

      iex> Unicode.String.next "This is a sentence. And another.", break: :sentence
      {"This is a sentence. ", "And another."}

  """
  @spec next(String.t, split_options) :: String.t | nil | error_return
  def next(string, options \\ []) when is_binary(string) do
    locale = Keyword.get(options, :locale, @default_locale)
    break = Keyword.get(options, :break, :word)

    with {:ok, break} <- validate(:break, break),
         {:ok, locale} <- validate(:locale, locale) do
      Break.next(string, locale, break, options)
    end
  end

  @doc """
  Splits a string according to the
  specified break type.

  ## Arguments

  * `string` is any `String.t`.

  * `options` is a keyword list of
    options.

  ## Returns

  * A list of strings after applying the
    specified break rules or

  * `{:error, reason}`

  ## Options

  * `:locale` is any locale returned by
    `Unicode.String.Segment.known_locales/0`.
    The default is "root" which corresponds
    to the break rules defined by the
    [Unicode Segmentation](https://unicode.org/reports/tr29/) rules.

  * `:break` is the type of break. It is one of
    `:grapheme`, `:word`, `:line` or `:sentence`. The
    default is `:word`.

  * `:suppressions` is a boolean which,
    if `true`, will suppress breaks for common
    abbreviations defined for the `locale`. The
    default is `true`.

  * `:trim` is a boolean indicating if segments
    the are comprised of only white space are to be
    excluded fromt the returned list.  The default
    is `false`.

  ## Examples

      iex> Unicode.String.split "This is a sentence. And another.", break: :word
      ["This", " ", "is", " ", "a", " ", "sentence", ".", " ", "And", " ", "another", "."]

      iex> Unicode.String.split "This is a sentence. And another.", break: :word, trim: true
      ["This", "is", "a", "sentence", ".", "And", "another", "."]

      iex> Unicode.String.split "This is a sentence. And another.", break: :sentence
      ["This is a sentence. ", "And another."]

  """
  @spec split(String.t, split_options) :: [String.t, ...] | error_return
  def split(string, options \\ []) when is_binary(string) do
    locale = Keyword.get(options, :locale, @default_locale)
    break = Keyword.get(options, :break, :word)

    with {:ok, break} <- validate(:break, break),
         {:ok, locale} <- validate(:locale, locale) do
      Break.split(string, locale, break, options)
    end
    |> maybe_trim(options[:trim])
  end

  defp maybe_trim(list, true) when is_list(list) do
    Enum.reject(list, &Property.white_space?/1)
  end

  defp maybe_trim(list, _) do
    list
  end

  defp validate(:locale, locale) do
    if locale in Segment.known_locales() do
      {:ok, locale}
    else
      {:error, Segment.unknown_locale_error(locale)}
    end
  end

  @breaks [:word, :grapheme, :line, :sentence]

  defp validate(:break, break) do
    if break in @breaks do
      {:ok, break}
    else
      {:error, "Unknown break #{inspect break}. Valid breaks are #{inspect @breaks}"}
    end
  end

end