# lib/any_ascii.ex

defmodule AnyAscii do
  @moduledoc """
  Unicode to ASCII transliteration.

  Every Unicode character is mapped to its best ASCII-only replacement string.
  The conversion works one character at a time and never looks at the
  surrounding context. Mappings for each script follow popular existing
  romanization systems, while symbolic characters are mapped by meaning or
  appearance.

  ASCII characters in the input pass through unchanged; every other character
  is replaced with printable ASCII characters. Unknown characters, and some
  known ones, map to an empty string and are therefore removed.
  """

  import Bitwise, only: [band: 2, bsr: 2]

  # Code points that are already ASCII and are passed through untouched.
  defguardp is_ascii(c) when is_integer(c) and c >= 0 and c <= 0x7F

  # Code points outside ASCII that go through the block tables.
  defguardp is_non_ascii(c) when is_integer(c) and c >= 0x80 and c <= 0x10FFFF

  @doc """
  Transliterates chardata into ASCII.

  Accepts a binary, a charlist, or arbitrarily nested chardata, and returns
  chardata made only of ASCII code points and ASCII strings. Pipe the result
  through `IO.iodata_to_binary/1` to obtain a plain string.

  ## Examples

      iex> AnyAscii.transliterate("άνθρωποι") |> IO.iodata_to_binary()
      "anthropoi"

      iex> AnyAscii.transliterate(~c"Борис") |> IO.iodata_to_binary()
      "Boris"

      iex> AnyAscii.transliterate([?深]) |> IO.iodata_to_binary()
      "Shen"

  """
  @spec transliterate(IO.chardata()) :: IO.chardata()
  def transliterate(chardata)

  def transliterate([]) do
    []
  end

  # ASCII head: keep the code point as it is.
  def transliterate([char | rest]) when is_ascii(char) do
    [char | transliterate(rest)]
  end

  # Non-ASCII head: substitute its table entry.
  def transliterate([char | rest]) when is_non_ascii(char) do
    [transliterate_char(char) | transliterate(rest)]
  end

  # Any other head (nested list or binary) is itself chardata; recurse into it.
  def transliterate([nested | rest]) do
    [transliterate(nested) | transliterate(rest)]
  end

  # Binaries are expanded to code points and handled by the list clauses.
  def transliterate(text) when is_binary(text) do
    text
    |> String.to_charlist()
    |> transliterate()
  end

  # Looks up the replacement for one non-ASCII code point. The code point is
  # split into a block number (everything above the low 8 bits) and an index
  # into that block (the low 8 bits). Indexes past the end of the block's
  # tuple yield `[]`, i.e. the character is dropped.
  defp transliterate_char(codepoint) do
    table = codepoint |> bsr(8) |> get_block()
    index = band(codepoint, 0xFF)

    case index < tuple_size(table) do
      true -> elem(table, index)
      false -> []
    end
  end

  # Returns the tuple of replacements for `block_num`, reading it from disk on
  # first use and caching it in `:persistent_term` under `{__MODULE__, block_num}`.
  defp get_block(block_num) do
    cache_key = {__MODULE__, block_num}

    # A non-nil result is a cache hit and is returned by `with` unchanged.
    with nil <- :persistent_term.get(cache_key, nil) do
      table = read_block(block_num)
      :persistent_term.put(cache_key, table)
      table
    end
  end

  # Loads the data file for `block_num` from the application's `priv`
  # directory. A block without a data file is represented by an empty tuple.
  defp read_block(block_num) do
    file = Path.join([Application.app_dir(:any_ascii), "priv", Integer.to_string(block_num)])

    case File.exists?(file) do
      true -> file |> File.read!() |> decode_block()
      false -> {}
    end
  end

  # Inflates the file contents with `:zlib.unzip/1` and turns the
  # tab-separated entries into a tuple indexed by the low byte of the code point.
  defp decode_block(compressed) do
    entries =
      for entry <- String.split(:zlib.unzip(compressed), "\t") do
        compact_entry(entry)
      end

    List.to_tuple(entries)
  end

  # Stores each entry in its smallest chardata form: nothing for an empty
  # string, a bare integer for a single byte, the binary itself otherwise.
  defp compact_entry(""), do: []
  defp compact_entry(<<byte>>), do: byte
  defp compact_entry(entry), do: entry
end