# lib/xray.ex

defmodule Xray do
  @moduledoc """
  Xray offers utility functions for inspecting string binaries, their
  code points, and their base2 representations.

  This package was the result of my own studying of Elixir strings and binaries.
  It's unlikely you would actually use this as a dependency, but I offer it up
  for public use in the hopes that it may be educational.
  """

  # Horizontal rule printed above and below the summary block of `inspect/1`.
  @rule "======================================================"

  @doc """
  Prints a report on the provided input string to STDOUT. This may not work
  especially well when the input contains non-printable characters (YMMV).

  The per-character section relies on `codepoint/2`, which only matches a single
  valid UTF-8 code point, so input containing bytes that are not valid UTF-8 is
  not supported by the per-character report.

  For each character in the string, the following information is shown:

  - code point as a decimal, e.g. `228`
  - code point in its Elixir Unicode representation, e.g. `\\u00E4`
  - a link to a page containing more information about this Unicode code point
  - whether the character is printable
  - the Unicode script(s) the character belongs to
  - count of the number of bytes required to represent this code point using UTF-8 encoding
  - an inspection of the UTF-8 binaries, e.g. `<<195, 164>>`
  - a `Base2` representation (i.e. 1's and 0's) of the encoded code point

  The `Base2` representation (what we would be tempted to call the "binary" representation)
  highlights control bits in red (and the remaining payload bits in green) to help show how
  [UTF-8](https://en.wikipedia.org/wiki/UTF-8) identifies how many bytes are required to
  encode each character.

  ## Return value

  A list containing one `:ok` per code point in the input (the result of the
  last `IO.puts/1` call made for each character).

  ## Examples
    ```
    iex> Xray.inspect("cät")
    ======================================================
    Input String: cät
    Character Count: 3
    Byte Count: 4
    Is valid? true
    Is printable? true
    ======================================================

    c   Codepoint: 99 (\\u0063) https://codepoints.net/U+0063
        Is printable? true
        Script(s): latin
        Byte Count: 1
        UTF-8: <<99>>
        Base2: 01100011

    ä   Codepoint: 228 (\\u00E4) https://codepoints.net/U+00E4
        Is printable? true
        Script(s): latin
        Byte Count: 2
        UTF-8: <<195, 164>>
        Base2: 11000011 10100100

    t   Codepoint: 116 (\\u0074) https://codepoints.net/U+0074
        Is printable? true
        Script(s): latin
        Byte Count: 1
        UTF-8: <<116>>
        Base2: 01110100
    ```
  """
  @spec inspect(value :: binary) :: [:ok]
  def inspect(value) when is_binary(value) do
    value
    |> heading()
    |> String.codepoints()
    |> Enum.map(&character_profile/1)
  end

  # Prints the summary block that headlines the `inspect/1` output and returns
  # the input untouched so it can sit in the middle of a pipeline.
  defp heading(value) do
    IO.puts(@rule)
    IO.puts("Input String: #{value}")
    IO.puts("Character Count: #{String.length(value)}")
    IO.puts("Byte Count: #{byte_size(value)}")
    IO.puts("Is valid? #{String.valid?(value)}")
    IO.puts("Is printable? #{String.printable?(value)}")
    IO.puts(@rule)
    value
  end

  # Prints the formatting card for a single character. Returns the result of the
  # final `IO.puts/1` call.
  defp character_profile(x) do
    IO.puts("")

    IO.puts(
      character_heading(x) <>
        "   Codepoint: #{codepoint(x)} (\\u#{codepoint(x, as_hex: true)}) #{link(x)}"
    )

    IO.puts(indent("Is printable? #{String.printable?(x)}"))
    IO.puts(indent("Script(s): #{Enum.join(Unicode.script(x), ",")}"))
    IO.puts(indent("Byte Count: #{byte_size(x)}"))
    # The label goes through `indent/1` so this line lines up with the others.
    IO.inspect(x, binaries: :as_binaries, label: indent("UTF-8"))
    IO.puts(indent("Base2: " <> to_base2(x)))
  end

  # The character itself in bright text; non-printable characters are shown as
  # the replacement character so they do not disturb the terminal.
  defp character_heading(x) do
    glyph = if String.printable?(x), do: x, else: "�"
    IO.ANSI.bright() <> glyph <> IO.ANSI.reset()
  end

  @doc """
  Reveals the integer codepoint for the given single character; when run with
  the default options, this is equivalent to the question-mark operator, e.g.
  `?x` but this function works with variables (whereas the question mark only
  evaluates literal characters).

  The input must be exactly one UTF-8 encoded code point; any other binary does
  not match the function head and raises a `FunctionClauseError`.

  ## Options:

  ### `:as_hex` (boolean) default: `false`

  When true, returns the hexadecimal representation of the codepoint number as a
  string, left-padded with zeros to at least 4 digits. The hexadecimal
  representation is useful when looking up documentation, e.g. on
  [Wikipedia](https://en.wikipedia.org/wiki/List_of_Unicode_characters) or on
  websites like [codepoints.net](https://codepoints.net/).

  ## Examples
      iex> Xray.codepoint("ä")
      228
      iex> Xray.codepoint("ä", as_hex: true)
      "00E4"
  """
  @spec codepoint(binary, opts :: keyword) :: integer | String.t()
  def codepoint(<<number::utf8>>, opts \\ []) do
    # Only a literal `true` switches to hex output; anything else yields the integer.
    if Keyword.get(opts, :as_hex) == true do
      codepoint_as_hex(number)
    else
      number
    end
  end

  @doc """
  Given a string binary, this returns the codepoints that represent each of the
  characters in the string. This is what you might expect `String.codepoints/1`
  to return, but instead of returning a list of the component *characters*, this
  function returns the *numbers* (which is what code points are).

  Note that this function returns a *string* (the numbers joined by `", "`): if
  a list is returned, Elixir will usually attempt to format it as a
  human-readable string, which defeats the purpose of the inspection.

  This function offers output similar to what `IO.inspect/2` prints when it is
  given the `charlists: :as_lists` option.

  ## Options

  - `:as_hex` (see `codepoint/2`)

  ## Examples

      iex> Xray.codepoints("cät")
      "99, 228, 116"

  Compare this to inspecting a single-quoted charlist:

      iex> IO.inspect('cät', charlists: :as_lists)
      [99, 228, 116]

  But `IO.inspect` will send output to STDOUT.
  """
  @spec codepoints(string :: binary, opts :: keyword) :: String.t()
  def codepoints(string, opts \\ []) when is_binary(string) do
    string
    |> String.codepoints()
    |> Enum.map_join(", ", &codepoint(&1, opts))
  end

  # Converts an integer code point such as 228 (ä) to its upper-case hexadecimal
  # representation, zero-padded to at least four digits, e.g. `00E4`.
  defp codepoint_as_hex(codepoint) do
    codepoint
    |> Integer.to_string(16)
    |> String.pad_leading(4, "0")
  end

  # Simple indentation
  defp indent(string) do
    "    " <> string
  end

  # Builds a (blue) link to a reference page about this codepoint,
  # e.g. https://codepoints.net/U+00E4
  defp link(<<codepoint::utf8>>) do
    url = "https://codepoints.net/U+#{codepoint_as_hex(codepoint)}"
    IO.ANSI.blue() <> url <> IO.ANSI.reset()
  end

  # Renders the bytes of `x` as colourised groups of 1's and 0's, followed by an
  # ANSI reset so the colours do not leak into subsequent output.
  defp to_base2(x) when is_binary(x) do
    base2 = Base2.encode2(x, padding: :all)
    parse_bytes(base2, "") <> IO.ANSI.reset()
  end

  # Some nice formatting of the base2 representation highlighting the control bits that
  # flag multi-byte characters in UTF-8. Each clause consumes the 8 characters that
  # describe one byte; the longest control prefixes are matched first.

  # Start of 4-byte encoding of codepoint
  defp parse_bytes(<<"11110", payload::binary-size(3), rest::binary>>, acc),
    do: parse_bytes(rest, acc <> colorize_byte("11110", payload))

  # Start of 3-byte encoding of codepoint
  defp parse_bytes(<<"1110", payload::binary-size(4), rest::binary>>, acc),
    do: parse_bytes(rest, acc <> colorize_byte("1110", payload))

  # Start of 2-byte encoding of codepoint
  defp parse_bytes(<<"110", payload::binary-size(5), rest::binary>>, acc),
    do: parse_bytes(rest, acc <> colorize_byte("110", payload))

  # Start of secondary byte
  defp parse_bytes(<<"10", payload::binary-size(6), rest::binary>>, acc),
    do: parse_bytes(rest, acc <> colorize_byte("10", payload))

  # Start of a single byte codepoint
  defp parse_bytes(<<"0", payload::binary-size(7), rest::binary>>, acc),
    do: parse_bytes(rest, acc <> colorize_byte("0", payload))

  # All done!
  defp parse_bytes("", acc), do: acc

  # Formats one byte: control bits in red, payload bits in green, then a space
  # separating it from the next byte.
  defp colorize_byte(control, payload) do
    IO.ANSI.red() <> control <> IO.ANSI.green() <> payload <> " "
  end
end