# lib/pdf/reader/encoding/mac_roman.ex

defmodule Pdf.Reader.Encoding.MacRoman do
  @moduledoc """
  Mac OS Roman (MacRomanEncoding) byte-to-Unicode codepoint table.

  Used by PDF readers to decode single-byte character codes for fonts
  that specify `/Encoding /MacRomanEncoding` (or omit an encoding and
  use a Mac-origin Type 1 font).

  The table is generated at compile time from `priv/mac_roman.txt`,
  which is the canonical mapping published by Apple at
  <https://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/ROMAN.TXT>.
  Bytes that are not present in the source file return `:undefined`.

  Because the file is read while the module is being compiled, a missing
  or unreadable `priv/mac_roman.txt` is a compile-time failure, not a
  runtime one.
  """

  # `:code.priv_dir/1` returns `{:error, :bad_name}` when the application is
  # not on the code path. Fail with a readable message instead of letting
  # `Path.join/1` choke on the error tuple.
  priv_dir =
    case :code.priv_dir(:ex_pdf) do
      {:error, :bad_name} ->
        raise "could not resolve the :ex_pdf priv directory at compile time"

      dir ->
        dir
    end

  # `@external_resource` is an accumulating attribute: reading it back yields
  # the list of every registered resource, not a single path. Keep the path in
  # its own attribute and use that for both registration and reading.
  @table_path Path.join([priv_dir, "mac_roman.txt"])
  @external_resource @table_path

  # One `{byte, codepoint}` pair per data line shaped like `0xHH <ws> 0xHHHH`.
  # The regex is anchored only at the start, so trailing text on a data line
  # (comments, a stray "\r") is tolerated; every non-matching line is skipped.
  @table @table_path
         |> File.read!()
         |> String.split("\n")
         |> Enum.flat_map(fn line ->
           case Regex.run(~r/^0x([0-9A-Fa-f]{2})\s+0x([0-9A-Fa-f]{4})/, line) do
             [_, byte_hex, code_hex] ->
               [{String.to_integer(byte_hex, 16), String.to_integer(code_hex, 16)}]

             _ ->
               []
           end
         end)

  @doc """
  Returns the number of byte→codepoint entries loaded from priv/mac_roman.txt.

  The count is computed once at compile time and embedded as a literal.
  """
  @spec entry_count() :: non_neg_integer()
  def entry_count, do: unquote(length(@table))

  @doc """
  Decode a single byte to a Unicode codepoint.

  Returns `:undefined` for bytes that have no mapping in Mac OS Roman.

  Only integers in `0..255` are accepted; any other argument matches no
  clause and raises `FunctionClauseError`.
  """
  @spec decode(0..255) :: non_neg_integer() | :undefined
  # One literal-matching clause per table entry, generated at compile time.
  for {byte, codepoint} <- @table do
    def decode(unquote(byte)), do: unquote(codepoint)
  end

  # Fallback for valid bytes that the mapping file does not list.
  def decode(byte) when is_integer(byte) and byte in 0..255, do: :undefined
end