# lib/pdf_ex/font/decoder.ex

defmodule PdfEx.Font.Decoder do
  @moduledoc false

  # Converts PDF font character codes to UTF-8 text (`decode/2`) and back
  # (`encode/2`, `encode_cid/2`).
  #
  # Every code is looked up in the ToUnicode map first. A code missing there
  # falls back to the font `/Encoding` when it is below 256, and otherwise to
  # the code point with the same number.

  # Codes in 128..159 where WinAnsiEncoding (Windows-1252) differs from Latin-1.
  # Codes absent from this table decode through `latin1_to_utf8/1`.
  @win_ansi_overrides %{
    128 => "€",
    130 => "‚",
    131 => "ƒ",
    132 => "„",
    133 => "…",
    134 => "†",
    135 => "‡",
    136 => "ˆ",
    137 => "‰",
    138 => "Š",
    139 => "‹",
    140 => "Œ",
    142 => "Ž",
    # 145/146 are the left/right single quotation marks (U+2018/U+2019), not
    # ASCII apostrophes. Written as escapes so tooling cannot flatten them.
    145 => "\u2018",
    146 => "\u2019",
    147 => "“",
    148 => "”",
    149 => "•",
    150 => "–",
    151 => "—",
    152 => "˜",
    153 => "™",
    154 => "š",
    155 => "›",
    156 => "œ",
    158 => "ž",
    159 => "Ÿ"
  }

  # Glyph names that need a table lookup. Names following the algorithmic
  # Adobe Glyph List rules are handled by `algorithmic_glyph/1` instead.
  @glyph_map %{
    "space" => " ",
    "hyphen" => "-",
    "endash" => "–",
    "emdash" => "—",
    # U+2019 / U+2018, matching the paired double-quote entries below.
    "quoteright" => "\u2019",
    "quoteleft" => "\u2018",
    "quotedblright" => "”",
    "quotedblleft" => "“",
    "bullet" => "•",
    "ellipsis" => "…",
    "fi" => "fi",
    "fl" => "fl"
  }

  @type to_unicode :: %{non_neg_integer() => String.t()}
  @type encoding :: {:standard, atom()} | {:differences, %{non_neg_integer() => String.t()}}
  @type decoder :: %{to_unicode: to_unicode(), encoding: encoding(), two_byte: boolean()} | nil

  @doc """
  Decodes `bytes` to UTF-8 text. A `nil` decoder returns the bytes unchanged.

  Codes are read two bytes at a time (big-endian) when the decoder has
  `two_byte: true` or its ToUnicode map contains a key above 255; otherwise one
  byte at a time. In two-byte mode an odd trailing byte is ignored.
  """
  @spec decode(binary(), decoder()) :: String.t()
  def decode(bytes, nil), do: bytes

  def decode(bytes, %{to_unicode: to_unicode, encoding: encoding} = decoder) do
    if two_byte?(decoder, to_unicode) do
      decode_two_byte(bytes, to_unicode, encoding)
    else
      decode_one_byte(bytes, to_unicode, encoding)
    end
  end

  @doc """
  Inverse of `decode/2` for single-byte fonts: maps each UTF-8 grapheme of
  `text` back to the font byte that decodes to it. Any grapheme with no byte
  yields `{:error, :unencodable_text}` — never silently dropped.

  When several bytes decode to the same grapheme, the lowest byte is used.
  """
  @spec encode(decoder(), String.t()) :: {:ok, binary()} | {:error, :unencodable_text}
  def encode(decoder, text) when is_binary(text) do
    encode_graphemes(text, reverse_map(decoder), fn byte -> <<byte>> end)
  end

  @doc """
  Re-encodes UTF-8 `text` to big-endian 2-byte CID codes by inverting the
  ToUnicode map (for Identity-H editing). Unmapped grapheme →
  `{:error, :unencodable_text}`.

  When several codes share a Unicode string, the lowest code is used.
  """
  @spec encode_cid(decoder(), String.t()) :: {:ok, binary()} | {:error, :unencodable_text}
  def encode_cid(%{to_unicode: to_unicode}, text) when is_binary(text) do
    # Only integer codes that fit an unsigned 2-byte CID are encodable. A wider
    # or negative code would wrap under <<code::16>> and write the wrong glyph,
    # so it is excluded — the grapheme becomes :unencodable_text instead.
    rev =
      Enum.reduce(to_unicode, %{}, fn
        {code, str}, acc when code in 0..0xFFFF -> Map.update(acc, str, code, &min(&1, code))
        _entry, acc -> acc
      end)

    encode_graphemes(text, rev, fn code -> <<code::16>> end)
  end

  @doc """
  Builds a decoder from an optional ToUnicode map and `/Encoding` map.

  Pass `two_byte: true` for composite (Identity-H) fonts. A `nil` ToUnicode map
  becomes `%{}`; a `nil` or unrecognised encoding becomes `{:standard, :win_ansi}`.
  """
  @spec build_decoder(to_unicode() | nil, map() | nil, [{:two_byte, boolean()}]) :: decoder()
  def build_decoder(to_unicode_map, encoding_map, opts \\ []) do
    %{
      to_unicode: to_unicode_map || %{},
      encoding: parse_encoding(encoding_map),
      two_byte: Keyword.get(opts, :two_byte, false)
    }
  end

  @doc """
  Parses an `/Encoding /Differences` array (`[code, /name, /name, code, …]`)
  into a `code => glyph_name` map.

  An integer sets the current code; each following name is assigned to the
  current code, which then advances by one. Names seen before any integer start
  at code 0. Elements that are neither integers nor names are skipped, and a
  non-list argument yields `%{}`.
  """
  @spec parse_differences([integer() | atom() | PdfEx.COS.Name.t()]) ::
          %{non_neg_integer() => String.t()}
  def parse_differences(differences) when is_list(differences) do
    {_, result} =
      Enum.reduce(differences, {0, %{}}, fn
        code, {_current, acc} when is_integer(code) ->
          {code, acc}

        name, {current, acc} when is_atom(name) ->
          {current + 1, Map.put(acc, current, Atom.to_string(name))}

        %PdfEx.COS.Name{value: name}, {current, acc} ->
          {current + 1, Map.put(acc, current, name)}

        _, state ->
          state
      end)

    result
  end

  def parse_differences(_), do: %{}

  # True when codes must be read as 16-bit values: either the decoder says so or
  # the ToUnicode map holds a key that cannot be a single byte.
  defp two_byte?(decoder, to_unicode) do
    Map.get(decoder, :two_byte, false) or Enum.any?(Map.keys(to_unicode), &(&1 > 255))
  end

  # Shared tail of `encode/2` and `encode_cid/2`: looks every grapheme of `text`
  # up in `rev` and serialises the hit with `to_binary`. Stops at the first
  # grapheme with no entry so nothing is dropped silently.
  defp encode_graphemes(text, rev, to_binary) do
    text
    |> String.graphemes()
    |> Enum.reduce_while([], fn grapheme, chunks ->
      case rev do
        %{^grapheme => code} -> {:cont, [to_binary.(code) | chunks]}
        _ -> {:halt, :unencodable}
      end
    end)
    |> case do
      :unencodable -> {:error, :unencodable_text}
      chunks -> {:ok, chunks |> Enum.reverse() |> IO.iodata_to_binary()}
    end
  end

  # TODO(roadmap): this reverse map is non-injective — multiple codes/CIDs can
  # share a Unicode value (ligatures, NBSP vs space, soft-hyphen), and keeping
  # min(code) means decode∘encode need not round-trip to the source byte, so an
  # edit can emit the wrong glyph. Make encoding /Encoding- and CIDToGIDMap-aware
  # instead of inverting the to-Unicode table.
  #
  # Builds `grapheme => byte` by decoding each byte 0..255 on its own. Bytes that
  # decode to zero or several graphemes are left out. For a two-byte decoder a
  # lone byte decodes to "", so the map is empty.
  defp reverse_map(decoder) do
    Enum.reduce(0..255, %{}, fn code, acc ->
      case String.graphemes(decode(<<code>>, decoder)) do
        [single] -> Map.update(acc, single, code, fn existing -> min(existing, code) end)
        _ -> acc
      end
    end)
  end

  # NOTE(review): a dictionary carrying both /Differences and /BaseEncoding hits
  # the /Differences clause first, so the base encoding is discarded and codes
  # without a difference entry decode as Latin-1 (see `apply_encoding/2`).
  defp parse_encoding(nil), do: {:standard, :win_ansi}

  defp parse_encoding(%{Differences: diffs}) when is_list(diffs),
    do: {:differences, parse_differences(diffs)}

  defp parse_encoding(%{BaseEncoding: base}), do: {:standard, normalize_base(base)}
  defp parse_encoding(_), do: {:standard, :win_ansi}

  defp normalize_base(:WinAnsiEncoding), do: :win_ansi
  # TODO(roadmap): MacRomanEncoding/StandardEncoding pass through as raw atoms
  # and then silently decode as latin1 — a no-op masquerading as support. Add
  # real data-driven encoding tables (and the AGL for @glyph_map, currently ~12
  # entries) behind a Font/Encoding behaviour selected by /Subtype.
  defp normalize_base(other), do: other

  defp decode_one_byte(bytes, to_unicode, encoding) do
    bytes
    |> :binary.bin_to_list()
    |> Enum.map_join("", &mapped_char(to_unicode, &1, encoding))
  end

  # Reads big-endian 16-bit codes. The generator stops at the first position
  # that does not hold a full code, so an odd trailing byte is ignored. Appending
  # into one binary keeps this linear, unlike body recursion with `<>`.
  defp decode_two_byte(bytes, to_unicode, encoding) do
    for <<code::integer-size(16) <- bytes>>, into: "" do
      mapped_char(to_unicode, code, encoding)
    end
  end

  # ToUnicode wins; anything it does not list goes through the fallbacks.
  defp mapped_char(%{} = to_unicode, code, encoding) do
    case to_unicode do
      %{^code => char} -> char
      _ -> fallback_char(code, encoding)
    end
  end

  defp fallback_char(code, encoding) when code < 256, do: apply_encoding(code, encoding)
  # Surrogates are not encodable as UTF-8; emit U+FFFD REPLACEMENT CHARACTER.
  defp fallback_char(code, _encoding) when code in 0xD800..0xDFFF, do: "\uFFFD"
  defp fallback_char(code, _encoding), do: <<code::utf8>>

  defp apply_encoding(code, {:differences, diff_map}) do
    case Map.get(diff_map, code) do
      nil -> latin1_to_utf8(code)
      name -> glyph_name_to_char(name)
    end
  end

  defp apply_encoding(code, {:standard, :win_ansi}),
    do: Map.get(@win_ansi_overrides, code) || latin1_to_utf8(code)

  defp apply_encoding(code, _), do: latin1_to_utf8(code)

  defp latin1_to_utf8(code) when code < 128, do: <<code>>
  defp latin1_to_utf8(code), do: <<code::utf8>>

  # Resolves a glyph name: the explicit table first, then the table-free Adobe
  # Glyph List rules. Names matching neither still decode to "".
  defp glyph_name_to_char(name) do
    case @glyph_map do
      %{^name => char} -> char
      _ -> algorithmic_glyph(name)
    end
  end

  # A single ASCII letter names itself.
  defp algorithmic_glyph(<<letter>>) when letter in ?A..?Z or letter in ?a..?z,
    do: <<letter>>

  # "uni" + one or more groups of four uppercase hex digits, one BMP scalar each.
  defp algorithmic_glyph("uni" <> hex) when hex != "" and rem(byte_size(hex), 4) == 0 do
    scalars = for <<group::binary-size(4) <- hex>>, do: hex_scalar(group, 0xFFFF)
    if :error in scalars, do: "", else: List.to_string(scalars)
  end

  # "u" + four to six uppercase hex digits naming one scalar value.
  defp algorithmic_glyph("u" <> hex) when byte_size(hex) in 4..6 do
    case hex_scalar(hex, 0x10FFFF) do
      :error -> ""
      scalar -> <<scalar::utf8>>
    end
  end

  defp algorithmic_glyph(_name), do: ""

  # Parses strictly uppercase hex into a Unicode scalar value no greater than
  # `max`; returns :error for other characters, surrogates, or larger values.
  defp hex_scalar(hex, max) do
    if String.match?(hex, ~r/\A[0-9A-F]+\z/) do
      case String.to_integer(hex, 16) do
        scalar when scalar <= max and scalar not in 0xD800..0xDFFF -> scalar
        _ -> :error
      end
    else
      :error
    end
  end
end