defmodule PdfEx.Font.Decoder do
  @moduledoc false

  # Maps PDF font character codes to UTF-8 text (`decode/2`) and back
  # (`encode/2`, `encode_cid/2`). A decoder holds the font's ToUnicode map and
  # its parsed `/Encoding`; a ToUnicode entry always wins, and the encoding is
  # only consulted for codes the map does not cover.

  # Code points in 128..159 where WinAnsi differs from Latin-1. Codes absent
  # from this table fall through to the Latin-1 path.
  # NOTE(review): 145/146 are U+2018/U+2019 in Windows-1252 but map to the ASCII
  # apostrophe here — presumably deliberate normalisation (the double quotes stay
  # curly, though); confirm before changing.
  @win_ansi_overrides %{
    128 => "€",
    130 => "‚",
    131 => "ƒ",
    132 => "„",
    133 => "…",
    134 => "†",
    135 => "‡",
    136 => "ˆ",
    137 => "‰",
    138 => "Š",
    139 => "‹",
    140 => "Œ",
    142 => "Ž",
    145 => "'",
    146 => "'",
    147 => "“",
    148 => "”",
    149 => "•",
    150 => "–",
    151 => "—",
    152 => "˜",
    153 => "™",
    154 => "š",
    155 => "›",
    156 => "œ",
    158 => "ž",
    159 => "Ÿ"
  }

  # Glyph names (as they appear in `/Differences`) with a hand-picked text
  # value. Ligatures are flattened to their ASCII letters and single quotes to
  # the ASCII apostrophe, so these values are not always one grapheme long.
  @glyph_map %{
    "space" => " ",
    "hyphen" => "-",
    "endash" => "–",
    "emdash" => "—",
    "quoteright" => "'",
    "quoteleft" => "'",
    "quotedblright" => "”",
    "quotedblleft" => "“",
    "bullet" => "•",
    "ellipsis" => "…",
    "fi" => "fi",
    "fl" => "fl"
  }

  @type to_unicode :: %{non_neg_integer() => String.t()}
  @type encoding :: {:standard, atom()} | {:differences, %{non_neg_integer() => String.t()}}
  @type decoder :: %{to_unicode: to_unicode(), encoding: encoding(), two_byte: boolean()} | nil

  @doc """
  Decodes `bytes` to UTF-8 text. A `nil` decoder returns the bytes unchanged.

  Codes are read as big-endian 2-byte values when the decoder has
  `two_byte: true` or its ToUnicode map contains a code above 255, and as
  single bytes otherwise. In 2-byte mode a trailing odd byte is dropped.
  """
  @spec decode(binary(), decoder()) :: String.t()
  def decode(bytes, nil), do: bytes

  def decode(bytes, %{to_unicode: to_unicode, encoding: encoding} = decoder) do
    if two_byte?(decoder),
      do: decode_two_byte(bytes, to_unicode, encoding),
      else: decode_one_byte(bytes, to_unicode, encoding)
  end

  @doc """
  Inverse of `decode/2` for single-byte fonts: maps each UTF-8 grapheme of
  `text` back to the font byte that decodes to it. Any grapheme with no byte
  yields `{:error, :unencodable_text}` — never silently dropped.

  When several bytes decode to the same grapheme the lowest byte is used. A
  2-byte decoder has no single-byte inverse, so only `""` is encodable with it.
  """
  @spec encode(decoder(), String.t()) :: {:ok, binary()} | {:error, :unencodable_text}
  def encode(decoder, text) when is_binary(text) do
    encode_graphemes(text, reverse_map(decoder), fn byte -> <<byte>> end)
  end

  @doc """
  Re-encodes UTF-8 `text` to big-endian 2-byte CID codes by inverting the
  ToUnicode map (for Identity-H editing). Unmapped grapheme →
  `{:error, :unencodable_text}`.

  A `nil` decoder is treated as an empty ToUnicode map: only `""` encodes.
  """
  @spec encode_cid(decoder(), String.t()) :: {:ok, binary()} | {:error, :unencodable_text}
  def encode_cid(nil, text) when is_binary(text), do: encode_cid(%{to_unicode: %{}}, text)

  def encode_cid(%{to_unicode: to_unicode}, text) when is_binary(text) do
    # Only codes that fit a 2-byte CID are encodable; a wider (or negative) code
    # would wrap under <<code::16>> and write the wrong glyph, so exclude it —
    # the grapheme becomes :unencodable_text instead.
    rev =
      Enum.reduce(to_unicode, %{}, fn
        {code, str}, acc when code in 0..0xFFFF -> Map.update(acc, str, code, &min(&1, code))
        _, acc -> acc
      end)

    encode_graphemes(text, rev, fn code -> <<code::16>> end)
  end

  @doc "Builds a decoder from an optional ToUnicode map and `/Encoding` map. Pass `two_byte: true` for composite (Identity-H) fonts."
  @spec build_decoder(to_unicode() | nil, map() | nil, [{:two_byte, boolean()}]) :: decoder()
  def build_decoder(to_unicode_map, encoding_map, opts \\ []) do
    %{
      to_unicode: to_unicode_map || %{},
      encoding: parse_encoding(encoding_map),
      two_byte: Keyword.get(opts, :two_byte, false)
    }
  end

  @doc """
  Parses an `/Encoding /Differences` array (`[code, /name, /name, code, …]`) into a `code => glyph_name` map.

  An integer sets the current code; each following name is assigned to the
  current code, which then advances by one. Names may be atoms or
  `PdfEx.COS.Name` structs; any other element is skipped. A non-list argument
  yields `%{}`.
  """
  @spec parse_differences([integer() | atom() | PdfEx.COS.Name.t()]) ::
          %{non_neg_integer() => String.t()}
  def parse_differences(differences) when is_list(differences) do
    {_next_code, result} =
      Enum.reduce(differences, {0, %{}}, fn
        code, {_current, acc} when is_integer(code) ->
          {code, acc}

        name, {current, acc} when is_atom(name) ->
          {current + 1, Map.put(acc, current, Atom.to_string(name))}

        %PdfEx.COS.Name{value: name}, {current, acc} ->
          {current + 1, Map.put(acc, current, name)}

        _, state ->
          state
      end)

    result
  end

  def parse_differences(_), do: %{}

  # Walks the graphemes of `text`, looking each one up in `rev` and turning the
  # code found into bytes with `to_bytes`. Stops at the first grapheme that has
  # no entry so that nothing is ever silently dropped.
  defp encode_graphemes(text, rev, to_bytes) do
    text
    |> String.graphemes()
    |> Enum.reduce_while([], fn grapheme, acc ->
      case Map.fetch(rev, grapheme) do
        {:ok, code} -> {:cont, [to_bytes.(code) | acc]}
        :error -> {:halt, :unencodable}
      end
    end)
    |> case do
      :unencodable -> {:error, :unencodable_text}
      chunks -> {:ok, chunks |> Enum.reverse() |> IO.iodata_to_binary()}
    end
  end

  # True when `bytes` must be read as 2-byte codes: either the decoder says so,
  # or the ToUnicode map holds a code that cannot fit in one byte.
  defp two_byte?(%{to_unicode: to_unicode} = decoder) do
    Map.get(decoder, :two_byte, false) or
      Enum.any?(to_unicode, fn {code, _text} -> code > 255 end)
  end

  # TODO(roadmap): this reverse map is non-injective — multiple codes/CIDs can
  # share a Unicode value (ligatures, NBSP vs space, soft-hyphen), and keeping
  # min(code) means decode∘encode need not round-trip to the source byte, so an
  # edit can emit the wrong glyph. Make encoding /Encoding- and CIDToGIDMap-aware
  # instead of inverting the to-Unicode table.
  #
  # Builds `grapheme => byte` for the 256 single-byte codes. The one-/two-byte
  # decision is made once here rather than once per byte.
  defp reverse_map(nil), do: single_byte_reverse(fn code -> <<code>> end)

  defp reverse_map(%{to_unicode: to_unicode, encoding: encoding} = decoder) do
    if two_byte?(decoder) do
      # A lone byte decodes to "" in 2-byte mode, so no grapheme has an inverse.
      %{}
    else
      single_byte_reverse(fn code -> mapped_char(to_unicode, code, encoding) end)
    end
  end

  # Codes are visited in ascending order, so `put_new` keeps the lowest code for
  # each grapheme. Codes that decode to zero or several graphemes are skipped.
  defp single_byte_reverse(decode_code) do
    Enum.reduce(0..255, %{}, fn code, acc ->
      case String.graphemes(decode_code.(code)) do
        [single] -> Map.put_new(acc, single, code)
        _ -> acc
      end
    end)
  end

  # A missing or unrecognised `/Encoding` is treated as WinAnsi. When
  # `/Differences` is present, `/BaseEncoding` is not consulted.
  defp parse_encoding(nil), do: {:standard, :win_ansi}

  defp parse_encoding(%{Differences: diffs}) when is_list(diffs),
    do: {:differences, parse_differences(diffs)}

  defp parse_encoding(%{BaseEncoding: base}), do: {:standard, normalize_base(base)}
  defp parse_encoding(_), do: {:standard, :win_ansi}

  defp normalize_base(:WinAnsiEncoding), do: :win_ansi
  # TODO(roadmap): MacRomanEncoding/StandardEncoding pass through as raw atoms
  # and then silently decode as latin1 — a no-op masquerading as support. Add
  # real data-driven encoding tables (and the AGL for @glyph_map, currently ~12
  # entries) behind a Font/Encoding behaviour selected by /Subtype.
  defp normalize_base(other), do: other

  defp decode_one_byte(bytes, to_unicode, encoding) do
    bytes
    |> :binary.bin_to_list()
    |> Enum.map_join("", &mapped_char(to_unicode, &1, encoding))
  end

  # Tail-recursive with an accumulator: the decoded pieces are collected in
  # reverse and flattened once, instead of re-copying the result with `<>` at
  # every level of recursion.
  defp decode_two_byte(bytes, to_unicode, encoding),
    do: decode_two_byte(bytes, to_unicode, encoding, [])

  defp decode_two_byte(<<code::integer-size(16), rest::binary>>, to_unicode, encoding, acc),
    do: decode_two_byte(rest, to_unicode, encoding, [mapped_char(to_unicode, code, encoding) | acc])

  defp decode_two_byte(<<>>, _to_unicode, _encoding, acc), do: finish_two_byte(acc)
  # A dangling odd byte cannot form a code and is dropped.
  defp decode_two_byte(<<_>>, _to_unicode, _encoding, acc), do: finish_two_byte(acc)

  defp finish_two_byte(acc), do: acc |> Enum.reverse() |> IO.iodata_to_binary()

  # ToUnicode takes precedence; the encoding is only the fallback.
  defp mapped_char(%{} = to_unicode, code, encoding) do
    case to_unicode do
      %{^code => char} -> char
      _ -> fallback_char(code, encoding)
    end
  end

  defp fallback_char(code, encoding) when code < 256, do: apply_encoding(code, encoding)
  # Lone surrogates cannot be encoded as UTF-8; emit U+FFFD instead of raising.
  defp fallback_char(code, _encoding) when code in 0xD800..0xDFFF, do: "�"
  defp fallback_char(code, _encoding), do: <<code::utf8>>

  defp apply_encoding(code, {:differences, diff_map}) do
    case Map.get(diff_map, code) do
      # Codes the /Differences array does not override use the WinAnsi table —
      # the same default this module applies when no encoding is given. Plain
      # Latin-1 would turn 128..159 into C1 control characters.
      nil -> win_ansi_char(code)
      name -> glyph_name_to_char(name)
    end
  end

  defp apply_encoding(code, {:standard, :win_ansi}), do: win_ansi_char(code)
  defp apply_encoding(code, _), do: latin1_to_utf8(code)

  defp win_ansi_char(code), do: Map.get(@win_ansi_overrides, code) || latin1_to_utf8(code)

  # Latin-1 code points coincide with Unicode scalars 0..255.
  defp latin1_to_utf8(code), do: <<code::utf8>>

  # Resolves a glyph name to text: first the explicit table, then the names
  # whose value can be computed (see `derived_glyph_char/1`). Anything else
  # decodes to "".
  defp glyph_name_to_char(name) do
    case Map.fetch(@glyph_map, name) do
      {:ok, char} -> char
      :error -> derived_glyph_char(name)
    end
  end

  # Adobe Glyph List algorithmic names:
  #   * a single ASCII letter names itself ("A" → "A");
  #   * "uni" + one or more groups of 4 uppercase hex digits (UTF-16 units);
  #   * "u" + 4 to 6 uppercase hex digits (one scalar value).
  # A malformed or out-of-range name decodes to "", as unknown names always did.
  defp derived_glyph_char(<<letter>>) when letter in ?A..?Z or letter in ?a..?z,
    do: <<letter>>

  defp derived_glyph_char("uni" <> hex) when byte_size(hex) >= 4 and rem(byte_size(hex), 4) == 0 do
    scalars = for <<group::binary-size(4) <- hex>>, do: hex_scalar(group)
    if Enum.all?(scalars), do: IO.iodata_to_binary(scalars), else: ""
  end

  defp derived_glyph_char("u" <> hex) when byte_size(hex) in 4..6, do: hex_scalar(hex) || ""
  defp derived_glyph_char(_name), do: ""

  # Parses strictly-uppercase hex into a UTF-8 string, or `nil` when the digits
  # are malformed or do not name a Unicode scalar value (surrogate / too large).
  defp hex_scalar(hex) do
    with true <- upper_hex?(hex),
         {code, ""} <- Integer.parse(hex, 16),
         true <- code <= 0x10FFFF and code not in 0xD800..0xDFFF do
      <<code::utf8>>
    else
      _ -> nil
    end
  end

  defp upper_hex?(hex) do
    hex
    |> :binary.bin_to_list()
    |> Enum.all?(fn char -> char in ?0..?9 or char in ?A..?F end)
  end
end