lib/pdf/reader/images/jpeg.ex

defmodule Pdf.Reader.Images.JPEG do
  @moduledoc """
  JPEG (DCTDecode) image utilities for `Pdf.Reader`.

  ## SOF marker parsing

  Reads JPEG dimensions from the Start-of-Frame (SOF) marker without fully
  decoding the JPEG. This is a read-only scan used to populate `%Pdf.Reader.Image{}`
  width and height from the DCT-encoded bytes.

  ### SOF marker table

  Source: ITU-T T.81 / ISO/IEC 10918-1, § B.1.1.3 (marker syntax), Table B.1.

  Valid SOF markers (0xFF 0xCy):

  | Marker byte | Name  | Notes |
  |-------------|-------|-------|
  | 0xC0        | SOF0  | Baseline DCT |
  | 0xC1        | SOF1  | Extended sequential DCT |
  | 0xC2        | SOF2  | Progressive DCT |
  | 0xC3        | SOF3  | Lossless (sequential) |
  | 0xC5        | SOF5  | Differential sequential DCT |
  | 0xC6        | SOF6  | Differential progressive DCT |
  | 0xC7        | SOF7  | Differential lossless (sequential) |
  | 0xC9        | SOF9  | Extended sequential DCT (arithmetic) |
  | 0xCA        | SOF10 | Progressive DCT (arithmetic) |
  | 0xCB        | SOF11 | Lossless (arithmetic) |
  | 0xCD        | SOF13 | Differential sequential DCT (arithmetic) |
  | 0xCE        | SOF14 | Differential progressive DCT (arithmetic) |
  | 0xCF        | SOF15 | Differential lossless (arithmetic) |

  Non-SOF markers to skip:
  | 0xC4 | DHT | Huffman table |
  | 0xC8 | JPG | JPEG extensions |
  | 0xCC | DAC | Arithmetic coding conditioning |

  ### SOF segment layout (ITU-T T.81 § B.2.2)

      0xFF 0xCy   — marker (2 bytes)
      Ls          — segment length in bytes, big-endian uint16 (includes itself, not the marker)
      P           — sample precision (1 byte)
      Y           — number of lines / height (2 bytes, big-endian)
      X           — number of samples per line / width (2 bytes, big-endian)
      Nf          — number of components (1 byte)
      ... (Nf × 3 bytes of component spec — not read here)

  `dimensions/1` returns `{:ok, %{height: y, width: x}}` on success.
  """

  @doc """
  Scans a JPEG binary for a SOF marker and extracts height and width.

  Returns `{:ok, %{height: integer, width: integer}}` on success.
  Returns `{:error, :no_sof_marker}` if no SOF marker is found.
  Returns `{:error, :not_a_jpeg}` if the binary does not start with the JPEG SOI marker.

  Source: ITU-T T.81 / ISO/IEC 10918-1, § B.2.2.
  """
  @spec dimensions(binary()) ::
          {:ok, %{height: non_neg_integer(), width: non_neg_integer()}} | {:error, atom()}
  def dimensions(<<>>) do
    {:error, :not_a_jpeg}
  end

  def dimensions(<<0xFF, 0xD8, rest::binary>>) do
    scan_markers(rest)
  end

  def dimensions(_other) do
    {:error, :not_a_jpeg}
  end

  # ---------------------------------------------------------------------------
  # Internal — marker scanner
  # ---------------------------------------------------------------------------

  # Walk JPEG markers one by one until we find a SOF or exhaust the binary.
  # Each marker is: 0xFF <type> [<length_big_endian_16> <payload...>]
  # Markers without a length field (standalone markers): SOI (D8), EOI (D9),
  # RST0..RST7 (D0..D7), TEM (01). All other markers have a length field.
  defp scan_markers(<<>>) do
    {:error, :no_sof_marker}
  end

  # Consume padding 0xFF bytes
  defp scan_markers(<<0xFF, 0xFF, rest::binary>>) do
    scan_markers(<<0xFF, rest::binary>>)
  end

  # SOI marker (D8) — no length, continue
  defp scan_markers(<<0xFF, 0xD8, rest::binary>>) do
    scan_markers(rest)
  end

  # EOI marker (D9) — end of image, SOF not found
  defp scan_markers(<<0xFF, 0xD9, _rest::binary>>) do
    {:error, :no_sof_marker}
  end

  # RST markers (D0..D7) — no length
  defp scan_markers(<<0xFF, rst, rest::binary>>) when rst >= 0xD0 and rst <= 0xD7 do
    scan_markers(rest)
  end

  # TEM marker (01) — no length
  defp scan_markers(<<0xFF, 0x01, rest::binary>>) do
    scan_markers(rest)
  end

  # SOF marker (C0..CF except C4, C8, CC) — parse dimensions
  defp scan_markers(
         <<0xFF, sof, length_hi, length_lo, 0x08, height_hi, height_lo, width_hi, width_lo,
           _rest::binary>>
       )
       when sof in [
              0xC0,
              0xC1,
              0xC2,
              0xC3,
              0xC5,
              0xC6,
              0xC7,
              0xC9,
              0xCA,
              0xCB,
              0xCD,
              0xCE,
              0xCF
            ] do
    _length = length_hi * 256 + length_lo
    height = height_hi * 256 + height_lo
    width = width_hi * 256 + width_lo
    {:ok, %{height: height, width: width}}
  end

  # SOF marker with non-8-bit precision — still return dimensions
  defp scan_markers(
         <<0xFF, sof, length_hi, length_lo, _precision, height_hi, height_lo, width_hi, width_lo,
           _rest::binary>>
       )
       when sof in [
              0xC0,
              0xC1,
              0xC2,
              0xC3,
              0xC5,
              0xC6,
              0xC7,
              0xC9,
              0xCA,
              0xCB,
              0xCD,
              0xCE,
              0xCF
            ] do
    _length = length_hi * 256 + length_lo
    height = height_hi * 256 + height_lo
    width = width_hi * 256 + width_lo
    {:ok, %{height: height, width: width}}
  end

  # Any other marker with length field — skip it and continue
  defp scan_markers(<<0xFF, marker, length_hi, length_lo, rest::binary>>)
       when marker not in [0xD8, 0xD9, 0x01] and not (marker >= 0xD0 and marker <= 0xD7) do
    # Length includes the 2 length bytes but not the marker bytes
    # So payload size = length - 2
    payload_size = length_hi * 256 + length_lo - 2

    if payload_size >= 0 and byte_size(rest) >= payload_size do
      after_segment = binary_part(rest, payload_size, byte_size(rest) - payload_size)
      scan_markers(after_segment)
    else
      {:error, :truncated_jpeg}
    end
  end

  # Not enough bytes or unknown structure
  defp scan_markers(_) do
    {:error, :no_sof_marker}
  end
end