Skip to main content

lib/thread_index.ex

defmodule ThreadIndex do
  @moduledoc """
  Encode and decode the Outlook `Thread-Index` email header
  (MAPI `PidTagConversationIndex`, Microsoft Graph `conversationIndex`).

  A conversation index is a 22-byte header block followed by one 5-byte child
  block per reply, base64-encoded when transported in the `Thread-Index` MIME
  header. This library decodes both header variants found in the wild and
  encodes replies byte-compatible with what Outlook/Exchange themselves
  produce, so threads keep grouping and ordering correctly in Outlook.

  ## Format summary

  Header block (22 bytes): 6 time-derived bytes + 16-byte conversation GUID.
  Two variants exist (MS-OXOMSG 2.2.1.3 and its Appendix A footnote <2>):

    * `:classic` — desktop Outlook 2007-2019, Exchange 2007-2010. Bytes 0-5
      hold `FILETIME >>> 16` of the original message time (the leading byte is
      `0x01` for any date between 1829 and 2057 and doubles as the documented
      "reserved byte"). Base64 starts with `Ac`/`Ad`/`Ae`.

    * `:modern` — Exchange 2013+, Exchange Online, OWA, Graph API. Byte 0 is
      the reserved `0x01`, bytes 1-5 hold `FILETIME >>> 24`. Base64 starts
      with `AQ` (typically `AQH`).

  Child block (5 bytes): 1 bit delta code (DC), 31 bits time delta, 8 bits
  random. DC=0 stores `delta >>> 18` (26.2 ms units, max ~1.78 years), DC=1
  stores `delta >>> 23` (0.84 s units, max ~57 years).

  ## The two undocumented behaviors this library accounts for

  1. **Deltas are cumulative.** Each child's delta is relative to the
     accumulated time of the previous child block, not to the header (the
     MS-OXOMSG prose says "header", Microsoft's own code says otherwise).

  2. **Deltas chain from the classic read of the header — and wrap.** All
     Microsoft appenders (MAPI `cindex.c`, Exchange's `ConversationIndex`)
     compute the chain anchor by reading the header time bytes the classic
     way, even for `:modern` headers, where that read lands around year 1831.
     The resulting ~195-year virtual delta forces DC=1 and the 31-bit field
     silently truncates mod 2^31 — i.e. wraps modulo ~57 years (cf. footnote
     <3>: "Exchange 2013/2016/2019 set the Delta Code field to 1 and do not
     calculate the Time Delta field based on TimeDiff"). Decoding recovers
     true reply dates by re-running the same chain arithmetic and adding the
     minimal number of `2^(31+shift)`-tick wrap windows needed to land at or
     after the header date.

  Child timestamps record when each reply was *composed* (reply draft
  created), not when it was sent or delivered, so expect them to precede the
  displayed message times by the composition duration.

  ## Examples

      iex> {:ok, index} =
      ...>   "01CDE90ABFE0D78F0E4280824120B2F1D0E3C07ED0070000CCBA300000114460"
      ...>   |> Base.decode16!()
      ...>   |> Base.encode64()
      ...>   |> ThreadIndex.decode()
      iex> {index.format, index.date}
      {:classic, ~U[2013-01-02 17:01:04.168550Z]}
      iex> Enum.map(index.replies, & &1.date)
      [~U[2013-01-02 17:23:58.065254Z], ~U[2013-01-02 17:25:53.932902Z]]

  ## References

    * [MS-OXOMSG 2.2.1.3 PidTagConversationIndex](https://learn.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxomsg/9e994fbb-b839-495f-84e3-2c8c02c7dd9b)
      and its Appendix A footnotes <2> and <3>
    * [MAPI "Tracking Conversations"](https://learn.microsoft.com/en-us/office/client-developer/outlook/mapi/tracking-conversations)
    * Microsoft MAPI SDK sample `cindex.c`; decompiled
      `Microsoft.Exchange.Data.Storage.ConversationIndex`
    * [Meridian Discovery: E-mail Conversation Index Analysis](https://www.meridiandiscovery.com/how-to/e-mail-conversation-index-metadata-computer-forensics/)
    * [Metaspike community: Thread-Index Header Field](https://community.metaspike.com/t/thread-index-header-field/175)
  """

  import Bitwise

  defmodule Reply do
    @moduledoc """
    One decoded 5-byte child block of a conversation index.

    `date` is the recovered absolute composition time of the reply.
    `random` is the raw 8-bit uniqueness field (legacy MAPI documentation
    splits it into a 4-bit random number and a 4-bit sequence count).
    """

    defstruct [:date, :delta_code, :random]

    @type t :: %__MODULE__{
            date: DateTime.t(),
            delta_code: 0 | 1,
            random: 0..255
          }
  end

  defstruct [:format, :date, :guid, replies: []]

  @type format :: :classic | :modern

  @type t :: %__MODULE__{
          format: format(),
          date: DateTime.t(),
          guid: <<_::128>>,
          replies: [Reply.t()]
        }

  # Windows FILETIME is 100ns ticks since 1601-01-01 00:00:00Z
  @epoch1601 ~U[1601-01-01 00:00:00Z]

  # DC=0 stores delta bits 48..18, DC=1 stores delta bits 53..23
  @dc0_shift 18
  @dc1_shift 23

  # When undoing the mod-2^31 wrap we aim for "at/after the header date", minus
  # this slack (~107s in FILETIME ticks) to absorb the down-truncation of the
  # low delta bits for replies composed moments after the original message.
  @wrap_slack 1 <<< 30

  @doc """
  Decodes a base64 `Thread-Index` value into a `t:ThreadIndex.t/0`.

  Returns `{:error, :invalid_base64}` if the input is not base64 and
  `{:error, :invalid_length}` if the decoded binary is not 22 + 5n bytes long.
  """
  @spec decode(String.t()) :: {:ok, t()} | {:error, :invalid_base64 | :invalid_length}
  def decode(base64) when is_binary(base64) do
    case Base.decode64(base64) do
      {:ok, raw} -> decode_binary(raw)
      :error -> {:error, :invalid_base64}
    end
  end

  @doc """
  Same as `decode/1` but raises `ArgumentError` on invalid input.
  """
  @spec decode!(String.t()) :: t()
  def decode!(base64) do
    case decode(base64) do
      {:ok, index} -> index
      {:error, reason} -> raise ArgumentError, "cannot decode thread index: #{reason}"
    end
  end

  @doc """
  Decodes a raw (not base64-encoded) conversation index binary, as found in
  the MAPI `PidTagConversationIndex` property.
  """
  @spec decode_binary(binary()) :: {:ok, t()} | {:error, :invalid_length}
  def decode_binary(<<first6::binary-size(6), guid::binary-size(16), rest::binary>>)
      when rem(byte_size(rest), 5) == 0 do
    format = header_format(first6)
    header_ft = header_filetime(format, first6)

    index = %__MODULE__{
      format: format,
      date: filetime_to_datetime(header_ft),
      guid: guid,
      replies: decode_children(rest, chain_anchor(first6), header_ft, [])
    }

    {:ok, index}
  end

  def decode_binary(raw) when is_binary(raw), do: {:error, :invalid_length}

  @doc """
  Encodes a 22-byte root conversation index for a new conversation thread.

  ## Options

    * `:time` — `DateTime` of the message, defaults to `DateTime.utc_now/0`
    * `:guid` — 16-byte conversation GUID, defaults to a random one
    * `:format` — `:classic` (default, desktop Outlook layout) or `:modern`
      (Exchange/OWA layout)

  ## Examples

      iex> guid = Base.decode16!("D78F0E4280824120B2F1D0E3C07ED007")
      iex> root = ThreadIndex.encode_root(guid: guid, time: ~U[2025-01-01 10:00:00Z])
      iex> ThreadIndex.decode!(root).date
      ~U[2025-01-01 09:59:59.997952Z]
  """
  @spec encode_root(keyword()) :: String.t()
  def encode_root(opts \\ []) do
    time = Keyword.get(opts, :time) || DateTime.utc_now()
    guid = Keyword.get(opts, :guid) || :crypto.strong_rand_bytes(16)
    format = Keyword.get(opts, :format, :classic)

    if byte_size(guid) != 16 do
      raise ArgumentError, "guid must be exactly 16 bytes, got #{byte_size(guid)}"
    end

    filetime = datetime_to_filetime(time)

    header =
      case format do
        :classic -> <<filetime >>> 16::unsigned-big-48>>
        :modern -> <<1, filetime >>> 24::unsigned-big-40>>
      end

    Base.encode64(header <> guid)
  end

  @doc """
  Appends a reply child block to an existing base64 conversation index,
  reproducing Microsoft's own arithmetic (see the moduledoc): the delta is
  computed against the cumulative chain anchored at the classic read of the
  header bytes, masked to 31 bits.

  The output is byte-compatible with what Outlook/Exchange would append for
  the same timestamps, so mixed threads stay consistent.

  ## Options

    * `:time` — `DateTime` of the reply, defaults to `DateTime.utc_now/0`
    * `:random` — the 8-bit uniqueness byte (0..255), defaults to a random
      value; pass it explicitly for reproducible output

  ## Examples

      iex> guid = Base.decode16!("D78F0E4280824120B2F1D0E3C07ED007")
      iex> root = ThreadIndex.encode_root(guid: guid, time: ~U[2025-01-01 10:00:00Z])
      iex> reply = ThreadIndex.encode_reply(root, time: ~U[2025-01-01 10:30:00Z], random: 0xAB)
      iex> [%ThreadIndex.Reply{delta_code: 0, random: 0xAB} = r] = ThreadIndex.decode!(reply).replies
      iex> r.date
      ~U[2025-01-01 10:29:59.983513Z]
  """
  @spec encode_reply(String.t(), keyword()) :: String.t()
  def encode_reply(parent_base64, opts \\ []) do
    time = Keyword.get(opts, :time) || DateTime.utc_now()
    random = Keyword.get(opts, :random) || :rand.uniform(256) - 1

    parent = Base.decode64!(parent_base64)
    <<first6::binary-size(6), _guid::binary-size(16), rest::binary>> = parent

    anchor = advance_chain(rest, chain_anchor(first6))

    # Mirror Microsoft's encoders (MAPI cindex.c / Exchange ConversationIndex):
    # current time with the low 16 bits masked off, absolute difference.
    diff = abs((datetime_to_filetime(time) &&& bnot(0xFFFF)) - anchor)

    # DC=0 only when the delta fits in 31 bits after dropping the low 18 bits
    # (< 2^49 ticks ~ 1.78 years; MS-OXOMSG tests this as "high order part of
    # the delta AND 0x00FE0000"). For modern-rooted threads the virtual delta
    # is ~195 years, so this is always DC=1 — same as Exchange.
    {dc, shift} = if diff < 1 <<< 49, do: {0, @dc0_shift}, else: {1, @dc1_shift}
    time_delta = diff >>> shift &&& 0x7FFFFFFF

    Base.encode64(<<parent::binary, dc::1, time_delta::unsigned-big-31, random::8>>)
  end

  # Modern headers carry the reserved 0x01 followed by the FILETIME high byte,
  # which is 0x01 until year 2057. Under the classic read, byte 1 holds
  # FILETIME bits 55..48, which is >= 0xB0 for any date after 1990 — so a
  # small byte 1 unambiguously identifies the modern variant.
  defp header_format(<<1, b1, _::binary>>) when b1 < 0x80, do: :modern
  defp header_format(_first6), do: :classic

  defp header_filetime(:modern, <<_reserved, ft_high40::unsigned-big-40>>), do: ft_high40 <<< 24
  defp header_filetime(:classic, <<ft_high48::unsigned-big-48>>), do: ft_high48 <<< 16

  # The anchor all real appenders chain child deltas from: the classic read,
  # regardless of the actual header variant.
  defp chain_anchor(<<ft_high48::unsigned-big-48>>), do: ft_high48 <<< 16

  defp decode_children(<<>>, _anchor, _header_ft, acc), do: Enum.reverse(acc)

  defp decode_children(
         <<dc::1, time_delta::unsigned-big-31, random::8, rest::binary>>,
         anchor,
         header_ft,
         acc
       ) do
    shift = if dc == 0, do: @dc0_shift, else: @dc1_shift
    anchor = anchor + (time_delta <<< shift)

    # Undo the mod-2^31 truncation: add the minimal number of wrap windows
    # required to land at/after the true header date (less a small slack,
    # since delta truncation may put a child marginally before it).
    window = 1 <<< (31 + shift)
    target = header_ft - @wrap_slack
    k = if anchor >= target, do: 0, else: div(target - anchor + window - 1, window)

    reply = %Reply{
      date: filetime_to_datetime(anchor + k * window),
      delta_code: dc,
      random: random
    }

    decode_children(rest, anchor, header_ft, [reply | acc])
  end

  defp advance_chain(<<>>, anchor), do: anchor

  defp advance_chain(<<dc::1, time_delta::unsigned-big-31, _random::8, rest::binary>>, anchor) do
    shift = if dc == 0, do: @dc0_shift, else: @dc1_shift
    advance_chain(rest, anchor + (time_delta <<< shift))
  end

  defp datetime_to_filetime(dt) do
    DateTime.diff(dt, @epoch1601, :microsecond) * 10
  end

  defp filetime_to_datetime(filetime) do
    DateTime.add(@epoch1601, div(filetime, 10), :microsecond)
  end
end