lib/relyra/telemetry.ex

defmodule Relyra.Telemetry do
  @moduledoc """
  Telemetry catalog for Relyra SAML events.

  Relyra emits events using the `[:relyra, :saml, event_name, stage]` namespace.
  Each event follows the `:start`, `:stop`, and `:exception` triplet pattern and
  reports durations in `duration_ms`.

  ## Events

  ### login
  Emitted when an SP-initiated login flow starts.
  - Namespace: `[:relyra, :saml, :login]`
  - Measurements: `[:duration_ms]`
  - Metadata: `[:connection_id, :organization_id, :provider_preset, :flow, :binding, :outcome, :error_code]`

  ### authn_request
  Emitted when a SAML AuthnRequest is generated.
  - Namespace: `[:relyra, :saml, :authn_request]`
  - Measurements: `[:duration_ms, :xml_bytes, :base64_bytes, :request_store_latency_ms]`
  - Metadata: `[:connection_id, :organization_id, :provider_preset, :flow, :binding, :outcome, :error_code]`

  ### response.decode
  Emitted when an inbound SAMLResponse is base64-decoded and inflated.
  - Namespace: `[:relyra, :saml, :response, :decode]`
  - Measurements: `[:duration_ms, :base64_bytes, :xml_bytes]`
  - Metadata: `[:connection_id, :binding, :flow, :outcome, :error_code]`

  ### response.validate
  Emitted during the SAML protocol validation pipeline.
  - Namespace: `[:relyra, :saml, :response, :validate]`
  - Measurements: `[:duration_ms, :assertion_count]`
  - Metadata: `[:connection_id, :flow, :outcome, :error_code]`

  ### signature.verify
  Emitted when verifying XML signatures.
  - Namespace: `[:relyra, :saml, :signature, :verify]`
  - Measurements: `[:duration_ms]`
  - Metadata: `[:connection_id, :outcome, :error_code, :signature_algorithm, :digest_algorithm]`

  ### replay.check
  Emitted when checking for replayed assertions.
  - Namespace: `[:relyra, :saml, :replay, :check]`
  - Measurements: `[:duration_ms, :replay_store_latency_ms]`
  - Metadata: `[:connection_id, :issuer, :assertion_id, :outcome, :error_code]`

  ### user.map
  Emitted when mapping SAML principal to a host user.
  - Namespace: `[:relyra, :saml, :user, :map]`
  - Measurements: `[:duration_ms, :attribute_count]`
  - Metadata: `[:connection_id, :flow, :outcome, :error_code]`

  ### session.establish
  Emitted when handing off to the host session adapter.
  - Namespace: `[:relyra, :saml, :session, :establish]`
  - Measurements: `[:duration_ms]`
  - Metadata: `[:connection_id, :flow, :outcome, :error_code]`

  ### metadata.refresh
  Emitted when an operator-triggered metadata refresh starts or stops.
  - Namespace: `[:relyra, :saml, :metadata, :refresh]`
  - Measurements: `[:duration_ms]`
  - Metadata: `[:connection_id, :metadata_source_id, :source_kind, :trigger, :outcome, :error_code, :certificate_count]`

  ### metadata.import
  Emitted when an operator-triggered local XML import starts or stops.
  - Namespace: `[:relyra, :saml, :metadata, :import]`
  - Measurements: `[:duration_ms]`
  - Metadata: `[:connection_id, :source_kind, :trigger, :outcome, :error_code, :certificate_count]`

  ### metadata.auto_refresh
  Phase 21 scheduled metadata refresh. SEPARATE namespace from
  `metadata.refresh` per D-23: adopters can attach the "page me" handler
  only to the unattended channel.

  - Namespace: `[:relyra, :saml, :metadata, :auto_refresh]`

  Per-attempt span events (`Telemetry.span/3`):
  - `[:relyra, :saml, :metadata, :auto_refresh, :start]`
  - `[:relyra, :saml, :metadata, :auto_refresh, :stop]`
  - `[:relyra, :saml, :metadata, :auto_refresh, :exception]`
    - Measurements: `[:duration_ms]`
    - Metadata: `[:connection_id, :metadata_source_id, :source_kind, :trigger, :correlation_id, :outcome, :error_code, :certificate_count, :transient?, :counts_toward_suspend?]`

  State-transition events (one-shot, not span-bracketed):
  - `[:relyra, :saml, :metadata, :auto_refresh, :degraded]` — first
    transient failure that increments `consecutive_failure_count` (D-24)
  - `[:relyra, :saml, :metadata, :auto_refresh, :suspended]` — the
    attempt that crossed `consecutive_failure_count >= 5` and set
    `auto_suspended_until` (D-24, D-25)
  - `[:relyra, :saml, :metadata, :auto_refresh, :recovered]` — successful
    probe after a suspended period; counters reset to 0 (D-24, Pitfall 6)
    - Measurements: `%{}`
    - Metadata: `[:connection_id, :metadata_source_id, :consecutive_failure_count, :auto_suspended_until, :auto_suspended_reason, :error_code]`

  Validity warning event (at-most-once per validUntil window per source):
  - `[:relyra, :saml, :metadata, :auto_refresh, :validity_warning]` (D-14)
    - Measurements: `%{}`
    - Metadata: `[:connection_id, :metadata_source_id, :valid_until, :refresh_interval_seconds]`

  Empty-tick event:
  - `[:relyra, :saml, :metadata, :auto_refresh, :skipped]` (D-07)
    - Measurements: `%{}`
    - Metadata: `[:correlation_id, :count]`

  ### certificate.expiring
  Emitted when a SAML certificate is nearing expiration.
  - Namespace: `[:relyra, :saml, :certificate, :expiring]`
  - Measurements: `[:days_until_expiry]`
  - Metadata: `[:connection_id, :certificate_id, :fingerprint_sha256, :not_after]`

  Optional reference handler: `Relyra.Telemetry.Handlers.LogAlerts`
  (D-30, NOT default-attached). Adopters call
  `Relyra.Telemetry.Handlers.LogAlerts.attach/0` from their
  `Application.start/2` to opt in.
  """

  @doc false
  def execute(event, measurements, metadata \\ %{}) do
    :telemetry.execute([:relyra, :saml | List.wrap(event)], measurements, metadata)
  end

  @doc false
  def span(event, metadata, span_fun) do
    event_name = [:relyra, :saml | List.wrap(event)]
    start_metadata = normalize_metadata(metadata)
    start_time = System.monotonic_time()

    emit(event_name ++ [:start], %{system_time: System.system_time()}, start_metadata)

    try do
      case span_fun.() do
        {result, stop_metadata} ->
          stop_metadata = normalize_metadata(stop_metadata)

          emit(
            event_name ++ [:stop],
            %{duration_ms: duration_ms(start_time)},
            Map.merge(start_metadata, stop_metadata)
          )

          result

        result ->
          emit(
            event_name ++ [:stop],
            %{duration_ms: duration_ms(start_time)},
            start_metadata
          )

          result
      end
    rescue
      exception ->
        emit(
          event_name ++ [:exception],
          %{duration_ms: duration_ms(start_time)},
          Map.merge(start_metadata, %{
            kind: :error,
            reason: Exception.message(exception)
          })
        )

        reraise exception, __STACKTRACE__
    catch
      kind, reason ->
        emit(
          event_name ++ [:exception],
          %{duration_ms: duration_ms(start_time)},
          Map.merge(start_metadata, %{kind: kind, reason: inspect(reason)})
        )

        case kind do
          :throw -> throw(reason)
          :exit -> exit(reason)
          :error -> :erlang.error(reason)
        end
    end
  end

  defp normalize_metadata(metadata) when is_map(metadata), do: metadata
  defp normalize_metadata(nil), do: %{}
  defp normalize_metadata(metadata) when is_list(metadata), do: Map.new(metadata)

  defp duration_ms(start_time) do
    System.convert_time_unit(System.monotonic_time() - start_time, :native, :millisecond)
  end

  defp emit(event_name, measurements, metadata) do
    :telemetry.execute(event_name, measurements, metadata)
  end
end