lib/geometrics/open_telemetry/handler.ex

defmodule Geometrics.OpenTelemetry.Handler do
  @moduledoc """
  Configures OpenTelemetry to collect traces from Phoenix, Ecto, and LiveView.
  """
  @dialyzer {:nowarn_function, connection_status: 1}
  @dialyzer {:nowarn_function, get_peer_data: 1}
  @dialyzer {:nowarn_function, get_user_agent: 1}
  @dialyzer {:nowarn_function, handle_cast: 2}
  @dialyzer {:nowarn_function, handle_exception: 4}
  @dialyzer {:nowarn_function, open_child_span: 4}

  use GenServer

  alias Geometrics.OpenTelemetry.Handler
  alias Geometrics.OpenTelemetry.Logger, as: OTLogger
  alias OpenTelemetry.Span
  alias OpenTelemetry.Tracer
  alias OpentelemetryPhoenix.Reason
  require Logger
  require OpenTelemetry.Tracer

  def start_link(args) do
    GenServer.start_link(__MODULE__, args, name: __MODULE__)
  end

  def init(_args) do
    setup()

    {:ok, nil}
  end

  @doc """
  Sets up telemetry attachments. Some bindings are registered via `OpentelemetryPhoenix`
  and `OpentelemetryEcto`, but since neither of those handle LiveView we attach our own
  bindings for those events.

  Some bindings are attached before the library code, so that we can add some normalized
  attributes across most events.
  """
  def setup() do
    #### exception attachments need to be registered *before* we setup
    #### OpentelemetryPhoenix or OpentelemetryEcto, so we can alter
    #### the OT spans before those libraries end any spans
    :telemetry.attach_many(
      "opentelemetry-error-handler",
      [
        [:phoenix, :endpoint, :exception],
        [:phoenix, :router_dispatch, :exception]
      ],
      &Handler.handle_errors/4,
      []
    )

    ecto_prefix = Application.get_env(:geometrics, :ecto_prefix)

    OpenTelemetry.get_tracer(:geometrics)
    OpentelemetryPhoenix.setup()

    if ecto_prefix,
      do: OpentelemetryEcto.setup(ecto_prefix, time_unit: :millisecond)

    #### start attachments need to be registered *after* we setup
    #### OpentelemetryPhoenix or OpentelemetryEcto, so OT spans are already
    #### created for the current process.
    :telemetry.attach_many(
      "opentelemetry-phoenix-handler",
      [
        [:phoenix, :endpoint, :start],
        [:phoenix, :router_dispatch, :start],
        [:phoenix, :error_rendered]
      ],
      &Handler.add_ot_span_to_logger/4,
      []
    )

    #### Our custom telemetry handlers operate outside the scope of the
    #### external libraries, so can be attached in any order.
    :telemetry.attach_many(
      "opentelemetry-exception-handler",
      [
        [:geometrics, :open_telemetry, :exit],
        [:phoenix, :live_view, :handle_event, :exception],
        [:phoenix, :live_view, :mount, :exception]
      ],
      &Handler.handle_exception/4,
      []
    )

    :telemetry.attach_many(
      "opentelemetry-start-handler",
      [
        [:phoenix, :live_view, :handle_event, :start],
        [:phoenix, :live_view, :mount, :start]
      ],
      &Handler.open_child_span/4,
      []
    )

    :telemetry.attach_many(
      "opentelemetry-stop-handler",
      [
        [:phoenix, :live_view, :handle_event, :stop],
        [:phoenix, :live_view, :mount, :stop]
      ],
      &Handler.handle_success/4,
      []
    )
  end

  @doc """
  Given a telemetry event signaling that an action has begun, open a new child span.
  Makes sure that a parent span is either created, or attached to, so that events appear
  as siblings in a single trace spanning the LiveView session.

  Note that child spans pull attributes off of the socket that are only present if the
  `:connect_info` is configured to include them in the Endpoint:

      socket "/live", Phoenix.LiveView.Socket,
        websocket: [
          connect_info: [
            :peer_data,
            :trace_context_headers,
            :user_agent,
            :x_headers,
            session: @session_options
          ]
        ]
  """
  def open_child_span([:phoenix, :live_view, :handle_event, :start], _payload, meta, _config) do
    start_span("HANDLE_EVENT #{meta.event}", meta.socket)
  end

  def open_child_span([:phoenix, :live_view, :mount, :start], _payload, meta, _config) do
    case meta.socket do
      %{root_pid: nil} ->
        start_span("INITIAL MOUNT", meta.socket)

      %{root_pid: _root_pid} ->
        start_span("SECOND MOUNT", meta.socket)
    end
  end

  defp start_span(name, socket) do
    span_ctx =
      Tracer.start_span("#{name} #{to_module(socket.view)}", %{kind: :SERVER})
      |> OTLogger.track_span_ctx()

    set_socket_span_attrs(span_ctx, socket)
    Tracer.set_current_span(span_ctx)
  end

  defp set_socket_span_attrs(span_ctx, socket) do
    peer_data = get_peer_data(socket)
    user_agent = get_user_agent(socket)

    attributes = [
      "http.host": socket.host_uri.host,
      "http.scheme": socket.host_uri.scheme,
      "http.user_agent": user_agent,
      "live_view.connection_status": connection_status(socket),
      "net.host.port": socket.host_uri.port,
      "net.peer.port": peer_data.port,
      "net.transport": :"IP.TCP"
    ]

    Span.set_attributes(span_ctx, attributes)
  end

  @doc """
  Caught exits are cast to ourselves, so that we do not interrupt the logger.
  """
  def handle_cast(
        {:caught_exit, _payload, %{spans: spans, reason: reason, stacktrace: stacktrace}},
        state
      ) do
    crash_attrs =
      Keyword.merge(
        [
          type: :exit,
          stacktrace: Exception.format_stacktrace(stacktrace)
        ],
        Reason.normalize(reason)
      )

    for span_ctx <- spans do
      Span.set_attribute(span_ctx, :status, :crash)
      Span.add_event(span_ctx, "crash", crash_attrs)
      Span.set_status(span_ctx, OpenTelemetry.status(:Error, "Error"))

      OpenTelemetry.Span.end_span(span_ctx)
    end

    {:noreply, state}
  end

  @doc """
  Intended for exceptions that happen during spans that have been opened by OpentelemetryPhoenix or OpentelemetryEcto.
  Those libraries do not put a high-level `:status` key in the data, which we do for our spans to normalize across different
  event types.
  """
  def handle_errors(_event, _payload, _metadata, _config) do
    span_ctx = Tracer.current_span_ctx()
    Span.set_attribute(span_ctx, :status, :error)
    Span.end_span(span_ctx)
  end

  @doc """
  Catch message around exits and exceptions, to make sure that spans are marked with crash/exception status and
  closed.
  """
  def handle_exception([:geometrics, :open_telemetry, :exit], payload, metadata, _config) do
    GenServer.cast(__MODULE__, {:caught_exit, payload, metadata})
  end

  def handle_exception(
        [:phoenix, :live_view, _, :exception],
        _payload,
        %{kind: kind, reason: reason, stacktrace: stacktrace},
        _config
      ) do
    exception_attrs =
      Keyword.merge(
        [
          type: kind,
          stacktrace: Exception.format_stacktrace(stacktrace)
        ],
        Reason.normalize(reason)
      )

    status = OpenTelemetry.status(:Error, "Error")
    span_ctx = Tracer.current_span_ctx()
    Span.set_attribute(span_ctx, :status, :error)

    Span.add_event(span_ctx, "exception", exception_attrs)
    Span.set_status(span_ctx, status)
    Span.end_span(span_ctx)

    pop_span_ctx()
  end

  @doc """
  Set status to `:ok` when a `:stop` event fires, which suggests that whatever happened
  was successful, ie did not crash or raise.
  """
  def handle_success([:phoenix, :live_view, _type, :stop], _payload, _meta, _config) do
    span_ctx = Tracer.current_span_ctx()
    Span.set_attribute(span_ctx, :status, :ok)
    Span.end_span(span_ctx)
    pop_span_ctx()
  end

  @doc """
  When crashes or exits occur, rather than exceptions, no telemetry fires to close open
  spans. To get around this, we keep a list of open spans in the `Logger.metadata`, and
  cheOTLogger in a `Geometrics.OpenTelemetry.Logger`.

  If a crash occurs where any `:ot_spans` live in the metadata, a `[:geometrics, :open_telemetry, :exit]`
  telemetry event is fired, which gets picked up by our `handle_exception` binding.
  """
  def add_ot_span_to_logger(event, _payload, _meta, _config) do
    Tracer.current_span_ctx()
    |> case do
      :undefined ->
        Logger.warn(
          "[#{__MODULE__}] (#{inspect(event)}) expected current process have an OpenTelemetry span, but does not"
        )

      span_ctx ->
        span_ctx
        |> OTLogger.track_span_ctx()
    end
  end

  #### See if any of this info is present on a socket in the context
  #### of GCP / DO.
  # defp client_ip(%{remote_ip: remote_ip} = conn) do
  #   case Plug.Conn.get_req_header(conn, "x-forwarded-for") do
  #     [] ->
  #       to_string(:inet_parse.ntoa(remote_ip))

  #     [client | _] ->
  #       client
  #   end
  # end

  defp pop_span_ctx() do
    OTLogger.pop_span_ctx()
    |> case do
      [parent | _] -> parent
      _ -> :undefined
    end
    |> Tracer.set_current_span()
  end

  defp get_peer_data(%{private: %{connect_info: %{peer_data: peer_data}}}), do: peer_data
  defp get_peer_data(_), do: %{port: nil, address: nil, ssl_cert: nil}

  defp get_user_agent(%{private: %{connect_info: %{user_agent: user_agent}}}), do: user_agent
  defp get_user_agent(_), do: ""

  defp connection_status(socket) do
    if Phoenix.LiveView.connected?(socket),
      do: "connected",
      else: "disconnected"
  end

  defp to_module(module) do
    module
    |> to_string()
    |> case do
      "Elixir." <> name -> name
      erlang_module -> ":#{erlang_module}"
    end
  end
end