lib/trip_switch.ex

defmodule TripSwitch do
  @moduledoc """
  TripSwitch is an Elixir implementation of a circuit breaker. The circuit
  breaker is a popular pattern in software that depends on various
  components/services to keep running.

  This library allows you to wrap calls (signals) to remote services in a trip switch.
  The switch monitors each invocation and records failures; the trip switch gets
  broken once the recorded failures reach or surpass the specified threshold.

  ## Signal

  A signal is simply a function that gets invoked. The state of the trip switch gets
  updated based on the result (current) of this signal, see `t:TripSwitch.signal/0`.

  ### Examples

      iex> TripSwitch.send(id, fn -> {:ok, :good} end)
      {:ok, :good}
      iex> TripSwitch.send(id, fn -> {:error, :bad} end) # record this signal as a failure
      {:error, :bad}

  Note that a switch only stops invoking signals after the failure threshold has been
  reached. From then on, the switch is considered `broken` until it's repaired, and
  `TripSwitch.send/2` returns `:broken` without invoking the signal.

  ## Repairs

  You may be wondering what happens when a trip switch gets broken. This
  library has a concept called Repair: a switch is sent for repair
  automatically once it gets broken. Not all trip switches are considered fixable; you
  need to specify a `repair_time` (in milliseconds) when the trip switch is created,
  otherwise you need to manually fix (reset) the trip switch whenever it gets broken.

  ## Thresholds

  For a trip switch to function correctly, a threshold must be specified. This allows
  us to define the capacity/expectation of the trip switch. The threshold is specified as
  a float value, usually between 0 and 1 (e.g. 0.2566 or 0.13).

  Simply put, the threshold is the proportion of bad signals a trip switch may receive
  before it gets broken and sent for repair.

  ## Usage

  To create a trip switch you need to add it to your supervision tree:

  ```elixir
  children = [{TripSwitch, name: :switch, threshold: 0.5}]

  Supervisor.start_link(children, opts)
  ```

  It supports the following options:

  - `:name` - this will be used as the id for the switch
  - `:threshold` - the proportion of bad signals (a float, usually between 0 and 1) at
    which the switch trips
  - `:repair_time` - time (in milliseconds) taken for the switch to get fixed after it
    gets broken

  ## Telemetry

  Being a big fan of observability, this library exposes information about its internals
  using the community-approved library (`telemetry`). You can use the `telemetry_poller`
  library to poll these metrics.

  Below are the events published by `trip_switch`:

  * `[:trip_switch, :signal, :start]` - dispatched before a given signal is handled
    * Measurement: `%{system_time: system_time}`
    * Metadata: `%{id: atom(), tag: String.t()}`

  * `[:trip_switch, :signal, :stop]` - dispatched after a given signal has been handled
    * Measurement: `%{duration: native_time}`
    * Metadata: `%{id: atom(), tag: String.t()}`

  * `[:trip_switch, :repair, :start]` - dispatched when auto-repair is scheduled
    * Measurement: `%{system_time: system_time}`
    * Metadata: `%{id: atom(), tag: String.t()}`

  * `[:trip_switch, :repair, :stop]` - dispatched after an auto-repair has been completed
    * Measurement: `%{duration: native_time}`
    * Metadata: `%{id: atom(), tag: String.t()}`
  """
  use GenServer

  alias TripSwitch.Breaker

  @type signal :: (() -> Breaker.current())

  @event_prefix :trip_switch

  @doc """
  Tells whether the switch registered under `id` is currently broken.

  The breaker held by the switch process is fetched and handed to
  `TripSwitch.Breaker.broken?/1`. A switch counts as broken when the underlying
  breaker is `half_open` or `open`.

  ## Examples

      iex> TripSwitch.broken?(:switch)
      true
  """
  @spec broken?(atom()) :: boolean()
  def broken?(id) do
    id
    |> get()
    |> Breaker.broken?()
  end

  @doc """
  Sends a signal through the switch registered under `id`.

  The signal is a zero-arity function that performs some action and returns a
  result (see `t:signal/0`). It is only invoked while the switch is not broken:
  its return value is handed to the underlying breaker, the updated breaker is
  saved back to the switch process, and the result reported by the breaker is
  returned to the caller.

  When the switch is already broken the signal is **not** invoked and `:broken`
  is returned instead.

  The whole call runs inside a `[:trip_switch, :signal]` telemetry span whose
  metadata carries the switch `id` and a `tag` generated for this call.

  ## Examples

      iex> TripSwitch.send(:switch, fn -> {:ok, %{name: "a"}} end)
      {:ok, %{name: "a"}}
      iex> TripSwitch.send(:switch, fn -> {:error, :not_found} end)
      {:error, :not_found}
  """
  @spec send(atom(), signal()) :: {:ok, term()} | {:error, term()} | :broken
  def send(id, signal) do
    metadata = %{id: id, tag: FlakeId.get()}

    :telemetry.span([@event_prefix, :signal], metadata, fn ->
      with %Breaker{} = breaker <- get(id),
           false <- Breaker.broken?(breaker),
           {result, breaker} <- Breaker.handle(breaker, signal.()),
           :ok <- GenServer.call(via(id), {:save, breaker}) do
        {result, metadata}
      else
        # `Breaker.broken?/1` returned `true`: short-circuit before the signal runs.
        true -> {:broken, metadata}
      end
    end)
  end

  @doc """
  Puts the switch registered under `id` back into a working state.

  Use this on a broken switch to make it accept signals again.

  ### Examples

        iex> TripSwitch.reset(:switch)
        :ok
  """
  @spec reset(atom()) :: :ok
  def reset(id) do
    id
    |> via()
    |> GenServer.call(:reset)
  end

  # Builds the child specification for a switch. The `:name` option is mandatory
  # and is folded into the child id (`{TripSwitch, name}`), so the id differs per
  # switch name.
  @spec child_spec(keyword()) :: Supervisor.child_spec()
  def child_spec(opts) do
    name = opts[:name] || raise(ArgumentError, "expected :name option to be present")

    opts
    |> super()
    |> Supervisor.child_spec(id: {__MODULE__, name})
  end

  @impl GenServer
  def init(opts) do
    # The server state pairs the breaker with the reference of the pending
    # repair timer; nothing is scheduled at start-up.
    initial_state = %{breaker: Breaker.new(opts), repair: nil}

    {:ok, initial_state}
  end

  @doc false
  @spec start_link(keyword()) :: GenServer.on_start()
  def start_link(opts) do
    # `Keyword.fetch!/2` raises when `:name` is missing; the name becomes the
    # key under which the process is registered in `TripSwitch.Registry`.
    registered_name =
      opts
      |> Keyword.fetch!(:name)
      |> via()

    GenServer.start_link(__MODULE__, opts, name: registered_name)
  end

  @impl GenServer
  # Replies with the breaker currently held in the state.
  def handle_call(:get, _from, %{breaker: current} = state), do: {:reply, current, state}

  # Resets the breaker and drops any pending repair timer.
  def handle_call(:reset, _from, %{breaker: current} = state) do
    reset_state = %{state | breaker: Breaker.reset(current)}

    {:reply, :ok, cancel_timer(reset_state)}
  end

  # Stores the given breaker and schedules or cancels the repair timer to match it.
  def handle_call({:save, breaker}, _from, state) do
    {:reply, :ok, schedule_or_cancel_repair(breaker, state)}
  end

  @impl GenServer
  # Handles the `:repair` message sent by the timer: the breaker is repaired,
  # the timer reference is cleared, and the repair stop event is emitted with
  # the start time and tag carried by the message.
  def handle_info({:repair, start_time, tag}, %{breaker: broken} = state) do
    repaired = Breaker.repair(broken)
    new_state = cancel_timer(%{state | breaker: repaired})

    :ok = emit_repair_stop_event(get_id(), start_time, tag)

    {:noreply, new_state}
  end

  # Stores `breaker` in the state and aligns the repair timer with it.
  #
  # A repairable breaker gets a fresh timer: any previous one is cancelled, the
  # repair start event is emitted and a `:repair` message is scheduled after the
  # breaker's `repair_time`. Otherwise any pending timer is simply cancelled.
  defp schedule_or_cancel_repair(%Breaker{repair_time: delay} = breaker, state) do
    case Breaker.repairable?(breaker) do
      true ->
        idle_state = cancel_timer(state)
        {start_time, tag} = emit_repair_start_event(get_id())
        timer = Process.send_after(self(), {:repair, start_time, tag}, delay)

        %{idle_state | breaker: breaker, repair: timer}

      false ->
        cancel_timer(%{state | breaker: breaker})
    end
  end

  # Looks up the id under which the calling process is registered in
  # `TripSwitch.Registry`. The match asserts that there is exactly one key.
  defp get_id do
    registered_keys = Registry.keys(TripSwitch.Registry, self())
    [switch_id] = registered_keys

    switch_id
  end

  # Cancels the pending repair timer, if any, and clears its reference from the state.
  defp cancel_timer(%{repair: nil} = state), do: state

  defp cancel_timer(%{repair: timer} = state) when is_reference(timer) do
    Process.cancel_timer(timer)

    %{state | repair: nil}
  end

  # Emits `[:trip_switch, :repair, :start]` for the switch `id`.
  #
  # Returns `{start_time, tag}`, where `start_time` is the *monotonic* time at
  # which the repair started and `tag` identifies this repair. Both are meant to
  # be handed back to `emit_repair_stop_event/3` so the stop event can report a
  # duration and share the same tag.
  defp emit_repair_start_event(id) do
    tag = FlakeId.get()
    start_time = System.monotonic_time()
    metadata = %{id: id, tag: tag}
    measurements = %{monotonic_time: start_time, system_time: System.system_time()}

    :ok = :telemetry.execute([@event_prefix, :repair, :start], measurements, metadata)

    # The monotonic reading (not the wall-clock one) is returned: durations must
    # be computed between two readings of the same clock.
    {start_time, tag}
  end

  # Emits `[:trip_switch, :repair, :stop]` for the switch `id`.
  #
  # `start_time` is the monotonic time returned by `emit_repair_start_event/1`;
  # the reported `duration` is the time elapsed since then, in native time units.
  defp emit_repair_stop_event(id, start_time, tag) do
    now = System.monotonic_time()
    metadata = %{id: id, tag: tag}

    measurements = %{
      monotonic_time: now,
      duration: now - start_time
    }

    :ok = :telemetry.execute([@event_prefix, :repair, :stop], measurements, metadata)
  end

  # Fetches the breaker currently held by the switch process registered under `id`.
  defp get(id) do
    id
    |> via()
    |> GenServer.call(:get)
  end
  # Builds the `:via` tuple that addresses a switch process through `TripSwitch.Registry`.
  defp via(id) do
    {:via, Registry, {TripSwitch.Registry, id}}
  end
end