lib/vintage_net/power_manager/pm_control.ex

defmodule VintageNet.PowerManager.PMControl do
  @moduledoc """
  Power management control GenServer

  This GenServer runs a PowerManager implementation for a network device. It
  provides the API for powering on and off a device and for signaling that it's
  working well.

  Internally, it runs a state machine that translates power on and off requests
  into actions sent to the `PowerManager` implementation. The state machine
  handles the timing of actions so that hardware gets a chance to initialize
  and isn't reset too quickly. `PowerManager` implementations specify times.

  Since networking devices can sometimes hang or fail in unexpected ways, this
  module can power them off and on to try to get them back in a good state.
  This is implemented in terms of a watchdog. Code that can detect the network
  device being in a good state should call `pet_watchdog/1`. For example, code
  that checks internet connectivity could call `pet_watchdog/1` since that's
  a pretty good sign that the device works. Other checks are possible. If
  `pet_watchdog/1` isn't called, this module will restart the network device.
  """
  use GenServer
  alias VintageNet.PowerManager.StateMachine
  require Logger

  # Filter out poweroff/resets that appear in <10ms. These are generated by
  # programmatically removing and reapplying a configuration.  These are a
  # consequence of VintageNet's strategy of reapplying configurations always
  # and not trying to figure out deltas, even for small stuff. The user almost
  # certainly doesn't want to wait through the shutdown timeouts and boot time
  # to use the device again and that's unnecessary anyway.
  @transient_timeout 10

  @default_watchdog_timeout 60_000

  defmodule State do
    @moduledoc false

    # Server state for PMControl. Every field defaults to `nil`; `init/1` in
    # the enclosing module fills them in.
    #
    # * `impl` / `impl_args` / `impl_state` - the PowerManager module, the
    #   arguments handed to its `init/1`, and the state it returned
    # * `ifname` - the interface name, taken from `impl_args`
    # * `pm_state` - `:off`, `:powering_off` or `:on`
    # * `sm` - the `StateMachine` data
    # * `timer_id` - reference identifying the timer whose expiry is honored
    # * `timer_ref` - reference used to read the time left on the timer
    # * `watchdog_timeout` - watchdog period in milliseconds
    defstruct impl: nil,
              impl_args: nil,
              impl_state: nil,
              ifname: nil,
              pm_state: nil,
              sm: nil,
              timer_id: nil,
              timer_ref: nil,
              watchdog_timeout: nil
  end

  @doc """
  Start the power management server for one network interface

  This is intended to be called via `VintageNet.PowerManager.Supervisor`. The
  server registers itself in `VintageNet.PowerManager.Registry` under the
  `:ifname` found in `:impl_args`.

  Arguments:

  * `:impl` - the module that implements PowerManager
  * `:impl_args` - arguments to pass to the PowerManager's `init/1` call
  """
  @spec start_link(keyword()) :: GenServer.on_start()
  def start_link(args) do
    # The presence of `:ifname` is enforced by PowerManager.Supervisor
    impl_args = args[:impl_args]
    registered_name = via_name(impl_args[:ifname])

    GenServer.start_link(__MODULE__, args, name: registered_name)
  end

  # Build the `:via` tuple that addresses the server for `ifname` through
  # `VintageNet.PowerManager.Registry`.
  defp via_name(ifname),
    do: {:via, Registry, {VintageNet.PowerManager.Registry, ifname}}

  @doc """
  Power on

  Call this whenever an interface should be powered on. Calling it repeatedly
  is fine: if you want the network interface to be on, it is always safe to
  call this, and the internal state machine ignores redundant requests.

  The request is sent asynchronously to the server registered for `ifname`.
  """
  @spec power_on(VintageNet.ifname()) :: :ok
  def power_on(ifname) do
    ifname
    |> via_name()
    |> GenServer.cast(:power_on)
  end

  @doc """
  Power off

  Called when VintageNet stops using an interface. How the request is handled
  depends on the current state of the power management state machine. For
  example, the power could already be off.

  The request is sent asynchronously to the server registered for `ifname`.
  """
  @spec power_off(VintageNet.ifname()) :: :ok
  def power_off(ifname) do
    ifname
    |> via_name()
    |> GenServer.cast(:power_off)
  end

  @doc """
  Pet watchdog

  Call this whenever the network connection is known to be in a good state.
  If it has not been called by the time the watchdog timeout elapses, the
  device will be rebooted.

  The request is sent asynchronously to the server registered for `ifname`.
  """
  @spec pet_watchdog(VintageNet.ifname()) :: :ok
  def pet_watchdog(ifname) do
    ifname
    |> via_name()
    |> GenServer.cast(:pet_watchdog)
  end

  @doc """
  Force reset

  Intended to be triggered by a person - for example from a UI button, or by
  a developer who knows or strongly suspects that something is wrong with the
  network device and that it needs a reboot.

  Resetting a device that has been powered off will NOT power it on.

  Calling this automatically is not recommended, especially as an alternative
  to the watchdog mechanism. It is easier to identify when a device is
  working than it is to identify every way it can fail. Force reset also
  ignores minimum on times, on the assumption that whoever asks for a reset
  is ready to reset now.
  """
  @spec force_reset(VintageNet.ifname()) :: :ok
  def force_reset(ifname) do
    ifname
    |> via_name()
    |> GenServer.cast(:force_reset)
  end

  @doc """
  Send an arbitrary message to the power manager for an interface

  The message is delivered to the server process and, unless it is one of the
  server's own timer messages, ends up in the PowerManager's `handle_info/2`
  callback.

  Returns `:ok` without sending anything when no server is registered for
  `ifname`. Otherwise returns the result of `send/2`, i.e. `message`.
  """
  @spec send_message(VintageNet.ifname(), any()) :: any()
  def send_message(ifname, message) do
    ifname
    |> via_name()
    |> GenServer.whereis()
    |> case do
      nil -> :ok
      server -> send(server, message)
    end
  end

  @doc """
  Return information about the specified power manager

  Returns `{:ok, info}` when a power manager is running for `ifname` and
  `:error` when there is none.

  NOTE: the map returned may change in the future
  """
  @spec info(VintageNet.ifname()) :: {:ok, map()} | :error
  def info(ifname) do
    # Call through the registered name and treat `:noproc` as "not running".
    # Resolving the pid first and then calling it leaves a window in which the
    # server can exit between the two steps; the caller would then crash with
    # a `:noproc` exit instead of receiving the documented `:error`.
    GenServer.call(via_name(ifname), :info)
  catch
    :exit, {:noproc, _call_details} -> :error
  end

  # Build the server state and initialize the PowerManager implementation.
  #
  # Returns `{:ok, state}` when the implementation's `init/1` succeeds. Any
  # other outcome is logged and the server is not started (`:ignore`).
  @impl GenServer
  def init(opts) do
    state = initial_state(opts)

    case safe_init(state) do
      {:ok, impl_state} ->
        {:ok, %{state | impl_state: impl_state}}

      failure ->
        Logger.error(
          "VintageNet: #{state.impl} failed to init and not retrying: #{inspect(failure)}"
        )

        :ignore
    end
  end

  # Assemble the state the server starts with: powered off and no active timer.
  defp initial_state(opts) do
    impl_args = opts[:impl_args]

    %State{
      impl: opts[:impl],
      impl_args: impl_args,
      sm: StateMachine.init(),
      ifname: impl_args[:ifname],
      pm_state: :off,
      watchdog_timeout: impl_args[:watchdog_timeout] || @default_watchdog_timeout,
      timer_id: nil,
      # A plain reference stands in for a timer so that reading the time left
      # works (and reports 0) before the first timer has been started.
      timer_ref: make_ref()
    }
  end

  # Run the implementation's `init/1` without letting a failure escape.
  #
  # Returns whatever `init/1` returns when it completes. A raised exception is
  # logged and returned as `{:error, exception}`; a `throw` or `exit` is logged
  # and returned as `{:error, {kind, reason}}`.
  defp safe_init(state) do
    state.impl.init(state.impl_args)
  rescue
    e ->
      Logger.error(Exception.format(:error, e, __STACKTRACE__))
      {:error, e}
  catch
    # `rescue` only covers raised exceptions. Implementations can also fail
    # with `throw` or `exit` (for example a timed out call to another
    # process); those must reach the caller's "log and don't retry" path too
    # rather than crashing the server start.
    kind, reason ->
      Logger.error(Exception.format(kind, reason, __STACKTRACE__))
      {:error, {kind, reason}}
  end

  # Reply to `info/1` with a snapshot of the implementation, its arguments, the
  # time remaining on the current timer and the state machine's view of things.
  @impl GenServer
  def handle_call(:info, _from, state) do
    remaining_ms = time_left(state)

    reply =
      {:ok,
       %{
         manager: state.impl,
         init_args: state.impl_args,
         time_left: remaining_ms,
         pm_info: StateMachine.info(state.sm, remaining_ms),
         pm_state: state.pm_state
       }}

    {:reply, reply, state}
  end

  # Feed a client request into the state machine and carry out the actions it
  # returns, in order.
  @impl GenServer
  def handle_cast(request, state)
      when request in [:power_on, :power_off, :pet_watchdog, :force_reset] do
    # The request atom doubles as the name of the StateMachine function
    {sm, actions} = apply(StateMachine, request, [state.sm])
    state_with_sm = %{state | sm: sm}

    {:noreply,
     Enum.reduce(actions, state_with_sm, fn action, acc -> run_action(action, acc) end)}
  end

  # Timer expiry and pass-through messages.
  #
  # A `:server_timeout` is only acted on when it carries the id of the timer
  # currently recorded in the state; any other `:server_timeout` is dropped.
  # Every other message is handed to the implementation's `handle_info/2`.
  @impl GenServer
  def handle_info({:server_timeout, fired_id}, %{timer_id: current_id} = state)
      when fired_id === current_id do
    {sm, actions} = StateMachine.timeout(state.sm)

    # The timer has fired, so no timer id is current until an action starts one
    expired = %{state | sm: sm, timer_id: nil}

    {:noreply, Enum.reduce(actions, expired, &run_action/2)}
  end

  def handle_info({:server_timeout, _stale_id}, state) do
    # Left over from a timer that has since been replaced
    {:noreply, state}
  end

  def handle_info(msg, state) do
    {:noreply, impl_state} = run_callback(state, :handle_info, [msg, state.impl_state])

    {:noreply, %{state | impl_state: impl_state}}
  end

  # Carry out one action requested by the state machine and return the updated
  # state.
  #
  # The three power actions log, invoke the matching PowerManager callback and
  # then start a timer for the duration the callback returned. The two timer
  # actions only start a timer.
  defp run_action(:start_powering_off, state),
    do: invoke_and_hold(state, :start_powering_off, :powering_off, "Start powering off")

  defp run_action(:power_off, state),
    do: invoke_and_hold(state, :power_off, :off, "Complete power off")

  defp run_action(:power_on, state),
    do: invoke_and_hold(state, :power_on, :on, "Powering on")

  defp run_action(:start_transient_timer, state), do: start_timer(state, @transient_timeout)

  defp run_action(:start_watchdog_timer, state), do: start_timer(state, state.watchdog_timeout)

  # Log `log_message`, run `callback` on the implementation, record the new
  # implementation state and `pm_state`, then start a timer for the number of
  # milliseconds the callback asked for.
  defp invoke_and_hold(state, callback, pm_state, log_message) do
    Logger.info([log_prefix(state), log_message])

    {:ok, impl_state, wait_ms} = run_callback(state, callback, [state.impl_state])

    start_timer(%{state | impl_state: impl_state, pm_state: pm_state}, wait_ms)
  end

  # Invoke `callback` on the PowerManager implementation with `args` and return
  # its result.
  #
  # Anything raised, thrown or exited by the callback is logged and turned
  # into an exit of this process with reason `:callback_failed`.
  defp run_callback(state, callback, args) do
    apply(state.impl, callback, args)
  catch
    class, payload ->
      description =
        "callback #{callback} raised #{inspect(class)}, #{inspect(payload)}. Exiting"

      Logger.error([log_prefix(state), description])

      exit(:callback_failed)
  end

  # Start the server's timer so that `{:server_timeout, timer_id}` is delivered
  # to this process after `millis` milliseconds, replacing any earlier timer.
  #
  # Returns the state with the new `timer_id` (used to recognize the expiry
  # message) and `timer_ref` (used to read the time left).
  defp start_timer(state, millis) do
    # Cancel the timer being replaced. Its message would be ignored anyway
    # because the id no longer matches, but without this every restart (e.g.
    # each watchdog pet) leaves a timer pending until it expires and delivers
    # a message only to have it discarded. Cancelling a reference that is not
    # an active timer - already expired, or the placeholder reference set at
    # init - just returns `false`.
    _ = Process.cancel_timer(state.timer_ref)

    timer_id = make_ref()
    timer_ref = Process.send_after(self(), {:server_timeout, timer_id}, millis)

    %{state | timer_id: timer_id, timer_ref: timer_ref}
  end

  # Milliseconds until the current timer fires, or 0 when `timer_ref` does not
  # refer to a pending timer.
  defp time_left(state) do
    # `Process.read_timer/1` answers `false` when there is no pending timer
    Process.read_timer(state.timer_ref) || 0
  end

  # Iodata prefix that tags log lines with this server's interface name.
  defp log_prefix(%{ifname: ifname}), do: ["PMControl(", ifname, "): "]
end