defmodule Nerves.Runtime.Heart do
@moduledoc """
Functions for querying Nerves Heart and the device's watchdog.

Nerves Heart integrates Erlang's
[heart](https://www.erlang.org/doc/man/heart.html) process with a hardware
watchdog. This makes it possible for a device to recover from a hang. The
way it works is that the Erlang runtime regularly checks that it's OK. If so,
it sends a message to `heart`. Nerves Heart then pets the hardware watchdog.
If messages ever stop being sent to `heart`, the hardware watchdog will trip
and reboot the device. You can add additional health checks for your
application by providing a callback to `:heart.set_callback/2`.

See [nerves_heart](https://github.com/nerves-project/nerves_heart) for more
information.
"""
require Logger

@typedoc """
Status information reported by Nerves Heart.

The map has either the v2.x shape (`t:info_v2/0`) or the v1.x shape
(`t:info_v1/0`).

See [nerves_heart](https://github.com/nerves-project/nerves_heart) for more
information.
"""
@type info() :: info_v2() | info_v1()
@typedoc """
Nerves Heart v2.x information

Each key corresponds to a `key=value` line in the status text reported by
heart. `wdt_last_boot` is `{:unknown, value}` when the reported value is
neither `"power_on"` nor `"watchdog"`.
"""
@type info_v2() :: %{
        program_name: String.t(),
        program_version: Version.t(),
        heartbeat_timeout: non_neg_integer(),
        heartbeat_time_left: non_neg_integer(),
        init_handshake_happened: boolean(),
        init_handshake_timeout: non_neg_integer(),
        init_handshake_time_left: non_neg_integer(),
        init_grace_time_left: non_neg_integer(),
        snooze_time_left: non_neg_integer(),
        wdt_identity: String.t(),
        wdt_firmware_version: non_neg_integer(),
        wdt_last_boot: :power_on | :watchdog | {:unknown, String.t()},
        wdt_options: non_neg_integer() | [atom()],
        wdt_pet_time_left: non_neg_integer(),
        wdt_pre_timeout: non_neg_integer(),
        # The status parser emits `:wdt_time_left` (not `:wdt_timeout_left`)
        wdt_time_left: non_neg_integer(),
        wdt_timeout: non_neg_integer()
      }
@typedoc """
Nerves Heart v1.x information

Each key corresponds to a `key=value` line in the status text reported by
heart. `last_boot` is `{:unknown, value}` when the reported value is neither
`"power_on"` nor `"watchdog"`.
"""
@type info_v1() :: %{
        program_name: String.t(),
        program_version: Version.t(),
        identity: String.t(),
        firmware_version: non_neg_integer(),
        options: non_neg_integer() | [atom()],
        time_left: non_neg_integer(),
        pre_timeout: non_neg_integer(),
        timeout: non_neg_integer(),
        last_boot: :power_on | :watchdog | {:unknown, String.t()},
        heartbeat_timeout: non_neg_integer()
      }
@doc """
Check whether Nerves Heart is running.

Returns `true` when `status/0` succeeds and reports a `program_name` of
`"nerves_heart"`, and `false` in every other case. If you're using a Nerves
device, this is expected to be `true` except possibly when porting Nerves to
new hardware, so it works as a quick sanity check.
"""
@spec running?() :: boolean()
def running?() do
  match?({:ok, %{program_name: "nerves_heart"}}, status())
end
@doc """
Notify Nerves Heart that initialization is complete.

This can be used to ensure that the code that calls `:heart.set_callback/2`
gets run. To use, add the following to your project's `rel/vm.args.eex`:

```text
## Require an initialization handshake within 15 minutes
-env HEART_INIT_TIMEOUT 900
```

Then call `Nerves.Runtime.Heart.init_complete/0` after
`:heart.set_callback/2` is called.

The handshake is sent from a separate process, so this function always
returns `:ok` immediately. A failed handshake is only reported in the log.

Supported by Nerves Heart v2.0 and later.
"""
@spec init_complete() :: :ok
def init_complete() do
  # This must be run in another process to avoid blocking the current
  # process when it is involved in the heart callback.
  {:ok, _pid} =
    Task.start(fn ->
      with {:error, reason} <- run_command(~c"init_handshake", "~> 2.0") do
        # Use inspect/1: the reason comes from heart and isn't guaranteed to
        # be an atom (`:heart.set_cmd/1` can return `{:bad_cmd, cmd}`), and
        # interpolating a tuple directly would raise.
        Logger.error("Heart: handshake failed due to #{inspect(reason)}")
      end
    end)

  :ok
end
@doc """
Initiate a reboot that's guarded by the hardware watchdog.

Most users should call `Nerves.Runtime.reboot/0` instead, which calls this
and shuts down the Erlang VM.

Returns `:ok` on success, or `{:error, reason}` if the version check or the
command sent to heart fails.

Supported by Nerves Heart v2.0 and later.
"""
@spec guarded_reboot() :: :ok | {:error, atom()}
def guarded_reboot(), do: run_command(~c"guarded_reboot", "~> 2.0")
@doc """
Initiate a poweroff that's guarded by the hardware watchdog.

Most users should call `Nerves.Runtime.poweroff/0` instead, which calls this
and shuts down the Erlang VM.

Returns `:ok` on success, or `{:error, reason}` if the version check or the
command sent to heart fails.

Supported by Nerves Heart v2.0 and later.
"""
@spec guarded_poweroff() :: :ok | {:error, atom()}
def guarded_poweroff(), do: run_command(~c"guarded_poweroff", "~> 2.0")
@doc """
Snooze heart related reboots for the next 15 minutes.

Run this to buy some time if reboots from heart or the hardware watchdog are
getting in the way.

If heart doesn't respond to the snooze command, a `USR1` signal is sent to
the `heart` process as a fallback.

Supported by Nerves Heart v2.2 and later.
"""
@spec snooze() :: :ok | {:error, atom()}
def snooze() do
  case run_command(~c"snooze", "~> 2.2") do
    {:error, :unresponsive} ->
      # An unresponsive heart probably means that the heart callback is
      # stuck, and in that state the heart version can't be determined
      # either. Nerves Heart 2.2 and later support USR1. Earlier versions
      # exit on it (all signals caused an exit prior to 2.2). The caller is
      # probably desperate, so give it a try.
      kill_usr1_heart()

    result ->
      result
  end
end
# Send SIGUSR1 to every process named "heart" using `killall`. Returns `:ok`
# when `killall` exits with status 0 and `{:error, :failed_to_snooze}`
# otherwise.
defp kill_usr1_heart() do
  {_output, exit_status} = System.cmd("killall", ["-USR1", "heart"])

  if exit_status == 0, do: :ok, else: {:error, :failed_to_snooze}
end
@doc """
Return the current Nerves Heart status.

The status text is requested from heart with `:heart.get_cmd/0` and parsed
into a map. An `{:error, reason}` tuple is returned when heart can't be
queried or isn't Nerves Heart.
"""
@spec status() :: {:ok, info()} | {:error, atom()}
def status() do
  case timed_cmd(:get_cmd, []) do
    {:ok, cmd} -> parse_cmd(cmd)
    error -> error
  end
end
@doc """
Raising version of `status/0`.

Returns the status map directly and raises a `MatchError` when `status/0`
returns anything other than `{:ok, info}`.
"""
@spec status!() :: info()
def status!() do
  case status() do
    {:ok, info} -> info
    other -> raise MatchError, term: other
  end
end
# Send `cmd` (a charlist) to heart via `:heart.set_cmd/1`, but only when the
# running Nerves Heart version satisfies `requirement`. Any error from the
# version check is returned unchanged.
defp run_command(cmd, requirement) when is_list(cmd) do
  case check_version(requirement) do
    :ok -> timed_cmd(:set_cmd, [cmd])
    error -> error
  end
end
# Check that the running Nerves Heart satisfies the version `requirement`.
#
# Returns `:ok` when it does, `{:error, :unsupported}` when the reported
# version doesn't match, `{:error, :not_nerves_heart}` when the status has no
# `program_version` at all, and passes any error from `status/0` through.
defp check_version(requirement) do
  case status() do
    {:ok, %{program_version: version}} ->
      if Version.match?(version, requirement), do: :ok, else: {:error, :unsupported}

    {:ok, _info} ->
      # The status text parsed, but didn't include a version. Report this as
      # an error tuple rather than raising a KeyError in the caller.
      {:error, :not_nerves_heart}

    error ->
      error
  end
end
# Call `:heart.<method>(args...)` from a task and wait up to `timeout`
# milliseconds for the result.
#
# Returns whatever the heart call returned, `{:error, :unresponsive}` when it
# didn't finish in time (the task is killed), or `{:error, :heart_exited}`
# when the task exited without a result.
defp timed_cmd(method, args, timeout \\ 1000) do
  task = Task.async(fn -> safe_heart(method, args) end)

  case Task.yield(task, timeout) || Task.shutdown(task, :brutal_kill) do
    {:ok, result} ->
      result

    {:exit, reason} ->
      # The task is linked to the caller, so this is only seen by callers
      # that trap exits. Without this clause they'd get a CaseClauseError.
      Logger.warning("Heart: heart command exited: #{inspect(reason)}")
      {:error, :heart_exited}

    nil ->
      Logger.warning("Heart: heart unresponsive. A heart callback is probably taking too long.")
      {:error, :unresponsive}
  end
end
# Invoke `fun` on Erlang's `:heart` module with `fun_args`, converting the
# ArgumentError raised by the call into `{:error, :no_heart}`.
defp safe_heart(fun, fun_args) do
  apply(:heart, fun, fun_args)
rescue
  _error in ArgumentError ->
    # An ArgumentError is what gets raised when heart isn't running
    Logger.error("Heart: Erlang heart isn't running. Check vm.args.")
    {:error, :no_heart}
end
# Parse the status text returned by heart (a charlist of newline-separated
# `key=value` lines) into a map. Lines with unrecognized keys and empty lines
# are skipped. An empty charlist means that heart isn't Nerves Heart, and any
# exception raised while parsing is reported as `{:error, :parse_error}`.
@doc false
@spec parse_cmd(list()) :: {:ok, info()} | {:error, atom()}
def parse_cmd([]), do: {:error, :not_nerves_heart}

def parse_cmd(cmd) when is_list(cmd) do
  info =
    cmd
    |> to_string()
    |> String.split("\n")
    |> Enum.map(fn line -> line |> String.split("=", parts: 2) |> parse_attribute() end)
    # Skipped lines come back as nil
    |> Enum.filter(& &1)
    |> Map.new()

  {:ok, info}
rescue
  _ -> {:error, :parse_error}
end
# Maps each name that can appear in a `name=value` status line to the key used
# in the result map and the kind of value it carries. Names that aren't listed
# here are ignored.
@attribute_parsers %{
  # v1 and v2
  "program_name" => {:program_name, :string},
  "program_version" => {:program_version, :version},
  "heartbeat_timeout" => {:heartbeat_timeout, :integer},
  # v1
  "identity" => {:identity, :string},
  "firmware_version" => {:firmware_version, :integer},
  "options" => {:options, :options},
  "time_left" => {:time_left, :integer},
  "pre_timeout" => {:pre_timeout, :integer},
  "timeout" => {:timeout, :integer},
  "last_boot" => {:last_boot, :last_boot},
  # v2
  "wdt_identity" => {:wdt_identity, :string},
  "wdt_firmware_version" => {:wdt_firmware_version, :integer},
  "wdt_options" => {:wdt_options, :options},
  "wdt_pet_time_left" => {:wdt_pet_time_left, :integer},
  "wdt_pre_timeout" => {:wdt_pre_timeout, :integer},
  "wdt_timeout" => {:wdt_timeout, :integer},
  "wdt_time_left" => {:wdt_time_left, :integer},
  "wdt_last_boot" => {:wdt_last_boot, :last_boot},
  "heartbeat_time_left" => {:heartbeat_time_left, :integer},
  "init_handshake_timeout" => {:init_handshake_timeout, :integer},
  "snooze_time_left" => {:snooze_time_left, :integer},
  "init_grace_time_left" => {:init_grace_time_left, :integer},
  "init_handshake_time_left" => {:init_handshake_time_left, :integer},
  "init_handshake_happened" => {:init_handshake_happened, :boolean}
}

# Convert one split status line (`[name, value]`) into a `{key, parsed_value}`
# tuple. Returns nil for unknown names and for an empty line (`[""]`).
defp parse_attribute([name, value]) do
  case Map.fetch(@attribute_parsers, name) do
    {:ok, {key, kind}} -> {key, parse_value(kind, value)}
    :error -> nil
  end
end

defp parse_attribute([""]), do: nil

# Parse a raw value string according to its kind
defp parse_value(:string, str), do: str
defp parse_value(:version, str), do: Version.parse!(str)
defp parse_value(:integer, str), do: atoi(str)
# Options are either a hex bitmask ("0x...") or a comma-separated list
defp parse_value(:options, "0x" <> hex), do: atoi(hex, 16)
defp parse_value(:options, option_list), do: parse_option_list(option_list)
defp parse_value(:last_boot, str), do: parse_last_boot(str)
defp parse_value(:boolean, str), do: parse_bool(str)
# helpers
# Convert a string to an integer in the given base (decimal by default).
# Raises ArgumentError when the string isn't a valid integer.
defp atoi(str, base \\ 10), do: String.to_integer(str, base)
# Translate the reported last boot reason into an atom. Values other than
# "power_on" and "watchdog" are returned as `{:unknown, value}`.
defp parse_last_boot(reason) do
  case reason do
    "power_on" -> :power_on
    "watchdog" -> :watchdog
    _ -> {:unknown, reason}
  end
end
# Split a comma-separated option string into a list of atoms, dropping empty
# entries.
# NOTE(review): atoms are created dynamically from heart's output; this
# presumes the set of option names reported by heart is small and fixed.
defp parse_option_list(options) do
  options
  |> String.split(",", trim: true)
  |> Enum.map(&String.to_atom/1)
end
# Only "1" and "true" are treated as true; anything else is false
defp parse_bool(value), do: value in ["1", "true"]
end