defmodule Firebreak.Checks do
@moduledoc """
Tier-1 structural checks: facts you can read straight off the supervision
tree and each process module, no cross-module graph needed. Cheap, fast,
deterministic — the kind of thing that belongs in a CI gate.
"""
alias Firebreak.{Finding, ModuleInfo}
# Minimum number of children a :one_for_all supervisor must have before
# one_for_all_blast/1 reports it.
@one_for_all_threshold 3
# default_intensity/1 fires when the effective max_restarts is at or below this
# value (the OTP default restart count).
@default_max_restarts 3
# Strategies that supervisor_subtree_blast/2 inspects for child supervisors.
@subtree_strategies [:one_for_all, :rest_for_one]
@doc """
Runs every tier-1 check over `modules` and returns all findings as one flat list.

Findings are grouped per module, in the order the modules were given; within a
module they follow the order in which the checks are listed in the body.
"""
@spec run([ModuleInfo.t()]) :: [Finding.t()]
def run(modules) do
  index = Map.new(modules, fn mod -> {mod.name, mod} end)

  # Names of the modules the narrower DynamicSupervisor + Registry check already
  # reports. The general lookup-or-create check skips these, so a get-or-start
  # matching both is reported once, under the more specific id.
  already_reported =
    modules
    |> Enum.reject(fn mod -> dynamic_registry_race(mod) == [] end)
    |> Enum.map(fn mod -> mod.name end)
    |> MapSet.new()

  Enum.flat_map(modules, fn mod ->
    Enum.concat([
      one_for_all_blast(mod),
      missing_trap_exit(mod),
      shutdown_vs_intensity(mod),
      default_intensity(mod),
      supervisor_subtree_blast(mod, index),
      dynamic_registry_race(mod),
      lookup_or_create_race(mod, already_reported),
      unhandled_port_exit(mod),
      dynamic_supervisor_restart_blast(mod, index),
      start_link_in_callback(mod)
    ])
  end)
end
# A direct `start_link` of a *whole process* from inside an OTP callback
# (`init`, `handle_*`, ...) is an anti-pattern: the callback body runs again on
# every (re)start or message, so the process gets spawned again each time. If it
# is unnamed the duplicates leak; if it is named the start fails with
# `:already_started`. The process belongs under a supervisor as a child.
#
# Emits one finding per distinct {target, callback} pair (the first recorded
# line for that pair is used).
defp start_link_in_callback(%ModuleInfo{callback_starts: []}), do: []

defp start_link_in_callback(%ModuleInfo{callback_starts: starts} = m) do
  distinct = Enum.uniq_by(starts, fn {target, cb, _line} -> {target, cb} end)

  for {target, cb, line} <- distinct do
    # Starts in `handle_call`/`handle_cast`/`handle_info`/`terminate` re-spawn on
    # every message (or at shutdown): a real leak/duplicate hazard (:medium).
    # `init` and `handle_continue` are initialization callbacks that run once
    # per (re)start, so a start_link there is often a deliberate linked child:
    # a lower-confidence smell (:low), raised in case it should be supervised.
    severity = if cb in [:init, :handle_continue], do: :low, else: :medium

    %Finding{
      check: :start_link_in_callback,
      severity: severity,
      confidence: :best_effort,
      module: m.name,
      file: m.file,
      line: line || m.line,
      message:
        "starts #{inspect(target)} with a direct start_link inside the #{cb} callback. " <>
          "#{cb} runs on every #{callback_when(cb)}, so #{inspect(target)} is (re)spawned each " <>
          "time - it leaks duplicates if unnamed, or fails on :already_started if named. " <>
          "Supervise #{inspect(target)} as a child instead.",
      details: %{target: target, callback: cb}
    }
  end
end
# Phrase describing how often callback `cb` runs; it is interpolated into the
# `start_link_in_callback` message as "<cb> runs on every <phrase>".
#
# `init` and `handle_continue` run once per (re)start of the process.
defp callback_when(cb) when cb in [:init, :handle_continue], do: "(re)start of this process"
# `terminate` runs when the process shuts down, not once per message; without
# this clause the message wrongly claimed "terminate runs on every matching message".
defp callback_when(:terminate), do: "shutdown of this process"
# Every other callback (`handle_call`, `handle_cast`, `handle_info`, ...) runs per message.
defp callback_when(_), do: "matching message"
# Under :one_for_all a crash in any one child restarts all of its siblings.
# Reported once the supervisor has at least @one_for_all_threshold children;
# six or more children raise the severity from :medium to :high.
defp one_for_all_blast(%ModuleInfo{strategy: :one_for_all, children: children} = m)
     when length(children) >= @one_for_all_threshold do
  n = length(children)
  severity = if n >= 6, do: :high, else: :medium

  finding = %Finding{
    check: :one_for_all_blast_radius,
    severity: severity,
    confidence: confidence(m),
    module: m.name,
    file: m.file,
    line: m.line,
    message:
      ":one_for_all with #{n} children - any single child crash restarts all #{n}. " <>
        "If these children aren't genuinely interdependent, a narrower strategy contains the blast.",
    details: %{strategy: :one_for_all, child_count: n, children: child_names(children)}
  }

  [finding]
end

defp one_for_all_blast(_other), do: []
# A GenServer that links to another process without trapping exits goes down
# whenever that linked process crashes. Fires for a :genserver module that does
# not trap exits and has at least one recorded unsupervised spawn; the finding
# points at the first spawn's line.
defp missing_trap_exit(
       %ModuleInfo{kind: :genserver, traps_exit?: false, unsupervised_spawns: spawns} = m
     )
     when spawns != [] do
  # Distinct spawn kinds, in first-seen order, for the message.
  kinds =
    spawns
    |> Enum.uniq_by(fn spawn -> elem(spawn, 0) end)
    |> Enum.map_join(", ", fn spawn -> to_string(elem(spawn, 0)) end)

  first_line = elem(List.first(spawns), 1)

  finding = %Finding{
    check: :missing_trap_exit,
    severity: :high,
    module: m.name,
    file: m.file,
    line: first_line || m.line,
    message:
      "links a process (#{kinds}) but does not trap exits; a crash in the linked process " <>
        "takes #{inspect(m.name)} down with it. Use a supervised Task or set " <>
        "Process.flag(:trap_exit, true).",
    details: %{spawns: spawns}
  }

  [finding]
end

defp missing_trap_exit(_other), do: []
# A child whose shutdown timeout is longer than the supervisor's intensity
# window can use up the restart budget through intentional shutdowns alone.
# Only integer shutdown values are compared; the window is the effective
# max_seconds converted to milliseconds. One finding per offending child.
defp shutdown_vs_intensity(%ModuleInfo{kind: kind} = m)
     when kind in [:supervisor, :dynamic_supervisor] do
  window = ModuleInfo.effective_max_seconds(m) * 1000

  m.children
  |> Enum.filter(fn c -> is_integer(c.shutdown) and c.shutdown > window end)
  |> Enum.map(fn c ->
    %Finding{
      check: :shutdown_exceeds_intensity_window,
      severity: :low,
      confidence: confidence(m),
      module: m.name,
      file: m.file,
      line: c.line || m.line,
      message:
        "child #{inspect(c.module)} shutdown timeout #{c.shutdown}ms exceeds the supervisor's " <>
          "intensity window #{window}ms; a slow shutdown can exhaust the restart budget.",
      details: %{shutdown: c.shutdown, window_ms: window, child: c.module}
    }
  end)
end

defp shutdown_vs_intensity(_other), do: []
# A tight restart budget in front of many children: a single fast crash-loop
# brings the whole subtree down. Applies to :supervisor and :application modules
# with five or more children, and fires when the *effective* max_restarts is at
# or below the OTP default of 3. That covers the supervisor that never set it
# (static: max_restarts nil) as well as the one whose init/1 resolves to the
# default (exact: max_restarts 3).
defp default_intensity(%ModuleInfo{kind: kind, children: children} = m)
     when kind in [:supervisor, :application] and length(children) >= 5 do
  mr = ModuleInfo.effective_max_restarts(m)
  ms = ModuleInfo.effective_max_seconds(m)

  cond do
    mr <= @default_max_restarts ->
      n = length(children)

      finding = %Finding{
        check: :default_restart_intensity,
        severity: :info,
        confidence: confidence(m),
        module: m.name,
        file: m.file,
        line: m.line,
        message: intensity_message(m, n, mr, ms),
        details: %{max_restarts: mr, max_seconds: ms, child_count: n}
      }

      [finding]

    true ->
      []
  end
end

defp default_intensity(_other), do: []
# Message text for the default-intensity finding. If the source never set
# max_restarts (the field is nil) the budget is described as "the default";
# otherwise the concrete numbers passed in (`mr` restarts in `ms` seconds) are
# stated plainly.
defp intensity_message(m, n, mr, ms) do
  case m do
    %ModuleInfo{max_restarts: nil} ->
      "relies on the default restart intensity (max_restarts: 3 in 5s) while supervising #{n} " <>
        "children; if any child crash-loops faster than 3x/5s, this supervisor and all #{n} " <>
        "children go down. Worth setting intensity deliberately."

    _explicit ->
      "runs a restart intensity of #{mr} in #{ms}s while supervising #{n} children; if any child " <>
        "crash-loops faster than #{mr}x/#{ms}s, this supervisor and all #{n} children go down."
  end
end
# :one_for_all / :rest_for_one amplifies the blast when the children are
# supervisors themselves: a crash no longer restarts just a sibling worker, it
# tears down and rebuilds whole sibling *subtrees*. What looks contained at this
# level fans out over everything underneath.
#
# Collects the distinct, non-nil modules of the children recognised as
# supervisors; two or more of them make the finding :high, one makes it :medium.
defp supervisor_subtree_blast(%ModuleInfo{strategy: strategy, children: children} = m, by_name)
     when strategy in @subtree_strategies do
  sup_children =
    for child <- children,
        supervisor_child?(child, by_name),
        not is_nil(child.module),
        uniq: true,
        do: child.module

  case sup_children do
    [] ->
      []

    _some ->
      severity = if length(sup_children) >= 2, do: :high, else: :medium

      finding = %Finding{
        check: :supervisor_subtree_blast,
        severity: severity,
        confidence: confidence(m),
        module: m.name,
        file: m.file,
        line: m.line,
        message:
          "#{inspect(strategy)} over #{length(sup_children)} child supervisor(s) " <>
            "(#{inspect_mods(sup_children)}); a crash here restarts those whole subtrees, " <>
            "not a single worker. #{strategy_hint(strategy)}",
        details: %{strategy: strategy, supervisor_children: sup_children}
      }

      [finding]
  end
end

defp supervisor_subtree_blast(_other, _by_name), do: []
# Get-or-start race. A module that has Registry edges *and* starts children
# under a DynamicSupervisor is probably doing lookup-or-start: two callers can
# interleave between the lookup miss and the start, and one of them then gets
# `{:error, {:already_started, pid}}`. This is a heuristic (the lookup and the
# start may be unrelated), hence info/best-effort and conditional wording.
# The finding points at the first registry edge.
defp dynamic_registry_race(%ModuleInfo{dynamic_start?: true, edges: edges} = m) do
  registry_edges = for edge <- edges, edge.kind == :registry, do: edge

  # An empty list does not match and is returned as-is, i.e. no findings.
  with [first | _] <- registry_edges do
    [
      %Finding{
        check: :dynamic_supervisor_registry_race,
        severity: :info,
        confidence: :best_effort,
        module: m.name,
        file: m.file,
        line: first.line || m.line,
        message:
          "looks up a Registry and starts children under a DynamicSupervisor; if this is a " <>
            "lookup-or-start, two concurrent callers can both miss the lookup and both start, " <>
            "and one gets {:error, {:already_started, pid}}. Handle that return value, or " <>
            "register via `:via` at start so a duplicate start fails atomically.",
        details: %{registry_edges: length(registry_edges)}
      }
    ]
  end
end

defp dynamic_registry_race(_other), do: []
# Generalised lookup-or-create race (Christakis & Sagonas, PADL 2010): one
# function reads the registry (`whereis`/`Registry.lookup`/...) and, in the same
# body, creates a registration (`register`/`start_link`/`spawn`). Read and
# create are not atomic, so two callers can both miss and both create. The loser
# either crashes (a raising `register`) or receives
# `{:error, {:already_started, pid}}` (a soft start), and the process it just
# spawned stays unregistered -- a ghost nothing supervises (the orphan check
# flags that separately if it is stateful and in no tree). Heuristic, so
# best-effort and conditionally worded.
#
# `dyn_race` holds the names of modules already reported by
# `dynamic_supervisor_registry_race`; those are skipped here.
defp lookup_or_create_race(%ModuleInfo{lookup_or_create: [_ | _] = sites} = m, dyn_race) do
  case MapSet.member?(dyn_race, m.name) do
    true ->
      []

    false ->
      {fun, line, mech} = representative_site(sites)

      # A raising `register` crashes the losing caller, while a soft start only
      # hands back an error the caller may already handle.
      severity = if mech == :register_raises, do: :low, else: :info

      [
        %Finding{
          check: :lookup_or_create_race,
          severity: severity,
          confidence: :best_effort,
          module: m.name,
          file: m.file,
          line: line || m.line,
          message: lookup_or_create_message(m, fun, mech),
          details: %{function: fun, mechanism: mech, sites: length(sites)}
        }
      ]
  end
end

defp lookup_or_create_race(_m, _dyn_race), do: []
# Picks the site to report for a module with several lookup-or-create sites:
# the first `:register_raises` site (the sharper hazard) if there is one,
# otherwise the first site in the list.
defp representative_site(sites) do
  fallback = hd(sites)
  raises? = fn {_fun, _line, mechanism} -> mechanism == :register_raises end

  Enum.find(sites, fallback, raises?)
end
# Message text for the lookup-or-create finding, one clause per mechanism.
# `:register_raises` describes the variant where the loser's register raises.
defp lookup_or_create_message(%ModuleInfo{name: name}, fun, :register_raises) do
  Enum.join([
    "#{inspect(name)}.#{fun} reads the registry then registers in the same call, with no ",
    "atomicity between the two. Two concurrent callers can both miss the lookup and both ",
    "register; the loser's register/2 raises (the name is already taken), and the process it ",
    "just spawned is left unregistered - a ghost nothing supervises. Register atomically at ",
    "start (`name:`/`:via`) so a duplicate fails cleanly instead of after a live spawn."
  ])
end

# `:register_soft` describes the variant where the duplicate start returns an
# error tuple instead of raising.
defp lookup_or_create_message(%ModuleInfo{name: name}, fun, :register_soft) do
  Enum.join([
    "#{inspect(name)}.#{fun} looks up a process then starts one in the same call, with no ",
    "atomicity between the two. Two concurrent callers can both miss the lookup and both start; ",
    "one gets {:error, {:already_started, pid}} and its just-started process is discarded - a ",
    "ghost until it exits. Handle that return value, or register via `:via`/`name:` at start so ",
    "the duplicate start fails atomically."
  ])
end
# External-process bridge without exit handling: a process module opens a port
# (`Port.open`/`:erlang.open_port`) yet none of its `handle_info` clauses deals
# with the port terminating. Because a port is linked to its owner, an abnormal
# exit either kills the owner (no trap_exit) or shows up as an unmatched message
# (trap_exit) and gets dropped -- in both cases the external program can die
# with no reaction from the owner. Best-effort: letting the linked exit
# crash-and-restart may be intentional, so the wording asks rather than accuses.
#
# Applies to :genserver and :gen_statem modules; the finding points at the
# first recorded port-open location.
defp unhandled_port_exit(
       %ModuleInfo{
         kind: kind,
         opens_port: [first_port | _] = ports,
         handles_port_exit: false
       } = m
     )
     when kind in [:genserver, :gen_statem] do
  finding = %Finding{
    check: :unhandled_port_exit,
    severity: :low,
    confidence: :best_effort,
    module: m.name,
    file: m.file,
    line: first_port || m.line,
    message: port_exit_message(m),
    details: %{opens_port_at: ports, traps_exit: m.traps_exit?}
  }

  [finding]
end

defp unhandled_port_exit(_other), do: []
# Message text for the unhandled-port-exit finding. The wording depends on
# whether the module traps exits: if it does, the port's death is a dropped
# message; if it does not, the linked exit takes the process down.
defp port_exit_message(%ModuleInfo{name: name, traps_exit?: traps}) do
  if traps == true do
    "#{inspect(name)} opens a port but no handle_info clause handles its termination " <>
      "(`{port, {:exit_status, _}}` / `{:EXIT, port, _}`). It traps exits, so the port's death " <>
      "arrives as an unmatched message and is silently dropped - the external program can exit " <>
      "without this process noticing. Match the exit_status/EXIT message and react."
  else
    "#{inspect(name)} opens a port but no handle_info clause handles its termination " <>
      "(`{port, {:exit_status, _}}`). It does not trap exits, so an abnormal port exit propagates " <>
      "as a linked exit and takes this process down (relying on a supervisor to restart it). If the " <>
      "external program should be monitored or restarted in place, open with `:exit_status` and " <>
      "handle that message."
  end
end
# A DynamicSupervisor owns processes that neither the static nor the exact tree
# can show: children are added at runtime, so `init/1` reports it as empty. The
# child types are recovered from the `start_child` call sites instead. What the
# tree hides: when this supervisor restarts (it crashes, a parent restarts it,
# or it trips its own intensity), all live children die *at once* and none come
# back -- they were started on demand, not declared. Whoever holds a pid or
# `:via` name to one gets `:noproc` until the app starts it again.
#
# Severity is :low when any recovered child module is stateful, else :info.
defp dynamic_supervisor_restart_blast(
       %ModuleInfo{kind: :dynamic_supervisor, dynamic_children: dyn} = m,
       by_name
     )
     when dyn != [] do
  # Distinct child modules in first-seen order; entries without a module are dropped.
  child_mods = for entry <- dyn, not is_nil(entry.module), uniq: true, do: entry.module

  case child_mods do
    [] ->
      []

    _some ->
      stateful? = Enum.any?(child_mods, fn mod -> stateful_child?(mod, by_name) end)
      severity = if stateful?, do: :low, else: :info

      [
        %Finding{
          check: :dynamic_supervisor_restart_blast,
          severity: severity,
          confidence: :best_effort,
          module: m.name,
          file: m.file,
          line: m.line,
          message:
            "dynamically supervises #{inspect_mods(child_mods)} (started at runtime via " <>
              "start_child, so absent from the tree). A restart of this DynamicSupervisor " <>
              "terminates every live instance at once and does not recreate them" <>
              "#{if stateful?, do: ", losing their state", else: ""}; any process holding a pid " <>
              "or `:via` name to one then gets `:noproc` until it is started again.",
          details: %{dynamic_children: child_mods, stateful: stateful?}
        }
      ]
  end
end

defp dynamic_supervisor_restart_blast(_other, _by_name), do: []
# True when `mod` is a module we parsed (present in `by_name`) and
# `ModuleInfo.stateful?/1` says so; unknown modules count as not stateful.
defp stateful_child?(mod, by_name) do
  with %ModuleInfo{} = info <- Map.get(by_name, mod) do
    ModuleInfo.stateful?(info)
  else
    _unknown -> false
  end
end
# Decides whether a child spec refers to a supervisor: either the (exact) spec
# carries `type: :supervisor`, or the child's own module was parsed and
# `ModuleInfo.supervisor?/1` classifies it as one. Anything else is not.
defp supervisor_child?(%{type: :supervisor}, _by_name), do: true

defp supervisor_child?(%{module: mod}, by_name) when mod != nil do
  with %ModuleInfo{} = info <- Map.get(by_name, mod) do
    ModuleInfo.supervisor?(info)
  else
    _unknown -> false
  end
end

defp supervisor_child?(_child, _by_name), do: false
# Closing remediation sentence for the supervisor-subtree finding, per strategy.
defp strategy_hint(:one_for_all) do
  "If those subtrees are independent, :one_for_one isolates each crash."
end

defp strategy_hint(:rest_for_one) do
  "Under :rest_for_one, every supervisor started after the crashed one also restarts; " <>
    "order independent subtrees first."
end
# Renders a list of modules as their `inspect/1` forms separated by ", ".
defp inspect_mods(mods) do
  mods
  |> Enum.map(&inspect/1)
  |> Enum.join(", ")
end
# Modules of the given children, in order, skipping children whose module is nil.
defp child_names(children) do
  for child <- children, not is_nil(child.module), do: child.module
end
# Confidence for tree-derived findings: :exact when the module's tree_source is
# :exact, otherwise :best_effort.
defp confidence(%ModuleInfo{tree_source: source}) do
  if source == :exact, do: :exact, else: :best_effort
end
end