# lib/firebreak/checks.ex

defmodule Firebreak.Checks do
  @moduledoc """
  Tier-1 structural checks: facts you can read straight off the supervision
  tree and each process module, no cross-module graph needed. Cheap, fast,
  deterministic — the kind of thing that belongs in a CI gate.
  """

  alias Firebreak.{Finding, ModuleInfo}

  @one_for_all_threshold 3
  @default_max_restarts 3
  @subtree_strategies [:one_for_all, :rest_for_one]

  @doc """
  Runs every tier-1 check against each module and returns all findings as one list.

  Findings come out grouped by module, in the order `modules` was given; within a
  module they follow the order of the check list in the body.
  """
  @spec run([ModuleInfo.t()]) :: [Finding.t()]
  def run(modules) do
    index = Map.new(modules, fn info -> {info.name, info} end)

    # Names of modules the narrower DynamicSupervisor + Registry check fires on.
    # The general lookup-or-create check skips these names, so a get-or-start
    # that matches both is reported once, under the more specific id.
    narrow_race_names =
      modules
      |> Enum.reject(fn info -> dynamic_registry_race(info) == [] end)
      |> MapSet.new(fn info -> info.name end)

    checks = [
      &one_for_all_blast/1,
      &missing_trap_exit/1,
      &shutdown_vs_intensity/1,
      &default_intensity/1,
      &supervisor_subtree_blast(&1, index),
      &dynamic_registry_race/1,
      &lookup_or_create_race(&1, narrow_race_names),
      &unhandled_port_exit/1,
      &dynamic_supervisor_restart_blast(&1, index),
      &start_link_in_callback/1
    ]

    for info <- modules, check <- checks, finding <- check.(info), do: finding
  end

  # A direct `start_link` of a *whole process* from inside an OTP callback
  # (`init`, `handle_*`, ...) is an anti-pattern. The callback fires on every
  # (re)start or message, so the target is spawned again each time: unnamed
  # targets pile up as duplicates, named ones fail with `:already_started`.
  # Such a process should be a supervised child instead.
  #
  # Emits one finding per distinct {target, callback} pair, keeping the first
  # recorded site of each pair.
  defp start_link_in_callback(%ModuleInfo{callback_starts: []}), do: []

  defp start_link_in_callback(%ModuleInfo{callback_starts: starts} = info) do
    distinct_sites = Enum.uniq_by(starts, fn {target, callback, _line} -> {target, callback} end)

    for {target, callback, line} <- distinct_sites do
      shown = inspect(target)

      # `init` and `handle_continue` are initialization callbacks that run once
      # per (re)start, so a start_link there is often a deliberate linked child:
      # a lower-confidence smell (:low). Any other callback (`handle_call`,
      # `handle_cast`, `handle_info`, `terminate`) re-spawns on every message or
      # at shutdown, which is a genuine leak/duplicate hazard (:medium).
      severity = if callback in [:init, :handle_continue], do: :low, else: :medium

      %Finding{
        check: :start_link_in_callback,
        severity: severity,
        confidence: :best_effort,
        module: info.name,
        file: info.file,
        line: line || info.line,
        message:
          "starts #{shown} with a direct start_link inside the #{callback} callback. " <>
            "#{callback} runs on every #{callback_when(callback)}, so #{shown} is (re)spawned each " <>
            "time - it leaks duplicates if unnamed, or fails on :already_started if named. " <>
            "Supervise #{shown} as a child instead.",
        details: %{target: target, callback: callback}
      }
    end
  end

  # Describes, for the finding message, what triggers the callback `cb`. The
  # result completes the sentence "<cb> runs on every ...".
  #
  # `init` and `handle_continue` run once per (re)start of the process.
  # `terminate` runs when the process shuts down, not in response to a message,
  # so it gets its own wording instead of falling through to the message case.
  # Everything else (`handle_call`/`handle_cast`/`handle_info`, ...) is treated
  # as message-driven.
  defp callback_when(cb) when cb in [:init, :handle_continue], do: "(re)start of this process"
  defp callback_when(:terminate), do: "shutdown of this process"
  defp callback_when(_), do: "matching message"

  # Under :one_for_all a crash of any one child restarts all of its siblings.
  # Reported once the supervisor has at least @one_for_all_threshold children;
  # six or more children raises the severity to :high.
  defp one_for_all_blast(%ModuleInfo{strategy: :one_for_all, children: kids} = info)
       when length(kids) >= @one_for_all_threshold do
    count = length(kids)
    severity = if count < 6, do: :medium, else: :high

    finding = %Finding{
      check: :one_for_all_blast_radius,
      severity: severity,
      confidence: confidence(info),
      module: info.name,
      file: info.file,
      line: info.line,
      message:
        ":one_for_all with #{count} children - any single child crash restarts all #{count}. " <>
          "If these children aren't genuinely interdependent, a narrower strategy contains the blast.",
      details: %{strategy: :one_for_all, child_count: count, children: child_names(kids)}
    }

    [finding]
  end

  defp one_for_all_blast(_other), do: []

  # A GenServer that links a process without trapping exits goes down when the
  # linked process does. Fires for a :genserver module that does not trap exits
  # and has at least one recorded unsupervised spawn; the finding is anchored at
  # the first spawn's line, falling back to the module's line.
  defp missing_trap_exit(
         %ModuleInfo{kind: :genserver, traps_exit?: false, unsupervised_spawns: spawns} = info
       )
       when spawns != [] do
    # Each spawn entry is a tuple: element 0 is the spawn kind, element 1 its line.
    spawn_kinds =
      spawns
      |> Enum.uniq_by(fn site -> elem(site, 0) end)
      |> Enum.map_join(", ", fn site -> to_string(elem(site, 0)) end)

    first_line = elem(hd(spawns), 1)

    finding = %Finding{
      check: :missing_trap_exit,
      severity: :high,
      module: info.name,
      file: info.file,
      line: first_line || info.line,
      message:
        "links a process (#{spawn_kinds}) but does not trap exits; a crash in the linked process " <>
          "takes #{inspect(info.name)} down with it. Use a supervised Task or set " <>
          "Process.flag(:trap_exit, true).",
      details: %{spawns: spawns}
    }

    [finding]
  end

  defp missing_trap_exit(_other), do: []

  # A child whose shutdown timeout is longer than the supervisor's intensity
  # window can burn the restart budget on intentional shutdowns alone. Only
  # integer shutdown values are compared; anything else is skipped.
  defp shutdown_vs_intensity(%ModuleInfo{kind: kind} = info)
       when kind in [:supervisor, :dynamic_supervisor] do
    # The effective window is in seconds; shutdown timeouts are compared in ms.
    window_ms = ModuleInfo.effective_max_seconds(info) * 1000

    info.children
    |> Enum.filter(fn child -> is_integer(child.shutdown) and child.shutdown > window_ms end)
    |> Enum.map(fn child ->
      %Finding{
        check: :shutdown_exceeds_intensity_window,
        severity: :low,
        confidence: confidence(info),
        module: info.name,
        file: info.file,
        line: child.line || info.line,
        message:
          "child #{inspect(child.module)} shutdown timeout #{child.shutdown}ms exceeds the supervisor's " <>
            "intensity window #{window_ms}ms; a slow shutdown can exhaust the restart budget.",
        details: %{shutdown: child.shutdown, window_ms: window_ms, child: child.module}
      }
    end)
  end

  defp shutdown_vs_intensity(_other), do: []

  # A tight restart budget in front of many children: a single fast crash-loop
  # brings the whole subtree down. Fires for a supervisor or application with
  # five or more children whose *effective* max_restarts is at or below the OTP
  # default of 3. That covers both the supervisor that never set it (static:
  # max_restarts nil) and the one whose init/1 resolves to the default (exact:
  # max_restarts 3).
  defp default_intensity(%ModuleInfo{kind: kind, children: kids} = info)
       when kind in [:supervisor, :application] and length(kids) >= 5 do
    restarts = ModuleInfo.effective_max_restarts(info)
    seconds = ModuleInfo.effective_max_seconds(info)

    if restarts > @default_max_restarts do
      []
    else
      count = length(kids)

      finding = %Finding{
        check: :default_restart_intensity,
        severity: :info,
        confidence: confidence(info),
        module: info.name,
        file: info.file,
        line: info.line,
        message: intensity_message(info, count, restarts, seconds),
        details: %{max_restarts: restarts, max_seconds: seconds, child_count: count}
      }

      [finding]
    end
  end

  defp default_intensity(_other), do: []

  # Builds the message for the restart-intensity finding.
  #
  #   * first argument - the supervisor's ModuleInfo; only `max_restarts` is read
  #   * `n`  - number of children under the supervisor
  #   * `mr` - effective max_restarts, as computed by the caller
  #   * `ms` - effective max_seconds, as computed by the caller
  #
  # When the source never set max_restarts (nil) the message names it as "the
  # default"; otherwise it states the concrete numbers plainly. Both clauses
  # interpolate the effective values so the text always agrees with the
  # `max_restarts`/`max_seconds` the caller puts in the finding's details, even
  # when max_seconds was set but max_restarts was left unset. With the effective
  # values 3 and 5 the default wording reads "max_restarts: 3 in 5s" / "3x/5s".
  defp intensity_message(%ModuleInfo{max_restarts: nil}, n, mr, ms) do
    "relies on the default restart intensity (max_restarts: #{mr} in #{ms}s) " <>
      "while supervising #{n} " <>
      "children; if any child crash-loops faster than #{mr}x/#{ms}s, this supervisor and all #{n} " <>
      "children go down. Worth setting intensity deliberately."
  end

  defp intensity_message(_m, n, mr, ms) do
    "runs a restart intensity of #{mr} in #{ms}s while supervising #{n} children; if any child " <>
      "crash-loops faster than #{mr}x/#{ms}s, this supervisor and all #{n} children go down."
  end

  # :one_for_all / :rest_for_one amplifies the blast when the children are
  # themselves supervisors: a crash no longer restarts just a sibling worker, it
  # tears down and rebuilds entire sibling *subtrees*. A strategy that looks
  # contained at this level fans out across everything beneath it.
  #
  # One finding per supervisor, listing the distinct child supervisor modules;
  # :high when there are two or more, :medium for a single one.
  defp supervisor_subtree_blast(%ModuleInfo{strategy: strategy, children: kids} = info, by_name)
       when strategy in @subtree_strategies do
    subtree_mods =
      for child <- kids,
          supervisor_child?(child, by_name),
          not is_nil(child.module),
          uniq: true do
        child.module
      end

    case subtree_mods do
      [] ->
        []

      [_ | _] ->
        count = length(subtree_mods)
        severity = if count >= 2, do: :high, else: :medium

        finding = %Finding{
          check: :supervisor_subtree_blast,
          severity: severity,
          confidence: confidence(info),
          module: info.name,
          file: info.file,
          line: info.line,
          message:
            "#{inspect(strategy)} over #{count} child supervisor(s) " <>
              "(#{inspect_mods(subtree_mods)}); a crash here restarts those whole subtrees, " <>
              "not a single worker. #{strategy_hint(strategy)}",
          details: %{strategy: strategy, supervisor_children: subtree_mods}
        }

        [finding]
    end
  end

  defp supervisor_subtree_blast(_info, _by_name), do: []

  # Get-or-start race. A module that both looks up a Registry and starts
  # children under a DynamicSupervisor is probably doing lookup-or-start: two
  # callers can race between the lookup miss and the start, and one of them then
  # gets `{:error, {:already_started, pid}}`. This is a heuristic (the lookup and
  # the start may be unrelated), hence info/best-effort and conditional wording.
  #
  # The finding is anchored at the first registry edge's line, falling back to
  # the module's line.
  defp dynamic_registry_race(%ModuleInfo{dynamic_start?: true, edges: edges} = info) do
    registry_edges = for edge <- edges, edge.kind == :registry, do: edge

    if registry_edges == [] do
      []
    else
      first_edge = hd(registry_edges)

      text =
        "looks up a Registry and starts children under a DynamicSupervisor; if this is a " <>
          "lookup-or-start, two concurrent callers can both miss the lookup and both start, " <>
          "and one gets {:error, {:already_started, pid}}. Handle that return value, or " <>
          "register via `:via` at start so a duplicate start fails atomically."

      [
        %Finding{
          check: :dynamic_supervisor_registry_race,
          severity: :info,
          confidence: :best_effort,
          module: info.name,
          file: info.file,
          line: first_edge.line || info.line,
          message: text,
          details: %{registry_edges: length(registry_edges)}
        }
      ]
    end
  end

  defp dynamic_registry_race(_other), do: []

  # Generalised lookup-or-create race (Christakis & Sagonas, PADL 2010). A single
  # function reads the registry (`whereis`/`Registry.lookup`/...) and, in the same
  # body, creates a registration (`register`/`start_link`/`spawn`). Read and
  # create are not atomic, so two callers can both miss and both create. The
  # loser then either crashes (a raising `register`) or gets
  # `{:error, {:already_started}}` (a soft start), and the process it just
  # spawned is left unregistered -- a ghost nothing supervises (the orphan check
  # flags it separately if it is stateful and in no tree).
  #
  # Heuristic, so best-effort and worded conditionally. Modules named in
  # `dyn_race` are skipped: `dynamic_supervisor_registry_race` already reports
  # the DynamicSupervisor variant for them.
  defp lookup_or_create_race(%ModuleInfo{lookup_or_create: [_ | _] = sites} = info, dyn_race) do
    case MapSet.member?(dyn_race, info.name) do
      true ->
        []

      false ->
        {fun, site_line, mechanism} = representative_site(sites)

        # A raising `register` crashes the losing caller; a soft start only
        # returns an error that the caller may already handle.
        severity =
          case mechanism do
            :register_raises -> :low
            _ -> :info
          end

        [
          %Finding{
            check: :lookup_or_create_race,
            severity: severity,
            confidence: :best_effort,
            module: info.name,
            file: info.file,
            line: site_line || info.line,
            message: lookup_or_create_message(info, fun, mechanism),
            details: %{function: fun, mechanism: mechanism, sites: length(sites)}
          }
        ]
    end
  end

  defp lookup_or_create_race(_info, _dyn_race), do: []

  # Picks the site to report for a module with several lookup-or-create sites:
  # the first raising-register site (the sharper hazard) if there is one,
  # otherwise simply the first site in the list.
  defp representative_site(sites) do
    fallback = hd(sites)
    raising? = fn {_fun, _line, mechanism} -> mechanism == :register_raises end

    Enum.find(sites, fallback, raising?)
  end

  # Message text for the lookup-or-create finding, one clause per mechanism.
  # `fun` is the function the site was found in; it is shown as `Module.fun`.

  # Raising variant: the losing caller's register/2 raises.
  defp lookup_or_create_message(%ModuleInfo{name: name}, fun, :register_raises) do
    call = "#{inspect(name)}.#{fun}"

    Enum.join([
      call,
      " reads the registry then registers in the same call, with no ",
      "atomicity between the two. Two concurrent callers can both miss the lookup and both ",
      "register; the loser's register/2 raises (the name is already taken), and the process it ",
      "just spawned is left unregistered - a ghost nothing supervises. Register atomically at ",
      "start (`name:`/`:via`) so a duplicate fails cleanly instead of after a live spawn."
    ])
  end

  # Soft variant: the losing caller gets an :already_started error tuple back.
  defp lookup_or_create_message(%ModuleInfo{name: name}, fun, :register_soft) do
    call = "#{inspect(name)}.#{fun}"

    Enum.join([
      call,
      " looks up a process then starts one in the same call, with no ",
      "atomicity between the two. Two concurrent callers can both miss the lookup and both start; ",
      "one gets {:error, {:already_started, pid}} and its just-started process is discarded - a ",
      "ghost until it exits. Handle that return value, or register via `:via`/`name:` at start so ",
      "the duplicate start fails atomically."
    ])
  end

  # External-process bridge with no exit handling. The module opens a port
  # (`Port.open`/`:erlang.open_port`) yet no `handle_info` clause handles the
  # port's termination. A port is linked to its owner, so an abnormal exit
  # either takes the owner down (no trap_exit) or shows up as an unmatched
  # message (trap_exit) and is dropped -- in both cases the external program can
  # die without the owner reacting. Best-effort: letting the linked exit
  # crash-and-restart can be deliberate, so the finding is phrased as a
  # question rather than an accusation.
  defp unhandled_port_exit(
         %ModuleInfo{
           kind: kind,
           opens_port: [first_port | _] = ports,
           handles_port_exit: false
         } = info
       )
       when kind == :genserver or kind == :gen_statem do
    finding = %Finding{
      check: :unhandled_port_exit,
      severity: :low,
      confidence: :best_effort,
      module: info.name,
      file: info.file,
      # Anchor at the first recorded port-open site, else the module itself.
      line: first_port || info.line,
      message: port_exit_message(info),
      details: %{opens_port_at: ports, traps_exit: info.traps_exit?}
    }

    [finding]
  end

  defp unhandled_port_exit(_other), do: []

  # Message text for the unhandled-port-exit finding. The wording depends on
  # whether the owner traps exits.

  # Trapping owner: the port's exit arrives as a message nothing matches.
  defp port_exit_message(%ModuleInfo{name: name, traps_exit?: true}) do
    owner = inspect(name)

    owner <>
      " opens a port but no handle_info clause handles its termination " <>
      "(`{port, {:exit_status, _}}` / `{:EXIT, port, _}`). It traps exits, so the port's death " <>
      "arrives as an unmatched message and is silently dropped - the external program can exit " <>
      "without this process noticing. Match the exit_status/EXIT message and react."
  end

  # Non-trapping owner: the port's abnormal exit propagates over the link.
  defp port_exit_message(%ModuleInfo{name: name}) do
    owner = inspect(name)

    owner <>
      " opens a port but no handle_info clause handles its termination " <>
      "(`{port, {:exit_status, _}}`). It does not trap exits, so an abnormal port exit propagates " <>
      "as a linked exit and takes this process down (relying on a supervisor to restart it). If the " <>
      "external program should be monitored or restarted in place, open with `:exit_status` and " <>
      "handle that message."
  end

  # A DynamicSupervisor holds processes that neither the static nor the exact
  # tree can show: children are added at runtime, so `init/1` reports it empty.
  # The child types are recovered from the `start_child` call sites instead.
  # The hazard the tree hides: when this supervisor restarts (it crashes, a
  # parent restarts it, or it trips its own intensity), every live child dies
  # *at once* and none is recreated -- they were started on demand, not
  # declared. Anything holding a pid or `:via` name to one then gets `:noproc`
  # until the app starts it again.
  #
  # Severity is :low when any recovered child module is stateful, else :info.
  defp dynamic_supervisor_restart_blast(
         %ModuleInfo{kind: :dynamic_supervisor, dynamic_children: dyn} = info,
         by_name
       )
       when dyn != [] do
    started_mods = for entry <- dyn, not is_nil(entry.module), uniq: true, do: entry.module

    case started_mods do
      [] ->
        []

      [_ | _] ->
        stateful? = Enum.any?(started_mods, fn mod -> stateful_child?(mod, by_name) end)
        severity = if stateful?, do: :low, else: :info
        state_note = if stateful?, do: ", losing their state", else: ""

        text =
          "dynamically supervises #{inspect_mods(started_mods)} (started at runtime via " <>
            "start_child, so absent from the tree). A restart of this DynamicSupervisor " <>
            "terminates every live instance at once and does not recreate them" <>
            "#{state_note}; any process holding a pid " <>
            "or `:via` name to one then gets `:noproc` until it is started again."

        [
          %Finding{
            check: :dynamic_supervisor_restart_blast,
            severity: severity,
            confidence: :best_effort,
            module: info.name,
            file: info.file,
            line: info.line,
            message: text,
            details: %{dynamic_children: started_mods, stateful: stateful?}
          }
        ]
    end
  end

  defp dynamic_supervisor_restart_blast(_info, _by_name), do: []

  # Whether `mod` is a module we parsed and ModuleInfo classifies as stateful.
  # A module absent from `by_name` counts as not stateful.
  defp stateful_child?(mod, by_name) do
    with {:ok, %ModuleInfo{} = known} <- Map.fetch(by_name, mod) do
      ModuleInfo.stateful?(known)
    else
      _ -> false
    end
  end

  # Decides whether a child spec refers to a supervisor: either the (exact)
  # spec is typed :supervisor, or the child's own module was parsed and
  # ModuleInfo classifies it as one. Children without a module, or whose module
  # is not in `by_name`, are treated as non-supervisors.
  defp supervisor_child?(%{type: :supervisor}, _by_name), do: true

  defp supervisor_child?(%{module: mod}, by_name) when mod != nil do
    with {:ok, %ModuleInfo{} = known} <- Map.fetch(by_name, mod) do
      ModuleInfo.supervisor?(known)
    else
      _ -> false
    end
  end

  defp supervisor_child?(_child, _by_name), do: false

  # Closing sentence of the subtree-blast message: a remediation hint that
  # depends on which of the two subtree strategies is in use.
  defp strategy_hint(:one_for_all) do
    "If those subtrees are independent, :one_for_one isolates each crash."
  end

  defp strategy_hint(:rest_for_one) do
    head = "Under :rest_for_one, every supervisor started after the crashed one also restarts; "
    head <> "order independent subtrees first."
  end

  # Renders a list of modules as a comma-separated string of inspected names.
  defp inspect_mods(mods) do
    mods
    |> Enum.map(&inspect/1)
    |> Enum.join(", ")
  end

  # Modules of the given children, in order, leaving out children whose module
  # is nil.
  defp child_names(children) do
    for child <- children, not is_nil(child.module), do: child.module
  end

  # Confidence for tree-derived findings: :exact when the module's tree_source
  # is :exact, :best_effort for any other source.
  defp confidence(%ModuleInfo{tree_source: source}) do
    if source == :exact, do: :exact, else: :best_effort
  end
end