lib/skill_kit/eval/result.ex

Select File
lib/skill_kit/eval/result.ex

defmodule SkillKit.Eval.Result do
  @moduledoc """
  Result of a single eval run.

  A result bundles the eval that was run, the transcript captured while running
  it, and the `%SkillKit.Eval.Check{}` entries scored against it. The `cached`
  field defaults to `false`.

  The run counts as passed only if no check failed. `failure_message/1` turns
  the failing checks and the transcript into the text ExUnit displays when a
  generated eval test fails.
  """

  alias SkillKit.Eval
  alias SkillKit.Eval.Check
  alias SkillKit.Eval.Transcript

  @enforce_keys [:eval, :transcript, :checks]
  defstruct [:eval, :transcript, :checks, cached: false]

  @type t :: %__MODULE__{
          eval: Eval.t(),
          transcript: Transcript.t(),
          checks: [Check.t()],
          cached: boolean()
        }

  # Prefix placed in front of every line of nested (possibly multi-line) text.
  @indent "      "

  @doc "Returns `true` when none of the result's checks failed."
  @spec passed?(t()) :: boolean()
  def passed?(%__MODULE__{checks: checks}) do
    Enum.all?(checks, fn check -> check.passed end)
  end

  @doc "Returns the checks that did not pass, in their original order."
  @spec failures(t()) :: [Check.t()]
  def failures(%__MODULE__{checks: checks}) do
    for check <- checks, !check.passed, do: check
  end

  @doc """
  Returns the non-fatal warning notes carried by the checks (e.g. the judge's).

  Checks whose `warning` is `nil` or `false` contribute nothing; the remaining
  notes are returned in check order.
  """
  @spec warnings(t()) :: [String.t()]
  def warnings(%__MODULE__{checks: checks}) do
    for check <- checks, check.warning, do: check.warning
  end

  @doc """
  Builds the human-readable failure text for ExUnit output.

  The message has three parts separated by blank lines:

    * a headline with the inspected eval name and the failed/total check count,
    * every failing check, followed by its indented detail when it has one,
    * a transcript section listing the prompt, the tools called, and then the
      response (status `:ok`), the inspected error (status `:error`), or the
      bare status for anything else.

  The returned string ends with a newline.
  """
  @spec failure_message(t()) :: String.t()
  def failure_message(%__MODULE__{} = result) do
    # Collect the failing checks once; both the tally and the listing use them.
    failed = failures(result)
    tally = "#{length(failed)}/#{length(result.checks)} checks"
    headline = "eval #{inspect(result.eval.name)} failed #{tally}:"
    failed_section = Enum.map_join(failed, "\n\n", &format_failure/1)

    Enum.join([headline, failed_section, format_transcript(result)], "\n\n") <> "\n"
  end

  # One failing check: its name, plus the indented detail when there is one.
  defp format_failure(%Check{name: name, detail: detail}) do
    case detail do
      nil -> "  ✗ #{name}"
      text -> "  ✗ #{name}\n#{indent(text)}"
    end
  end

  # The transcript section: banner, prompt, tools called, then the outcome.
  defp format_transcript(%__MODULE__{eval: eval, transcript: transcript}) do
    Enum.join(
      [
        "── transcript ──",
        section("prompt", eval.prompt),
        "  tools called: " <> tool_list(transcript.tool_calls),
        outcome(transcript)
      ],
      "\n"
    )
  end

  # What the run ended with, chosen by the transcript's status.
  defp outcome(%Transcript{status: status} = transcript) do
    case status do
      :ok -> section("response", transcript.response)
      :error -> "  error: #{inspect(transcript.error)}"
      other -> "  status: #{other}"
    end
  end

  # A labelled block of text; a nil value is rendered as "(none)".
  defp section(label, text) do
    if is_nil(text) do
      "  #{label}: (none)"
    else
      "  #{label}:\n#{indent(text)}"
    end
  end

  # Comma-separated tool names, or "(none)" for an empty list.
  defp tool_list(names) do
    if names == [], do: "(none)", else: Enum.join(names, ", ")
  end

  # Prefixes every line of `text` with @indent. Indenting the start of the
  # text and the position after each newline is the same as splitting on
  # "\n", prefixing each piece and re-joining.
  defp indent(text) do
    @indent <> String.replace(text, "\n", "\n" <> @indent)
  end
end