# lib/mix/tasks/cmdc.eval.ex

defmodule Mix.Tasks.Cmdc.Eval do
  @shortdoc "跑 cmdc_eval Suite + 输出 JSONL 报告"

  @moduledoc """
  Runs a cmdc_eval suite and optionally writes a JSONL report.

  ## Usage

      $ mix cmdc.eval --suite=internal --model="anthropic:claude-sonnet-4-5"
      $ mix cmdc.eval --suite=bfcl --model="openai:gpt-4o" --report=bfcl.jsonl
      $ mix cmdc.eval --suite=internal --concurrency=8 --timeout=120000

  ## Options

  - `--suite=<name>` required. One of `internal`, `bfcl`, or any full module
    name, with or without the `Elixir.` prefix
    (e.g. `Elixir.CMDCEval.Suites.Internal`)
  - `--model=<id>` required. E.g. `anthropic:claude-sonnet-4-5`
  - `--report=<path>` optional. JSONL report output path (no file is written
    by default)
  - `--concurrency=<n>` optional. Number of cases run concurrently (default 4)
  - `--timeout=<ms>` optional. Per-case timeout in milliseconds (default 60000)

  Optional switches that are not given are not forwarded, so the defaults
  above are whatever `CMDCEval.run/1` applies.

  Unknown switches and values of the wrong type (e.g. `--concurrency=abc`)
  are rejected: the usage line is printed and a `RuntimeError` is raised.
  The same happens when `--suite` or `--model` is missing.

  ## Exit codes

  - `0` - every case passed
  - `1` - at least one case failed
  - `2` - the suite has no cases (e.g. BFCL fixtures were not fetched)
  - `3` - the suite module does not exist or is invalid, or the evaluation
    itself returned an error
  """

  use Mix.Task

  # Switches accepted on the command line, parsed in strict mode.
  @switches [
    suite: :string,
    model: :string,
    report: :string,
    concurrency: :integer,
    timeout: :integer
  ]

  @impl Mix.Task
  @spec run([String.t()]) :: :ok | no_return()
  def run(argv) do
    opts = parse_argv!(argv)

    suite_name = Keyword.get(opts, :suite) || raise_usage("missing --suite=<name>")
    model = Keyword.get(opts, :model) || raise_usage("missing --model=<id>")

    suite_module = resolve_suite(suite_name)

    if is_nil(suite_module) do
      Mix.shell().error("Suite 不存在: #{suite_name}")
      exit({:shutdown, 3})
    end

    # The application is started only once the arguments have been validated.
    Mix.Task.run("app.start")

    # Optional switches are forwarded only when present, so that
    # CMDCEval.run/1 can apply its own defaults for the missing ones.
    run_opts =
      [suite: suite_module, model: model]
      |> maybe_put(:report_path, Keyword.get(opts, :report))
      |> maybe_put(:concurrency, Keyword.get(opts, :concurrency))
      |> maybe_put(:timeout_ms, Keyword.get(opts, :timeout))

    case CMDCEval.run(run_opts) do
      {:ok, report} ->
        print_summary(report)

        # Map the outcome onto the documented exit codes; an empty suite is
        # checked first so it is never reported as a plain success.
        cond do
          report.summary.total == 0 -> exit({:shutdown, 2})
          report.summary.fail > 0 -> exit({:shutdown, 1})
          true -> :ok
        end

      {:error, reason} ->
        Mix.shell().error("评测失败: #{inspect(reason)}")
        exit({:shutdown, 3})
    end
  end

  # Parses `argv` in strict mode and returns the recognised options.
  #
  # OptionParser.parse/2 does not raise on unknown switches or on values of
  # the wrong type; it returns them as `{switch, value}` pairs in the third
  # tuple element. Ignoring that list would silently fall back to defaults on
  # a typo, so any entry there is reported as a usage error instead.
  defp parse_argv!(argv) do
    case OptionParser.parse(argv, strict: @switches) do
      {opts, _args, []} ->
        opts

      {_opts, _args, invalid} ->
        rendered =
          Enum.map_join(invalid, ", ", fn
            {switch, nil} -> switch
            {switch, value} -> "#{switch}=#{value}"
          end)

        raise_usage("invalid option(s): " <> rendered)
    end
  end

  # Maps a `--suite` value to a suite module, or `nil` when the module cannot
  # be loaded. The two short names are aliases for the built-in suites.
  defp resolve_suite("internal"), do: CMDCEval.Suites.Internal
  defp resolve_suite("bfcl"), do: CMDCEval.Suites.BFCL

  defp resolve_suite(other) do
    # Module.concat/1 accepts the name with or without a leading "Elixir.",
    # so both "CMDCEval.Suites.Internal" and the documented
    # "Elixir.CMDCEval.Suites.Internal" resolve to the same module.
    # An existing-atom lookup is not usable here: the module may not be
    # loaded yet, in which case its atom may not exist.
    mod = Module.concat([other])
    if Code.ensure_loaded?(mod), do: mod, else: nil
  end

  # Adds `key: val` to the keyword list unless `val` is nil.
  defp maybe_put(kw, _key, nil), do: kw
  defp maybe_put(kw, key, val), do: Keyword.put(kw, key, val)

  # Prints the human-readable summary of a finished run through the Mix shell.
  # Reads `suite_name`, `model`, `report_path` and the `summary` fields of
  # `report`; `report_path` falls back to a placeholder when it is nil.
  defp print_summary(report) do
    s = report.summary

    Mix.shell().info("""

    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    Suite:   #{report.suite_name}
    Model:   #{report.model}
    Cases:   #{s.total}
    Pass:    #{s.pass}  (rate=#{s.pass_rate})
    Fail:    #{s.fail}
    Latency: avg=#{s.avg_latency_ms}ms total=#{s.total_latency_ms}ms
    Tokens:  in=#{s.total_tokens_in} out=#{s.total_tokens_out}
    Cost:    $#{s.total_cost_usd}
    Report:  #{report.report_path || "<not written>"}
    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    """)
  end

  # Prints the usage line to the Mix shell's error output, then raises a
  # RuntimeError carrying `msg`. Never returns.
  @spec raise_usage(String.t()) :: no_return()
  defp raise_usage(msg) do
    Mix.shell().error(
      "Usage: mix cmdc.eval --suite=<name> --model=<id> [--report=<path>] " <>
        "[--concurrency=<n>] [--timeout=<ms>]"
    )

    raise msg
  end
end