defmodule Mix.Tasks.Cmdc.Eval do
  @shortdoc "Runs a cmdc_eval suite and outputs a JSONL report"

  @moduledoc """
  Runs a cmdc_eval suite, prints a summary and optionally writes a JSONL report.

  ## Usage

      $ mix cmdc.eval --suite=internal --model="anthropic:claude-sonnet-4-5"
      $ mix cmdc.eval --suite=bfcl --model="openai:gpt-4o" --report=bfcl.jsonl
      $ mix cmdc.eval --suite=internal --concurrency=8 --timeout=120000

  ## Options

    * `--suite=<name>` - required. Either one of the shortcuts `internal` /
      `bfcl`, or any fully qualified module name, with or without the
      `Elixir.` prefix (e.g. `CMDCEval.Suites.Internal` or
      `Elixir.CMDCEval.Suites.Internal`).
    * `--model=<id>` - required, e.g. `anthropic:claude-sonnet-4-5`.
    * `--report=<path>` - optional. Path of the JSONL report. When omitted,
      no report path is passed to `CMDCEval.run/1` (documented default: no
      file is written).
    * `--concurrency=<n>` - optional integer. Number of cases run
      concurrently. When omitted, `CMDCEval.run/1` picks its own default
      (documented as 4).
    * `--timeout=<ms>` - optional integer. Per-case timeout in milliseconds.
      When omitted, `CMDCEval.run/1` picks its own default (documented as
      60000).

  Missing required options, unknown switches and values of the wrong type
  (e.g. `--concurrency=abc`) print the usage line and raise.

  ## Exit codes

    * `0` - every case passed.
    * `1` - at least one case failed.
    * `2` - the suite produced no cases (e.g. BFCL fixtures were not fetched).
    * `3` - the suite module could not be loaded, or `CMDCEval.run/1`
      returned `{:error, reason}`.
  """

  use Mix.Task

  # Switches accepted by the task; anything else is rejected by OptionParser's strict mode.
  @switches [
    suite: :string,
    model: :string,
    report: :string,
    concurrency: :integer,
    timeout: :integer
  ]

  # Entry point invoked by Mix with the raw command-line arguments.
  #
  # Returns `:ok` when every case passed; otherwise terminates through
  # `exit({:shutdown, code})` (see "Exit codes" in the moduledoc) or raises a
  # `RuntimeError` for usage errors.
  @impl Mix.Task
  def run(argv) do
    opts = parse_args!(argv)
    suite_name = Keyword.get(opts, :suite) || raise_usage("missing --suite=<name>")
    model = Keyword.get(opts, :model) || raise_usage("missing --model=<id>")

    # A custom suite may live in the host project. When this task is loaded
    # from a dependency the project is not necessarily compiled or on the code
    # path yet, so compile before asking whether the module can be loaded.
    Mix.Task.run("compile")

    suite_module = resolve_suite(suite_name)

    if is_nil(suite_module) do
      Mix.shell().error("Suite 不存在: #{suite_name}")
      exit({:shutdown, 3})
    end

    Mix.Task.run("app.start")

    # Optional settings are only forwarded when given, so CMDCEval.run/1 keeps
    # control over its own defaults.
    run_opts =
      [suite: suite_module, model: model]
      |> maybe_put(:report_path, Keyword.get(opts, :report))
      |> maybe_put(:concurrency, Keyword.get(opts, :concurrency))
      |> maybe_put(:timeout_ms, Keyword.get(opts, :timeout))

    case CMDCEval.run(run_opts) do
      {:ok, report} ->
        print_summary(report)
        finish(report.summary)

      {:error, reason} ->
        Mix.shell().error("评测失败: #{inspect(reason)}")
        exit({:shutdown, 3})
    end
  end

  # Parses argv in strict mode and refuses to continue when OptionParser
  # reports unknown switches or badly typed values, instead of silently
  # dropping them and running with defaults. Positional arguments are ignored.
  defp parse_args!(argv) do
    case OptionParser.parse(argv, strict: @switches) do
      {opts, _args, []} ->
        opts

      {_opts, _args, invalid} ->
        raise_usage("invalid option(s): " <> Enum.map_join(invalid, ", ", &format_invalid/1))
    end
  end

  # Renders one entry of OptionParser's `invalid` list back into CLI form.
  defp format_invalid({switch, nil}), do: switch
  defp format_invalid({switch, value}), do: "#{switch}=#{value}"

  # Maps the summary onto the documented exit codes. An empty suite takes
  # precedence over the failure check.
  defp finish(summary) do
    cond do
      summary.total == 0 -> exit({:shutdown, 2})
      summary.fail > 0 -> exit({:shutdown, 1})
      true -> :ok
    end
  end

  # Resolves a `--suite` value to a module, or `nil` when it cannot be loaded.
  # The two shortcuts are returned as-is, without a load check.
  defp resolve_suite("internal"), do: CMDCEval.Suites.Internal
  defp resolve_suite("bfcl"), do: CMDCEval.Suites.BFCL

  defp resolve_suite(other) do
    # Module.concat/1 accepts the name with or without a leading "Elixir."
    # and never produces a doubled prefix. It creates at most one atom per
    # task invocation, which is bounded; String.to_existing_atom/1 would
    # wrongly reject modules that simply have not been loaded yet.
    mod = Module.concat([other])
    if Code.ensure_loaded?(mod), do: mod, else: nil
  end

  # Adds `key: val` to the keyword list unless `val` is nil.
  defp maybe_put(kw, _key, nil), do: kw
  defp maybe_put(kw, key, val), do: Keyword.put(kw, key, val)

  # Prints the human-readable summary block through the Mix shell.
  defp print_summary(report) do
    s = report.summary

    Mix.shell().info("""
    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    Suite: #{report.suite_name}
    Model: #{report.model}
    Cases: #{s.total}
    Pass: #{s.pass} (rate=#{s.pass_rate})
    Fail: #{s.fail}
    Latency: avg=#{s.avg_latency_ms}ms total=#{s.total_latency_ms}ms
    Tokens: in=#{s.total_tokens_in} out=#{s.total_tokens_out}
    Cost: $#{s.total_cost_usd}
    Report: #{report.report_path || "<not written>"}
    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
    """)
  end

  # Prints the usage line and raises a RuntimeError carrying `msg`; never returns.
  defp raise_usage(msg) do
    Mix.shell().error(
      "Usage: mix cmdc.eval --suite=<name> --model=<id> [--report=<path>] " <>
        "[--concurrency=<n>] [--timeout=<ms>]"
    )

    raise msg
  end
end