Skip to main content

lib/cmdc_eval/suites/bfcl.ex

defmodule CMDCEval.Suites.BFCL do
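  @moduledoc """
  Berkeley Function Calling Leaderboard v3 (BFCL) suite — placeholder implementation plus the fixtures loading path.

  ## Data source

  BFCL fixtures come from the public upstream repository:
  <https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard>

  v0.1 implementation:

  - **Fixture location**: `priv/bfcl/v3/<category>.jsonl` (gitignored, not shipped in the hex package)
  - **`mix cmdc.eval.fetch_bfcl`**: `git clone`s the public upstream repository and converts the format
  - **`cases/0` auto-load**: returns an empty list plus a warning when the fixtures directory is missing or yields no cases
  - **`assert/2`**: v0.1 simplification — only checks that the reply text mentions the expected tool name (case-insensitive) or contains the `ground_truth` text; it does not yet trace real tool calls or compare arguments

  > v0.1 is intentionally small in scope: 10 placeholder cases plus the complete fetch path.
  > More complex sub-categories (AST / executable / multi_turn, etc.) are left for v0.2.

  ## Limitations

  - Fixtures are not bundled (license and file-size considerations)
  - `mix cmdc.eval.fetch_bfcl` must be run manually before running the suite
  - v0.1 only covers the BFCL "simple" sub-category (v1.0 public fixtures)

  ## Alternatives

  If fetching fixtures fails, `CMDCEval.Suites.Internal` can be used for cmdc kernel
  self-verification; BFCL is mainly intended for cross-model / cross-agent-kernel comparison.
  """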

  require Logger

  @behaviour CMDCEval.Suite

  alias CMDCEval.Case

  # Identifier of this suite, returned as a constant string.
  @impl true
  def name do
    "bfcl"
  end

  # Returns the cases loaded from the fixtures directory.
  #
  # The directory is read from the `:bfcl_fixtures_dir` key of the `:cmdc_eval`
  # application environment at call time and falls back to "priv/bfcl/v3".
  # When no fixtures can be loaded a warning is logged and `[]` is returned,
  # so a missing fetch step does not crash the caller.
  @impl true
  def cases do
    fixtures_dir = Application.get_env(:cmdc_eval, :bfcl_fixtures_dir, "priv/bfcl/v3")

    case load_fixtures(fixtures_dir) do
      {:error, :no_fixtures} ->
        warn_missing_fixtures(fixtures_dir)
        []

      {:ok, loaded} ->
        loaded
    end
  end

  # Emits the "fixtures missing" warning, pointing at the fetch mix task.
  defp warn_missing_fixtures(fixtures_dir) do
    Logger.warning(
      "[CMDCEval.Suites.BFCL] fixtures 不存在 (#{fixtures_dir}),返回空 cases 列表;" <>
        "运行 `mix cmdc.eval.fetch_bfcl` 拉取 upstream fixtures"
    )
  end

  # Decides whether `reply` satisfies the expectation stored on the case.
  #
  #   * `%{tool_called: name}` - passes when the reply text mentions `name`,
  #     compared case-insensitively. This is the v0.1 simplification: a faithful
  #     BFCL check would inspect the traced tool calls instead of the text.
  #   * `%{contains: text}` - passes when the reply contains `text`
  #     (case-sensitive). A list of strings passes when any element is contained.
  #   * anything else - `false`.
  #
  # Expectations come from decoded JSON fixtures, so their values are not
  # guaranteed to be strings. The guards route non-string values to `false`
  # instead of letting `String.downcase/1` / `String.contains?/2` raise and
  # abort the whole evaluation run.
  @impl true
  def assert(%Case{expected: %{tool_called: name}}, reply)
      when is_binary(reply) and is_binary(name) do
    String.contains?(String.downcase(reply), String.downcase(name))
  end

  def assert(%Case{expected: %{contains: text}}, reply)
      when is_binary(reply) and is_binary(text) do
    String.contains?(reply, text)
  end

  def assert(%Case{expected: %{contains: patterns}}, reply)
      when is_binary(reply) and is_list(patterns) do
    # String.contains?/2 accepts a list of string patterns (any match wins);
    # a list holding non-string elements cannot be matched against text.
    Enum.all?(patterns, &is_binary/1) and String.contains?(reply, patterns)
  end

  def assert(_, _), do: false

  # This suite supplies no default tools; the list is always empty.
  @impl true
  def default_tools do
    []
  end

  # Estimates the cost of a run from its token counts.
  #
  # Input tokens are weighted 3.0 and output tokens 15.0 per one million
  # tokens; the result is rounded to 6 decimal places.
  # NOTE(review): the currency and the model these rates belong to are not
  # stated here - presumably USD; confirm against the pricing source.
  @impl true
  def cost_estimator(%{tokens_in: input_tokens, tokens_out: output_tokens}) do
    input_cost = input_tokens * 3.0
    output_cost = output_tokens * 15.0

    Float.round((input_cost + output_cost) / 1_000_000.0, 6)
  end

  # ==========================================================================
  # 私有 — fixtures 加载
  # ==========================================================================

  # Loads every `*.jsonl` file directly inside `dir` and concatenates the
  # cases parsed from each of them.
  #
  # Returns `{:ok, cases}` when at least one case was produced, and
  # `{:error, :no_fixtures}` when `dir` is not a directory or yields no cases.
  # `File.ls!/1` raises if the directory exists but cannot be listed.
  defp load_fixtures(dir) do
    expanded = Path.expand(dir)

    if File.dir?(expanded) do
      cases =
        expanded
        |> File.ls!()
        |> Enum.filter(&String.ends_with?(&1, ".jsonl"))
        # File.ls!/1 gives no ordering guarantee; sorting keeps the case order
        # (and therefore evaluation runs) reproducible across filesystems.
        |> Enum.sort()
        |> Enum.flat_map(fn fname -> parse_jsonl(Path.join(expanded, fname)) end)

      if cases == [], do: {:error, :no_fixtures}, else: {:ok, cases}
    else
      {:error, :no_fixtures}
    end
  end

  # Parses one JSON Lines file into a list of cases.
  #
  # Each line is trimmed, blank lines are skipped, and the remaining lines are
  # decoded with `Jason.decode!/1`, which raises on a malformed line. Rows that
  # `row_to_case/1` cannot convert come back as `nil` and are dropped, so the
  # result only ever contains real cases.
  defp parse_jsonl(path) do
    path
    |> File.stream!()
    |> Stream.map(&String.trim/1)
    |> Stream.reject(&(&1 == ""))
    |> Stream.map(&Jason.decode!/1)
    |> Stream.map(&row_to_case/1)
    # row_to_case/1 signals "not a usable row" with nil; keep those out of the
    # suite so callers never receive a nil in place of a case.
    |> Stream.reject(&is_nil/1)
    |> Enum.to_list()
  end

  # Converts one decoded fixture row into a case.
  #
  # A row must carry both "id" and "question"; any other term yields `nil`.
  # The whole row is kept as the case metadata.
  defp row_to_case(%{"id" => id, "question" => question} = row) do
    Case.new(id: id, input: question, expected: expected_from(row), metadata: row)
  end

  defp row_to_case(_), do: nil

  # Derives the expectation from a row: "function" takes precedence over
  # "ground_truth"; a row with neither key has no expectation (`nil`).
  defp expected_from(%{"function" => tool}), do: %{tool_called: tool}
  defp expected_from(%{"ground_truth" => truth}), do: %{contains: truth}
  defp expected_from(_), do: nil
end