lib/agent_sea/evaluate/metric/llm_judge.ex

defmodule AgentSea.Evaluate.Metric.LLMJudge do
  @moduledoc """
  Uses an LLM to score an output against a rubric — "LLM-as-judge". Runs over any
  `AgentSea.Provider` (so it can go through the gateway).

  Options:
    * `:provider`  — `{module, opts}` (required)
    * `:model`     — model id (or in the provider opts)
    * `:rubric`    — grading instructions (default: relevance/correctness)
    * `:threshold` — pass cutoff in [0,1] (default 0.5)

  The judge's reply is reduced to the first number it contains, clamped to
  `[0.0, 1.0]`. A reply without a number, or whose content is not text, scores
  `0.0`. A provider error yields `%{score: 0.0, passed: false}`.
  """

  @behaviour AgentSea.Evaluate.Metric

  @default_rubric "Rate how well the response satisfies the request and matches the expected answer."

  @doc """
  Returns the metric's name, `"llm_judge"`.
  """
  @impl true
  @spec name() :: String.t()
  def name, do: "llm_judge"

  @doc """
  Asks the configured provider to grade `example` and returns the verdict.

  `example` must have an `:output` key; `:input` and `:expected` are optional.
  `opts` must contain `:provider` as `{module, provider_opts}`; `module.complete/2`
  is called with the judge messages and the provider options.

  Returns `%{score: float, passed: boolean}` where `passed` is
  `score >= threshold`. When the provider returns `{:error, _}` the result is
  `%{score: 0.0, passed: false}`.

  Raises `KeyError` when `:provider` is missing from `opts` or `:output` is
  missing from `example`.
  """
  @impl true
  @spec evaluate(map(), keyword()) :: %{score: float(), passed: boolean()}
  def evaluate(example, opts) do
    {provider_mod, provider_opts} = Keyword.fetch!(opts, :provider)
    threshold = Keyword.get(opts, :threshold, 0.5)

    # Only set :model when one was actually configured; writing an explicit
    # `model: nil` could mask a default the provider would otherwise apply.
    call_opts =
      case Keyword.get(opts, :model) || Keyword.get(provider_opts, :model) do
        nil -> provider_opts
        model -> Keyword.put(provider_opts, :model, model)
      end

    messages = judge_messages(example, Keyword.get(opts, :rubric, @default_rubric))

    case provider_mod.complete(messages, call_opts) do
      {:ok, response} ->
        score = parse_score(response.content)
        %{score: score, passed: score >= threshold}

      # Best-effort: a failed judge call counts as a failed example rather
      # than aborting the caller.
      {:error, _reason} ->
        %{score: 0.0, passed: false}
    end
  end

  # Builds the system + user messages sent to the judge model.
  defp judge_messages(example, rubric) do
    system =
      "You are a strict evaluator. #{rubric} " <>
        "Respond with ONLY a single number between 0 and 1 (1 = perfect)."

    user =
      """
      Input: #{Map.get(example, :input, "")}
      Expected: #{inspect(Map.get(example, :expected))}
      Response: #{example.output}

      Score:
      """

    [%{role: :system, content: system}, %{role: :user, content: user}]
  end

  # Extracts the first number from the judge's reply and clamps it to [0.0, 1.0].
  # The second regex alternative accepts a bare leading decimal point (".5");
  # without it the "5" alone would match and clamp to a perfect 1.0.
  defp parse_score(content) when is_binary(content) do
    case Regex.run(~r/-?(?:\d+(?:\.\d+)?|\.\d+)/, content) do
      [number] ->
        case number |> with_leading_zero() |> Float.parse() do
          {value, _rest} -> value |> max(0.0) |> min(1.0)
          :error -> 0.0
        end

      _ ->
        0.0
    end
  end

  # Non-text content (e.g. nil) carries no score.
  defp parse_score(_content), do: 0.0

  # Float.parse/1 requires a digit before the decimal point.
  defp with_leading_zero("." <> _ = number), do: "0" <> number
  defp with_leading_zero("-." <> rest), do: "-0." <> rest
  defp with_leading_zero(number), do: number
end