Skip to main content

lib/agent_sea/voice/openai.ex

defmodule AgentSea.Voice.OpenAI do
  @moduledoc """
  OpenAI voice adapter: text-to-speech (`/v1/audio/speech`) and speech-to-text
  (`/v1/audio/transcriptions`) over `Req`.

  Options: `:api_key` (defaults to `OPENAI_API_KEY`), `:model`, `:voice`,
  `:format`, `:base_url`, and `:adapter` (a Req adapter, used to stub HTTP in
  tests).
  """

  @behaviour AgentSea.Voice.TTS
  @behaviour AgentSea.Voice.STT

  @base_url "https://api.openai.com"

  @impl AgentSea.Voice.TTS
  def synthesize(text, opts) do
    body = %{
      model: opts[:model] || "tts-1",
      input: text,
      voice: opts[:voice] || "alloy",
      response_format: opts[:format] || "mp3"
    }

    case Req.post(req(opts), url: "/v1/audio/speech", json: body) do
      {:ok, %Req.Response{status: 200, body: audio}} when is_binary(audio) ->
        {:ok, %{audio: audio, format: opts[:format] || "mp3"}}

      {:ok, %Req.Response{status: status, body: resp}} ->
        {:error, {:http_error, status, resp}}

      {:error, reason} ->
        {:error, reason}
    end
  end

  @impl AgentSea.Voice.STT
  def transcribe(audio, opts) do
    multipart = [
      model: opts[:model] || "whisper-1",
      file: {audio, filename: "audio.mp3", content_type: "audio/mpeg"}
    ]

    case Req.post(req(opts), url: "/v1/audio/transcriptions", form_multipart: multipart) do
      {:ok, %Req.Response{status: 200, body: %{"text" => text}}} ->
        {:ok, %{text: text}}

      {:ok, %Req.Response{status: 200, body: body}} when is_binary(body) ->
        decode_text(body)

      {:ok, %Req.Response{status: status, body: resp}} ->
        {:error, {:http_error, status, resp}}

      {:error, reason} ->
        {:error, reason}
    end
  end

  defp req(opts) do
    api_key = opts[:api_key] || System.get_env("OPENAI_API_KEY")

    [
      base_url: opts[:base_url] || @base_url,
      headers: [{"authorization", "Bearer #{api_key || ""}"}]
    ]
    |> maybe_put(:adapter, opts[:adapter])
    |> Req.new()
  end

  defp maybe_put(kw, _key, nil), do: kw
  defp maybe_put(kw, key, value), do: Keyword.put(kw, key, value)

  defp decode_text(body) do
    case Jason.decode(body) do
      {:ok, %{"text" => text}} -> {:ok, %{text: text}}
      _ -> {:error, :invalid_response}
    end
  end
end