# examples/voice_cloning.exs
#
# Voice Cloning example — clone a voice from reference audio.
#
# Usage:
#   mix run examples/voice_cloning.exs --reference path/to/voice.wav
#   mix run examples/voice_cloning.exs --reference speaker.wav --text "你好世界"
#   mix run examples/voice_cloning.exs --reference speaker.wav --text "Bonjour!" --device mps
#
# For ultimate cloning (highest fidelity), also provide the transcript:
#   mix run examples/voice_cloning.exs --reference speaker.wav --prompt-text "transcript here"

# Parse command-line flags.
#
# `strict:` makes OptionParser report unknown switches (for example a typo
# such as `--step`) and values that fail type conversion (for example
# `--steps abc`) in the third tuple element, instead of the script silently
# falling back to its defaults. Positional arguments are still ignored.
{opts, _args, invalid} =
  OptionParser.parse(System.argv(),
    strict: [
      reference: :string,
      text: :string,
      prompt_text: :string,
      device: :string,
      output: :string,
      steps: :integer,
      cfg: :float,
      denoise: :boolean
    ]
  )

# Refuse to run with options that could not be understood. Each entry is
# `{flag, value}`; the value is nil for an unknown switch or a switch that
# is missing its argument.
unless invalid == [] do
  Enum.each(invalid, fn
    {flag, nil} ->
      IO.puts(:stderr, "Error: unknown or incomplete option: #{flag}")

    {flag, value} ->
      IO.puts(:stderr, "Error: invalid value #{inspect(value)} for option #{flag}")
  end)

  System.halt(1)
end

# Validate the mandatory --reference flag up front so that a bad path is
# reported immediately with a clear message and a non-zero exit status.
reference = opts[:reference]

cond do
  is_nil(reference) ->
    IO.puts(:stderr, "Error: --reference path/to/voice.wav is required")
    System.halt(1)

  not File.exists?(reference) ->
    IO.puts(:stderr, "Error: reference audio not found: #{reference}")
    System.halt(1)

  # File.exists?/1 is also true for directories, which cannot be used as
  # reference audio; reject them here rather than failing later on.
  File.dir?(reference) ->
    IO.puts(:stderr, "Error: reference audio is a directory, not a file: #{reference}")
    System.halt(1)

  true ->
    :ok
end

# Resolve the effective settings: every flag that was not given on the
# command line falls back to the default shown here. --prompt-text has no
# default and stays nil when omitted.
text = Keyword.get(opts, :text, "This is a cloned voice generated by VoxCPM2 through Elixir.")
device = Keyword.get(opts, :device, "cuda")
output = Keyword.get(opts, :output, "voice_clone.wav")
steps = Keyword.get(opts, :steps, 10)
cfg = Keyword.get(opts, :cfg, 2.0)
denoise = Keyword.get(opts, :denoise, false)
prompt_text = Keyword.get(opts, :prompt_text)

# Echo the effective settings before the model starts loading.
Enum.each(
  [
    "==> Voice Cloning",
    "==> Reference: #{reference}",
    "==> Text: #{text}",
    "==> Device: #{device}"
  ],
  &IO.puts/1
)

# The --denoise flag is forwarded as :load_denoiser so the denoiser is
# available when generation asks for it.
start_opts = [device: device, load_denoiser: denoise]
{:ok, pid} = VoxCPMEx.start_link(start_opts)

# Block until the model reports ready; any other result raises MatchError.
# NOTE(review): the timeout is presumably in milliseconds (2 minutes) — confirm.
ready_timeout = 120_000
:ok = VoxCPMEx.await_ready(pid, ready_timeout)
IO.puts("==> Model ready!")

# Options passed to generation in every mode: the reference audio plus the
# sampling settings resolved from the command line.
base_opts = [
  audio_prompt: reference,
  inference_timesteps: steps,
  cfg_value: cfg,
  denoise: denoise
]

# "Ultimate cloning": when a transcript was supplied, also pass the
# reference audio as :prompt_wav_path together with :prompt_text. These
# entries are placed in front of the base options.
transcript_opts =
  if prompt_text do
    IO.puts("==> Ultimate cloning mode (with transcript)")
    [prompt_wav_path: reference, prompt_text: prompt_text]
  else
    []
  end

gen_opts = transcript_opts ++ base_opts

# Synthesize the requested text. Any result other than {:ok, audio} raises
# MatchError and aborts the script.
{:ok, audio} = VoxCPMEx.generate(pid, text, gen_opts)

# Write the audio to the output path, then report the destination and the
# payload size (byte_size/1 requires `audio` to be a binary).
:ok = VoxCPMEx.save(audio, output)
size_in_bytes = byte_size(audio)
IO.puts("==> Audio saved to #{output} (#{size_in_bytes} bytes)")
IO.puts("==> Done! 🎙️")