examples/streaming.exs

# examples/streaming.exs
#
# True streaming TTS — get audio chunks as they're generated.
#
# Usage:
#   mix run examples/streaming.exs
#   mix run examples/streaming.exs --text "Long text for streaming"
#   mix run examples/streaming.exs --text "你好世界" --device cpu

{opts, _args, _invalid} =
  OptionParser.parse(System.argv(),
    switches: [
      text: :string,
      device: :string,
      output: :string,
      steps: :integer,
      cfg: :float
    ]
  )

text = opts[:text] || "Streaming synthesis delivers audio chunks as they are generated. " <>
                      "This means you can start playback immediately, " <>
                      "without waiting for the entire utterance to complete."
device = opts[:device] || "cuda"
output = opts[:output] || "streaming.wav"
steps = opts[:steps] || 10
cfg = opts[:cfg] || 2.0

IO.puts("==> Streaming TTS")
IO.puts("==> Text: #{String.slice(text, 0, 60)}...")
IO.puts("==> Device: #{device}")

{:ok, pid} = VoxCPMEx.start_link(device: device)
:ok = VoxCPMEx.await_ready(pid, 120_000)
IO.puts("==> Model ready!")

# Start async streaming
IO.puts("==> Starting stream...")
{:ok, ref} = VoxCPMEx.generate_streaming_async(pid, text,
  inference_timesteps: steps,
  cfg_value: cfg
)

# Collect chunks
chunks =
  Stream.unfold(ref, fn ref ->
    case VoxCPMEx.next_chunk(pid, ref) do
      {:ok, chunk} ->
        IO.puts("  chunk ##{length([])}: #{byte_size(chunk)} bytes")
        # For real-time playback, you'd send chunk to audio output here
        {chunk, ref}
      :eos ->
        IO.puts("  stream complete")
        nil
      {:error, reason} ->
        IO.puts(:stderr, "  error: #{reason}")
        nil
    end
  end)
  |> Enum.to_list()

IO.puts("==> Total chunks: #{length(chunks)}")
IO.puts("==> Done! ⚡")