Skip to main content

lib/agent_sea/ingest/chunker.ex

defmodule AgentSea.Ingest.Chunker do
  @moduledoc """
  Splits text into overlapping word windows. Overlap preserves context across
  chunk boundaries (so a fact split between two chunks still embeds coherently
  in at least one).
  """

  @default_size 120
  @default_overlap 20

  @doc """
  Chunk `text` into word windows.

  Options: `:size` (words per chunk, default #{@default_size}) and `:overlap`
  (words shared with the previous chunk, default #{@default_overlap}).
  """
  @spec chunk(String.t(), keyword()) :: [String.t()]
  def chunk(text, opts \\ []) do
    size = Keyword.get(opts, :size, @default_size)
    overlap = Keyword.get(opts, :overlap, @default_overlap)
    step = max(size - overlap, 1)

    text
    |> String.split(~r/\s+/u, trim: true)
    |> windows(size, step, [])
  end

  defp windows([], _size, _step, acc), do: Enum.reverse(acc)

  defp windows(words, size, step, acc) do
    chunk = words |> Enum.take(size) |> Enum.join(" ")
    acc = [chunk | acc]

    if length(words) <= size do
      Enum.reverse(acc)
    else
      windows(Enum.drop(words, step), size, step, acc)
    end
  end
end