defmodule Foundry.Context.SpecKitIndexBuilder do
@moduledoc """
Builds the spec-kit index by scanning ADRs, runbooks, findings, regulations, and other
documentation files. Extracts summaries, tags, and token counts.
Token estimation uses a conservative approximation: byte_size / 3.
English prose + Elixir identifiers average ~3.5 chars/token; dividing by 3
slightly overestimates (correct for warning thresholds).
"""
# Paths (relative to the project root) rejected during directory scans;
# these appear to be schema/fixture docs rather than indexable content.
@excluded_files ~w[
docs/project_context_schema.md
docs/spec_kit_index_schema.md
docs/mix_task_summary_schemas.md
docs/reference-project-fixture.md
docs/manifest-schema-draft.md
]
# Common English words ignored when deriving fallback tags in extract_tags/1.
@stop_words ~w[the a an is are for with by in on at to of and or not this that
it its be as from will must when if all any each per no]
@doc """
Builds the spec-kit index map for the project rooted at `project_root`.

Scans the known documentation locations (ADRs, runbooks, findings,
regulations, `AGENTS.md`, and usage rules) and returns a string-keyed map
of the extracted entries plus token-count bookkeeping for the whole index.
"""
@spec build(String.t()) :: map()
def build(project_root) do
  adrs = scan_dir(project_root, "docs/adrs/", "adr")
  runbooks = scan_dir(project_root, "docs/runbooks/", "runbook")
  findings = scan_dir(project_root, "docs/findings/", "finding")
  regulations = scan_dir(project_root, "docs/regulations/", "regulation")
  agents = scan_exact(project_root, "AGENTS.md", "agents", "AGENTS")
  usage_rules = scan_dir(project_root, ".foundry/usage_rules/", "usage_rules")

  # scan_dir/3 and scan_exact/4 both always return lists, so plain
  # concatenation suffices (no List.wrap/1 needed around `agents`).
  entries = adrs ++ runbooks ++ findings ++ regulations ++ agents ++ usage_rules
  token_count = estimate_tokens(entries)

  %{
    "index_token_count" => token_count,
    # Warn before the hard limit is hit: 45k of the 50k budget.
    "index_token_warn" => token_count > 45_000,
    "index_token_limit" => 50_000,
    "adrs" => adrs,
    "runbooks" => runbooks,
    "findings" => findings,
    "regulations" => regulations,
    "agents" => agents,
    "usage_rules" => usage_rules
  }
end
# ---------------------------------------------------------------------------
# Directory scanning
# ---------------------------------------------------------------------------
# Lists markdown files under `dir_path` (relative to `project_root`),
# skipping excluded schema docs, and extracts an index entry per file.
# Returns [] when the directory does not exist or cannot be listed.
defp scan_dir(project_root, dir_path, doc_type) do
  case File.ls(Path.join(project_root, dir_path)) do
    {:error, _reason} ->
      []

    {:ok, entries} ->
      for file <- entries,
          String.ends_with?(file, ".md"),
          relative = Path.join(dir_path, file),
          relative not in @excluded_files,
          # List.wrap/1 drops the nil produced for unreadable files.
          doc <- List.wrap(extract_document(project_root, relative, doc_type)) do
        doc
      end
  end
end
# Indexes a single well-known file (e.g. AGENTS.md). Returns a one-element
# list on success and [] when the file is missing or unreadable.
#
# Note: extract_document_from_content/4 always builds a map, so the
# previous nil -> List.wrap -> filter chain guarded a case that cannot
# occur; a direct case expression is equivalent and clearer.
defp scan_exact(project_root, file_path, doc_type, title) do
  case File.read(Path.join(project_root, file_path)) do
    {:ok, content} ->
      [extract_document_from_content(file_path, content, doc_type, title)]

    {:error, _reason} ->
      []
  end
end
# ---------------------------------------------------------------------------
# Document extraction
# ---------------------------------------------------------------------------
# Reads `path` (relative to the project root) and extracts an index entry,
# or nil when the file cannot be read.
defp extract_document(project_root, path, doc_type) do
  case File.read(Path.join(project_root, path)) do
    {:ok, content} ->
      # Fallback title is the filename without its ".md" extension, e.g.
      # "ADR-001-stack-selection.md" -> "ADR-001-stack-selection". A leading
      # "# " heading in the content takes precedence downstream. (The old
      # String.slice over the already-stripped basename was a no-op:
      # Path.basename(path, ".md") is exactly length(basename) - 3.)
      title = Path.basename(path, ".md")
      extract_document_from_content(path, content, doc_type, title)

    {:error, _reason} ->
      nil
  end
end
# Assembles one index-entry map from raw markdown `content`.
# `fallback_title` is used only when the document has no "# " heading.
defp extract_document_from_content(path, content, doc_type, fallback_title) do
  doc_id = extract_doc_id(path, content)
  heading = extract_heading_title(content) || fallback_title

  %{
    id: doc_id,
    path: path,
    title: normalize_title(heading, doc_id),
    type: doc_type,
    status: extract_metadata_field(content, "Status"),
    date: extract_metadata_field(content, "Date"),
    summary: extract_summary(content),
    tags: extract_tags(content)
  }
end
# Document id: the first ADR-style id found in the content, then in the
# path, falling back to the filename without its ".md" extension.
defp extract_doc_id(path, content) do
  extract_adr_id(content) || extract_adr_id(path) || Path.basename(path, ".md")
end
# Finds the first "ADR-NNN" identifier (any case) in `text`, upcased,
# or nil when absent. Tolerates nil input.
defp extract_adr_id(text) do
  ~r/\bADR-\d{3}\b/i
  |> Regex.run(text || "")
  |> case do
    [match] -> String.upcase(match)
    nil -> nil
  end
end
# Returns the trimmed text of the first level-1 markdown heading
# ("# Title") anywhere in `content`, or nil when there is none.
defp extract_heading_title(content) do
  with [_full, heading] <- Regex.run(~r/^#\s+(.+)$/m, content) do
    String.trim(heading)
  else
    _ -> nil
  end
end
# Trims `title` and strips a redundant leading "ID: " / "ID - " prefix
# (case-insensitive) when the document id is known.
defp normalize_title(nil, _id), do: nil

defp normalize_title(title, nil), do: String.trim(title)

defp normalize_title(title, id) do
  id_prefix = ~r/^#{Regex.escape(id)}\s*[:\-]\s*/i

  title
  |> String.trim()
  |> String.replace(id_prefix, "")
  |> String.trim()
end
# Reads a bold metadata line of the form "**Field:** value" (whole line,
# case-insensitive) and returns the trimmed value, or nil when absent.
defp extract_metadata_field(content, field) do
  ~r/^\*\*#{Regex.escape(field)}:\*\*\s*(.+?)\s*$/mi
  |> Regex.run(content)
  |> case do
    [_full, value] -> String.trim(value)
    nil -> nil
  end
end
# ---------------------------------------------------------------------------
# Summary and tag extraction
# ---------------------------------------------------------------------------
# Picks the first prose paragraph (skipping headings, code fences, quotes,
# rules, and bold metadata lines) and truncates it to two sentences.
defp extract_summary(content) do
  content
  |> String.split("\n\n")
  |> Enum.drop_while(&skip_paragraph?/1)
  |> List.first()
  |> truncate_summary()
end
# True for paragraphs that are not prose: blank text, headings, code
# fences, blockquotes, horizontal rules, and bold metadata lines.
defp skip_paragraph?(text) do
  stripped = String.trim(text)
  stripped == "" or String.starts_with?(stripped, ["#", "```", ">", "---", "**"])
end
# Keeps at most the first two sentences, hard-capped at 300 characters
# (297 plus a "..." ellipsis). Passes nil through unchanged.
defp truncate_summary(nil), do: nil

defp truncate_summary(text) do
  summary =
    text
    |> String.split(~r/(?<=[.!?])\s+/, parts: 3)
    |> Enum.take(2)
    |> Enum.join(" ")

  if String.length(summary) <= 300 do
    summary
  else
    String.slice(summary, 0, 297) <> "..."
  end
end
# Tags for a document: an explicit "**Tags:**" line wins; otherwise up to
# 12 keywords are derived from the text (lowercased, punctuation stripped,
# stop words and words under 3 chars removed, deduplicated, alphabetical).
defp extract_tags(text) do
  case explicit_tags(text) do
    [] ->
      text
      |> String.downcase()
      |> String.replace(~r/[^a-z0-9\s]/, " ")
      |> String.split(~r/\s+/, trim: true)
      |> Enum.reject(&(&1 in @stop_words or String.length(&1) < 3))
      |> Enum.uniq()
      |> Enum.sort()
      |> Enum.take(12)

    tags ->
      tags
  end
end

# Parses a "**Tags:** a, b | c" metadata line into a list of trimmed,
# non-empty tag strings; [] when the line is absent.
defp explicit_tags(text) do
  case Regex.run(~r/^\*\*Tags:\*\*\s*(.+?)\s*$/mi, text) do
    [_full, raw] ->
      raw
      |> String.split(~r/[,\|]/, trim: true)
      |> Enum.map(&String.trim/1)
      |> Enum.reject(&(&1 == ""))

    nil ->
      []
  end
end
# ---------------------------------------------------------------------------
# Token counting
# ---------------------------------------------------------------------------
# Rough token count for the serialized index: JSON byte size divided by 3
# (deliberately conservative; see the rationale in @moduledoc).
defp estimate_tokens(entries) do
  div(byte_size(Jason.encode!(entries)), 3)
end
end