Skip to main content

lib/hexpm_mcp/hexdocs.ex

defmodule HexpmMcp.HexDocs do
  @moduledoc """
  Client for fetching and parsing hexdocs.pm content.

  HexDocs has no structured API. This module:
  1. Fetches the search-data JSON (used by the docs JS search)
  2. Falls back to scraping the sidebar HTML for module listings
  3. Converts individual module pages from HTML to markdown
  """

  alias HexpmMcp.Cache

  @base_url "https://hexdocs.pm"

  @doc """
  Get the README content for a package, converted to markdown.
  """
  def get_readme(name, version \\ nil) do
    url = docs_url(name, version, "readme.html")

    Cache.fetch({:readme, name, version}, docs_ttl(), fn ->
      case fetch_html(url) do
        {:ok, html} -> {:ok, html_to_markdown(html)}
        error -> error
      end
    end)
  end

  @doc """
  Get the module listing for a package (table of contents).
  """
  def get_modules(name, version \\ nil) do
    Cache.fetch({:modules, name, version}, docs_ttl(), fn ->
      case fetch_search_data(name, version) do
        {:ok, data} -> {:ok, parse_module_list(data)}
        {:error, _} -> fetch_sidebar_modules(name, version)
      end
    end)
  end

  defp parse_module_list(data) do
    data
    |> Enum.filter(fn item -> item["type"] in ["module", "behaviour", "protocol"] end)
    |> Enum.map(fn item ->
      %{name: item["title"], type: item["type"], doc: item["doc"] || ""}
    end)
    |> Enum.sort_by(& &1.name)
  end

  @doc """
  Get documentation for a specific module or function.
  """
  def get_doc_item(name, module, version \\ nil) do
    path = String.replace(module, ".", "/")

    Cache.fetch({:doc_item, name, module, version}, docs_ttl(), fn ->
      with {:error, _} <- fetch_doc_page(name, version, "#{module}.html") do
        fetch_doc_page(name, version, "#{path}.html")
      end
    end)
  end

  defp fetch_doc_page(name, version, page) do
    case fetch_html(docs_url(name, version, page)) do
      {:ok, html} -> {:ok, html_to_markdown(html)}
      error -> error
    end
  end

  @doc """
  Search within a package's documentation by name.
  """
  def search_docs(name, query, version \\ nil) do
    case fetch_search_data(name, version) do
      {:ok, data} ->
        query_down = String.downcase(query)

        results =
          data
          |> Enum.filter(fn item ->
            title = String.downcase(item["title"] || "")
            doc = String.downcase(item["doc"] || "")
            String.contains?(title, query_down) or String.contains?(doc, query_down)
          end)
          |> Enum.take(20)

        {:ok, results}

      error ->
        error
    end
  end

  # Fetch structured module/function data from the sidebar_items JS file.
  # HexDocs embeds all search data in a `sidebar_items-{hash}.js` file
  # that assigns to `sidebarNodes`. We parse out the JSON.
  defp fetch_search_data(name, version) do
    url = docs_url(name, version, "api-reference.html")

    with {:ok, html} <- fetch_html(url),
         {:ok, sidebar_url} <- extract_sidebar_url(html, name, version),
         {:ok, nodes} <- fetch_sidebar_nodes(sidebar_url) do
      {:ok, flatten_sidebar_nodes(nodes)}
    end
  end

  defp extract_sidebar_url(html, name, version) do
    case Floki.parse_document(html) do
      {:ok, doc} -> find_sidebar_script(doc, name, version)
      _ -> :error
    end
  end

  defp find_sidebar_script(doc, name, version) do
    doc
    |> Floki.find("script[src]")
    |> Enum.find_value(:error, fn {_, attrs, _} ->
      src = get_attr(attrs, "src")
      if String.contains?(src, "sidebar_items"), do: {:ok, absolute_url(src, name, version)}
    end)
  end

  defp absolute_url(src, name, version) do
    if String.starts_with?(src, "http"), do: src, else: docs_url(name, version, src)
  end

  defp fetch_sidebar_nodes(url) do
    case Req.get(url: url, headers: [{"user-agent", "hexpm-mcp"}]) do
      {:ok, %Req.Response{status: 200, body: body}} when is_binary(body) ->
        # Body is: sidebarNodes={...JSON...}
        json_str = String.replace(body, ~r/^sidebarNodes=/, "")

        case Jason.decode(json_str) do
          {:ok, data} -> {:ok, data}
          _ -> {:error, :search_data_unavailable}
        end

      _ ->
        {:error, :search_data_unavailable}
    end
  end

  # Flatten the sidebar nodes structure into a list of items with title/type/doc
  defp flatten_sidebar_nodes(data) when is_map(data) do
    modules = Map.get(data, "modules", [])
    extras = Map.get(data, "extras", [])

    module_items =
      Enum.map(modules, fn mod ->
        %{
          "title" => mod["title"] || mod["id"],
          "type" => classify_module(mod),
          "doc" => extract_node_doc(mod)
        }
      end)

    extra_items =
      Enum.map(extras, fn extra ->
        %{
          "title" => extra["title"] || extra["id"],
          "type" => "extra",
          "doc" => ""
        }
      end)

    module_items ++ extra_items
  end

  defp flatten_sidebar_nodes(_), do: []

  defp classify_module(%{"group" => "Behaviours"}), do: "behaviour"
  defp classify_module(%{"group" => "Protocols"}), do: "protocol"
  defp classify_module(_), do: "module"

  defp extract_node_doc(%{"nodeGroups" => groups}) when is_list(groups) do
    Enum.map_join(groups, ", ", fn g -> "#{g["name"]}: #{length(g["nodes"] || [])}" end)
  end

  defp extract_node_doc(_), do: ""

  defp get_attr(attrs, attr_name) do
    case List.keyfind(attrs, attr_name, 0) do
      {_, value} -> value
      nil -> ""
    end
  end

  defp fetch_sidebar_modules(name, version) do
    url = docs_url(name, version, "api-reference.html")

    with {:ok, html} <- fetch_html(url),
         {:ok, doc} <- Floki.parse_document(html) do
      modules =
        doc
        |> Floki.find("#modules .summary-row a, section.details-list .summary-row a")
        |> Enum.map(fn node ->
          %{
            name: Floki.text(node) |> String.trim(),
            type: "module",
            doc: ""
          }
        end)
        |> Enum.reject(fn m -> m.name == "" end)
        |> Enum.uniq_by(& &1.name)

      {:ok, modules}
    else
      _ -> {:error, :parse_failed}
    end
  end

  defp fetch_html(url) do
    case Req.get(url: url, headers: [{"user-agent", "hexpm-mcp"}]) do
      {:ok, %Req.Response{status: 200, body: body}} when is_binary(body) ->
        {:ok, body}

      {:ok, %Req.Response{status: 404}} ->
        {:error, :not_found}

      {:ok, %Req.Response{status: status}} ->
        {:error, {:http_error, status}}

      {:error, reason} ->
        {:error, {:request_failed, reason}}
    end
  end

  @doc """
  Convert HTML content to a simplified markdown representation.

  Targets the `#content` area of hexdocs pages and strips navigation,
  search, and footer chrome.
  """
  def html_to_markdown(html) when is_binary(html) do
    case Floki.parse_document(html) do
      {:ok, doc} ->
        doc
        |> extract_content()
        |> strip_chrome()
        |> node_to_markdown()
        |> clean_whitespace()

      _ ->
        strip_tags(html)
    end
  end

  defp extract_content(doc) do
    case Floki.find(doc, "#content") do
      [node | _] ->
        node

      [] ->
        case Floki.find(doc, "#moduledoc, .content-inner") do
          [node | _] -> node
          [] -> doc
        end
    end
  end

  @chrome_selectors [
    "nav",
    "#sidebar",
    "#top-content .heading-with-actions .icon-action",
    ".search-input",
    ".autocomplete",
    "footer",
    ".hover-link",
    "#toast",
    "button#sidebar-menu"
  ]

  defp strip_chrome(node) do
    Enum.reduce(@chrome_selectors, node, fn selector, acc ->
      remove_nodes(acc, selector)
    end)
  end

  defp remove_nodes(node, selector) do
    to_remove = Floki.find([node], selector)
    Enum.reduce(to_remove, node, fn target, acc -> remove_node(acc, target) end)
  end

  defp remove_node(nodes, target) when is_list(nodes) do
    Enum.flat_map(nodes, fn node ->
      if node == target, do: [], else: [remove_node(node, target)]
    end)
  end

  defp remove_node({tag, attrs, children}, target) do
    {tag, attrs, remove_node(children, target)}
  end

  defp remove_node(other, _target), do: other

  defp clean_whitespace(text) do
    text
    |> String.replace(~r/\n{3,}/, "\n\n")
    |> String.trim()
  end

  defp strip_tags(html) do
    html
    |> String.replace(~r/<[^>]+>/, " ")
    |> String.replace(~r/\s+/, " ")
    |> String.trim()
  end

  # Node-to-markdown conversion

  defp node_to_markdown(nodes) when is_list(nodes) do
    Enum.map_join(nodes, "", &node_to_markdown/1)
  end

  defp node_to_markdown(text) when is_binary(text), do: text

  defp node_to_markdown({"h1", _attrs, children}) do
    "\n# #{children |> node_to_markdown() |> String.trim()}\n\n"
  end

  defp node_to_markdown({"h2", _attrs, children}) do
    "\n## #{children |> node_to_markdown() |> String.trim()}\n\n"
  end

  defp node_to_markdown({"h3", _attrs, children}) do
    "\n### #{children |> node_to_markdown() |> String.trim()}\n\n"
  end

  defp node_to_markdown({"h4", _attrs, children}) do
    "\n#### #{children |> node_to_markdown() |> String.trim()}\n\n"
  end

  defp node_to_markdown({"p", _attrs, children}) do
    "#{children |> node_to_markdown() |> String.trim()}\n\n"
  end

  defp node_to_markdown({"pre", _attrs, children}) do
    lang = extract_code_lang(children)
    code = children |> node_to_markdown() |> String.trim()
    "\n```#{lang}\n#{code}\n```\n\n"
  end

  defp node_to_markdown({"code", _attrs, children}) do
    "`#{node_to_markdown(children)}`"
  end

  defp node_to_markdown({"strong", _attrs, children}), do: "**#{node_to_markdown(children)}**"
  defp node_to_markdown({"em", _attrs, children}), do: "*#{node_to_markdown(children)}*"

  defp node_to_markdown({"a", attrs, children}) do
    href = get_attr(attrs, "href")
    text = node_to_markdown(children)
    if href == "" or text == "", do: text, else: "[#{text}](#{href})"
  end

  defp node_to_markdown({"ul", _attrs, children}), do: node_to_markdown(children) <> "\n"
  defp node_to_markdown({"ol", _attrs, children}), do: node_to_markdown(children) <> "\n"

  defp node_to_markdown({"li", _attrs, children}) do
    "- #{children |> node_to_markdown() |> String.trim()}\n"
  end

  # Table support
  defp node_to_markdown({"table", _attrs, children}) do
    "\n" <> table_to_markdown(children) <> "\n"
  end

  defp node_to_markdown({"blockquote", _attrs, children}) do
    children
    |> node_to_markdown()
    |> String.trim()
    |> String.split("\n")
    |> Enum.map_join("\n", &"> #{&1}")
    |> Kernel.<>("\n\n")
  end

  defp node_to_markdown({"br", _attrs, _children}), do: "\n"
  defp node_to_markdown({"hr", _attrs, _children}), do: "\n---\n\n"

  # Skip chrome elements
  defp node_to_markdown({"script", _attrs, _children}), do: ""
  defp node_to_markdown({"style", _attrs, _children}), do: ""
  defp node_to_markdown({"nav", _attrs, _children}), do: ""
  defp node_to_markdown({"footer", _attrs, _children}), do: ""
  defp node_to_markdown({"button", _attrs, _children}), do: ""
  defp node_to_markdown({"input", _attrs, _children}), do: ""

  # Section and div -- pass through to children
  defp node_to_markdown({_tag, _attrs, children}), do: node_to_markdown(children)

  defp node_to_markdown(_), do: ""

  # Table conversion helpers

  defp table_to_markdown(nodes) do
    rows =
      nodes
      |> Floki.find("tr")
      |> Enum.map(fn {"tr", _, cells} ->
        Enum.map(cells, fn {_tag, _attrs, children} ->
          children |> node_to_markdown() |> String.trim()
        end)
      end)

    case rows do
      [] -> ""
      [header | data] -> format_md_table(header, data)
    end
  end

  defp format_md_table(header, data) do
    separator = Enum.map(header, fn _ -> "---" end)

    [header, separator | data]
    |> Enum.map_join("\n", fn row -> "| " <> Enum.join(row, " | ") <> " |" end)
    |> Kernel.<>("\n")
  end

  @known_langs ~w(elixir erlang html shell bash json sql javascript ruby python)

  defp extract_code_lang([{"code", attrs, _} | _]) do
    class = get_attr(attrs, "class")
    Enum.find(@known_langs, "", &String.contains?(class, &1))
  end

  defp extract_code_lang(_), do: ""

  # Build a hexdocs URL. When version is nil, omit it (hexdocs serves latest at /pkg/page.html).
  # When version is set, include it: /pkg/version/page.html
  defp docs_url(name, nil, page), do: "#{base_url()}/#{name}/#{page}"
  defp docs_url(name, version, page), do: "#{base_url()}/#{name}/#{version}/#{page}"

  defp base_url, do: Application.get_env(:hexpm_mcp, :hexdocs_url, @base_url)

  defp docs_ttl, do: Application.get_env(:hexpm_mcp, :docs_cache_ttl, 3600)
end