# lib/omni/tools/web_fetch.ex

defmodule Omni.Tools.WebFetch do
  @moduledoc """
  An `Omni.Tool` for fetching and simplifying web content.

  Fetches one or more URLs, extracts content appropriate for LLM
  consumption (HTML to Markdown, pretty-printed JSON, plain text
  passthrough), and returns the results as a string.

      tool = Omni.Tools.WebFetch.new()
      tool = Omni.Tools.WebFetch.new(max_output: 30_000, timeout: 10_000)

  ## Strategies

  Content extraction is handled by pluggable strategies. Each strategy
  implements `Omni.Tools.WebFetch.Strategy` and declares which URLs it
  handles via `match?/2`. Strategies are tried in order — the first
  match wins.

  Three strategies are always active, appended after any user-provided
  strategies:

  - **GitHub** — matches `github.com` blob URLs, redirects to
    `raw.githubusercontent.com` for direct file content.
  - **Reddit** — matches `*.reddit.com`, fetches via Reddit's JSON API,
    formats posts and comments as readable Markdown.
  - **Default** — catch-all that handles HTML (→ Markdown), JSON
    (→ pretty-printed), plain text (→ passthrough), and binary (→ metadata).

  Custom strategies are prepended, so they take priority over the
  built-ins. To override the built-in GitHub handling, for example,
  provide your own strategy that matches `github.com` first:

      tool = Omni.Tools.WebFetch.new(strategies: [{MyApp.GitHubStrategy, []}])

  ## Custom Req

  Pass a pre-configured `Req.Request` struct to control the HTTP
  transport. This is useful for attaching middleware, setting
  authentication, or replacing the transport layer entirely.

      req = Req.new() |> MyApp.Auth.attach()
      tool = Omni.Tools.WebFetch.new(req: req)

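  ## Options

  Options are merged in order of increasing precedence: the built-in
  defaults listed below, then any `config :omni_tools, Omni.Tools.WebFetch`
  application environment entry, then the options passed by the caller.

  - `:req` — base `Req.Request` struct. Default: `Req.new()`.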
  - `:strategies` — list of strategy modules or `{module, opts}` tuples.
    Default: `[]`.
  - `:max_output` — per-URL content truncation limit in bytes. Set to
    `:infinity` to disable truncation. Default: `100_000`.
  - `:max_urls` — maximum number of URLs per batch call. Default: `10`.
  - `:timeout` — HTTP receive timeout in milliseconds. Default: `15_000`.
  """

  use Omni.Tool, name: "web_fetch"

  alias Omni.Tools.WebFetch.{Fetcher, Strategy}
  alias Omni.Tools.WebFetch.Strategy.{Default, GitHub, Reddit}

  # Built-in option defaults. `init/1` merges the application environment
  # and then the caller's options on top of these. `:req` has no entry here
  # because its default (`Req.new()`) is built inside `init/1`.
  @defaults [
    strategies: [],
    max_output: 100_000,
    max_urls: 10,
    timeout: 15_000
  ]

  # Builds the tool state from the caller's options.
  #
  # Precedence, lowest to highest: `@defaults`, the application environment
  # entry for this module under `:omni_tools`, then `opts` (a `nil` `opts`
  # is treated as an empty list).
  #
  # Returns a keyword list with `:req`, `:strategies`, `:max_output`,
  # `:max_urls` and `:timeout`. User-supplied strategies come first, followed
  # by the built-in GitHub, Reddit and Default strategies.
  #
  # Raises `ArgumentError` when `:req` is not a `Req.Request` struct.
  @impl Omni.Tool
  def init(opts) do
    app_config = Application.get_env(:omni_tools, __MODULE__, [])

    merged =
      @defaults
      |> Keyword.merge(app_config)
      |> Keyword.merge(opts || [])

    request =
      case Keyword.get(merged, :req, Req.new()) do
        candidate when is_struct(candidate, Req.Request) ->
          candidate

        other ->
          raise ArgumentError, ":req must be a %Req.Request{} struct, got: #{inspect(other)}"
      end

    # Built-ins go last so that caller-provided strategies are matched first.
    built_ins = [{GitHub, []}, {Reddit, []}, {Default, []}]
    strategies = Strategy.resolve(Keyword.fetch!(merged, :strategies)) ++ built_ins

    limits =
      Enum.map([:max_output, :max_urls, :timeout], fn key ->
        {key, Keyword.fetch!(merged, key)}
      end)

    [req: request, strategies: strategies] ++ limits
  end

  # Input schema for the tool: an object with two optional properties,
  # `url` (a single string) and `urls` (an array of strings). Neither is
  # marked as required here.
  @impl Omni.Tool
  def schema(_state) do
    import Omni.Schema

    single_url = string(description: "URL to fetch")
    url_batch = array(string(), description: "Multiple URLs to fetch concurrently")

    object(%{url: single_url, urls: url_batch}, required: [])
  end

  # Builds the tool description text from the configured limits.
  #
  # Reads `:max_urls` and `:max_output` from `state` (both must be present;
  # `Keyword.fetch!/2` raises otherwise). The truncation bullet is omitted
  # when `:max_output` is `:infinity`. Limits below 1000 bytes are reported
  # in bytes, because integer division by 1000 would otherwise render them
  # as a misleading "~0KB".
  #
  # Returns the description as a string without a trailing newline.
  @impl Omni.Tool
  def description(state) do
    max_urls = Keyword.fetch!(state, :max_urls)

    truncation_line =
      case Keyword.fetch!(state, :max_output) do
        :infinity ->
          nil

        bytes when is_integer(bytes) and bytes < 1_000 ->
          "- Content is truncated to ~#{bytes} bytes per URL"

        bytes ->
          "- Content is truncated to ~#{div(bytes, 1_000)}KB per URL"
      end

    output_lines =
      [
        "- HTML pages are converted to Markdown with boilerplate removed",
        "- JSON responses are pretty-printed",
        truncation_line,
        "- Batch results are separated with URL headers"
      ]
      |> Enum.reject(&is_nil/1)
      |> Enum.join("\n")

    # The trailing backslash on the last heredoc line suppresses the final
    # newline.
    """
    Fetch content from web URLs.

    ## Capabilities
    - Fetches web pages and converts HTML to clean Markdown
    - Fetches JSON APIs and returns pretty-printed JSON
    - Handles plain text, with metadata for binary formats (PDF, images, etc.)
    - Supports single URL or batch fetch (up to #{max_urls} URLs)

    ## Parameters
    - `url` — single URL to fetch (string)
    - `urls` — array of URLs to fetch concurrently (max #{max_urls})
    - Provide exactly one of `url` or `urls`

    ## Output
    #{output_lines}

    ## Limitations
    - Binary formats (PDF, DOCX, images) return metadata only, not extracted text
    - Some sites may block automated requests\
    """
  end

  # Resolves the requested URLs from `input`, then hands them to
  # `Fetcher.fetch/3` along with the configured strategies and the full
  # tool state. Invalid input raises from `resolve_urls/2` before any
  # fetch is attempted.
  @impl Omni.Tool
  def call(input, state) do
    input
    |> resolve_urls(state)
    |> Fetcher.fetch(Keyword.fetch!(state, :strategies), state)
  end

  # ── URL resolution ───────────────────────────────────────────────

  # Normalizes the tool input into the list of URLs to fetch.
  #
  # Accepted shapes (atom keys):
  #   - `%{url: "..."}`            — a single non-empty string
  #   - `%{urls: [...]}`           — a non-empty list
  #   - both keys valid            — `url` is placed in front of `urls`
  #
  # If only one of the two keys holds a usable value, the other is ignored.
  # Every resulting entry must be a non-empty string. Duplicates are removed
  # (first occurrence wins) in all cases, so a repeated URL is fetched once
  # and counts once against the `:max_urls` limit.
  #
  # Raises `RuntimeError` when no usable URL is given, when an entry is not
  # a non-empty string, or when the de-duplicated count exceeds `:max_urls`.
  defp resolve_urls(input, state) do
    candidates =
      case input do
        %{url: url, urls: urls}
        when is_binary(url) and url != "" and is_list(urls) and urls != [] ->
          [url | urls]

        %{urls: urls} when is_list(urls) and urls != [] ->
          urls

        %{url: url} when is_binary(url) and url != "" ->
          [url]

        _ ->
          raise "provide either `url` (string) or `urls` (array of strings)"
      end

    # Reject bad entries here rather than letting them fail deep inside the
    # fetcher with a less helpful error.
    if not Enum.all?(candidates, &(is_binary(&1) and &1 != "")) do
      raise "`urls` must contain only non-empty strings"
    end

    urls = Enum.uniq(candidates)
    limit = Keyword.fetch!(state, :max_urls)
    count = length(urls)

    if count > limit do
      raise "too many URLs: #{count} (max #{limit})"
    end

    urls
  end
end