# lib/huggingface_client/hub/datasets/viewer.ex

defmodule HuggingfaceClient.Hub.DatasetViewer do
  @moduledoc """
  HuggingFace Dataset Viewer API.

  Provides programmatic access to dataset content, statistics, and structure
  without downloading entire datasets locally.

  See: https://huggingface.co/docs/dataset-viewer

  ## Example

      # Get dataset info (size, number of rows, features schema)
      {:ok, info} = HuggingfaceClient.dataset_viewer_info("rajpurkar/squad")

      # Get the first 10 rows of a dataset split
      {:ok, rows} = HuggingfaceClient.dataset_viewer_rows("rajpurkar/squad",
        split: "train",
        offset: 0,
        length: 10
      )

      # Get dataset statistics
      {:ok, stats} = HuggingfaceClient.dataset_viewer_statistics("rajpurkar/squad",
        config: "plain_text",
        split: "train"
      )
  """

  alias HuggingfaceClient.Error.HubApiError

  @viewer_base "https://datasets-server.huggingface.co"

  # ── Metadata ──────────────────────────────────────────────────────────────────

  @doc """
  Queries the Dataset Viewer `/is-valid` endpoint for `dataset_id`.

  On success the response body is handed back untouched, e.g.
  `{:ok, %{"preview" => bool, "viewer" => bool}}`. Note that despite the `?`
  suffix this function returns a tagged tuple, not a bare boolean.

  ## Options

  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, info} = HuggingfaceClient.dataset_viewer_valid?("rajpurkar/squad")
  """
  @spec valid?(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def valid?(dataset_id, opts \\ []) do
    query_params = [{"dataset", dataset_id}]
    get_viewer("/is-valid", query_params, opts)
  end

  @doc """
  Lists the configurations (subsets) of a dataset via the `/configs` endpoint.

  When the response body is a map with a `"configs"` key, the value under that
  key is returned. Any other successful body is wrapped with `List.wrap/1` so
  the success value is always a list. Errors are passed through unchanged.

  ## Options

  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, configs} = HuggingfaceClient.dataset_viewer_configs("glue")
      Enum.each(configs, fn c -> IO.puts(c["config_name"]) end)
  """
  @spec list_configs(String.t(), keyword()) :: {:ok, [map()]} | {:error, Exception.t()}
  def list_configs(dataset_id, opts \\ []) do
    # A non-`{:ok, _}` result falls straight through `with` and is returned as-is.
    with {:ok, payload} <- get_viewer("/configs", [{"dataset", dataset_id}], opts) do
      case payload do
        %{"configs" => config_list} -> {:ok, config_list}
        unexpected -> {:ok, List.wrap(unexpected)}
      end
    end
  end

  @doc """
  Lists the splits of a dataset via the `/splits` endpoint.

  When the response body is a map with a `"splits"` key, the value under that
  key is returned. Any other successful body is wrapped with `List.wrap/1`.
  Errors are passed through unchanged.

  ## Options

  - `:config` — configuration name; added to the query only when given
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, splits} = HuggingfaceClient.dataset_viewer_splits("rajpurkar/squad",
        config: "plain_text"
      )
      Enum.each(splits, fn s -> IO.puts(s["split"]) end)
  """
  @spec list_splits(String.t(), keyword()) :: {:ok, [map()]} | {:error, Exception.t()}
  def list_splits(dataset_id, opts \\ []) do
    config_param = if name = opts[:config], do: [{"config", name}], else: []
    query_params = config_param ++ [{"dataset", dataset_id}]

    with {:ok, payload} <- get_viewer("/splits", query_params, opts) do
      case payload do
        %{"splits" => split_list} -> {:ok, split_list}
        unexpected -> {:ok, List.wrap(unexpected)}
      end
    end
  end

  # ── Content ───────────────────────────────────────────────────────────────────

  @doc """
  Returns the first 100 rows of a dataset split (as a preview).

  Issues a request to the `/first-rows` endpoint and returns the response body
  unchanged on success.

  ## Options

  - `:config` — configuration/subset name; added to the query only when given
  - `:split` — split name (e.g. `"train"`, `"test"`); added to the query only
    when given
  - `:access_token` — HF API token for private datasets; when present, sent as
    a bearer token in the `authorization` header

  ## Example

      {:ok, preview} = HuggingfaceClient.dataset_viewer_first_rows("rajpurkar/squad",
        config: "plain_text",
        split: "train"
      )
      Enum.each(preview["rows"], fn r -> IO.inspect(r["row"]) end)
  """
  @spec first_rows(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def first_rows(dataset_id, opts \\ []) do
    # Optional filters are only sent when the caller supplied a truthy value.
    optional =
      for {name, value} <- [{"split", opts[:split]}, {"config", opts[:config]}], value do
        {name, value}
      end

    get_viewer("/first-rows", optional ++ [{"dataset", dataset_id}], opts)
  end

  @doc """
  Fetches a page of rows from a dataset split via the `/rows` endpoint.

  The response body is returned unchanged on success.

  ## Options

  - `:config` — configuration name; added to the query only when given
  - `:split` — split name (required; not validated client-side — the request
    is sent without it when omitted)
  - `:offset` — row offset (default: 0)
  - `:length` — number of rows to return (default: 100); values above 100 are
    capped at 100
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, result} = HuggingfaceClient.dataset_viewer_rows("rajpurkar/squad",
        config: "plain_text",
        split: "train",
        offset: 0,
        length: 10
      )

      IO.puts("Total rows: \#{result["num_rows_total"]}")
      Enum.each(result["rows"], fn r -> IO.inspect(r["row"]) end)
  """
  @spec rows(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def rows(dataset_id, opts \\ []) do
    start_at = opts[:offset] || 0
    requested = opts[:length] || 100
    page_size = min(requested, 100)

    always_sent = [{"dataset", dataset_id}, {"offset", start_at}, {"length", page_size}]

    optional =
      for {name, value} <- [{"split", opts[:split]}, {"config", opts[:config]}], value do
        {name, value}
      end

    get_viewer("/rows", optional ++ always_sent, opts)
  end

  @doc """
  Runs a text search over a dataset split via the `/search` endpoint.

  The response body is returned unchanged on success.

  ## Options

  - `:query` — search query string (required)
  - `:config` — configuration name; added to the query only when given
  - `:split` — split name (required; not validated client-side — the request
    is sent without it when omitted)
  - `:offset` — result offset (default: 0)
  - `:length` — number of results (default: 100); values above 100 are capped
    at 100
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  Raises `HuggingfaceClient.Error.InputError` when `:query` is missing.

  ## Example

      {:ok, results} = HuggingfaceClient.dataset_viewer_search("rajpurkar/squad",
        query: "Albert Einstein",
        config: "plain_text",
        split: "train"
      )
  """
  @spec search(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def search(dataset_id, opts \\ []) do
    search_term =
      case opts[:query] do
        absent when absent in [nil, false] ->
          raise HuggingfaceClient.Error.InputError, ":query is required"

        term ->
          term
      end

    start_at = opts[:offset] || 0
    page_size = min(opts[:length] || 100, 100)

    always_sent = [
      {"dataset", dataset_id},
      {"query", search_term},
      {"offset", start_at},
      {"length", page_size}
    ]

    optional =
      for {name, value} <- [{"split", opts[:split]}, {"config", opts[:config]}], value do
        {name, value}
      end

    get_viewer("/search", optional ++ always_sent, opts)
  end

  @doc """
  Returns column index/statistics for a dataset split via the `/column-index`
  endpoint.

  The response body is returned unchanged on success.

  ## Options

  - `:config` — configuration name; added to the query only when given
  - `:split` — split name; added to the query only when given
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, index} = HuggingfaceClient.dataset_viewer_column_index("rajpurkar/squad",
        config: "plain_text",
        split: "train"
      )
  """
  @spec column_index(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def column_index(dataset_id, opts \\ []) do
    # Each supplied option is prepended, so the final order is split, config, dataset.
    query_params =
      Enum.reduce([{"config", :config}, {"split", :split}], [{"dataset", dataset_id}], fn
        {name, key}, acc ->
          if value = opts[key], do: [{name, value} | acc], else: acc
      end)

    get_viewer("/column-index", query_params, opts)
  end

  # ── Statistics ────────────────────────────────────────────────────────────────

  @doc """
  Returns descriptive statistics for the columns of a dataset split via the
  `/statistics` endpoint.

  The response body is returned unchanged on success.

  ## Options

  - `:config` — configuration name; added to the query only when given
  - `:split` — split name; added to the query only when given
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, stats} = HuggingfaceClient.dataset_viewer_statistics("rajpurkar/squad",
        config: "plain_text",
        split: "train"
      )

      stats["statistics"]
      |> Enum.each(fn col ->
        IO.puts("\#{col["column_name"]}: type=\#{col["column_type"]}")
      end)
  """
  @spec statistics(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def statistics(dataset_id, opts \\ []) do
    filters = [{"split", opts[:split]}, {"config", opts[:config]}]
    supplied = Enum.filter(filters, fn {_name, value} -> value end)

    get_viewer("/statistics", supplied ++ [{"dataset", dataset_id}], opts)
  end

  @doc """
  Returns the Parquet file listing for a dataset via the `/parquet` endpoint.

  The response body is returned unchanged on success.

  ## Options

  - `:config` — configuration name; added to the query only when given
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, result} = HuggingfaceClient.dataset_viewer_parquet("rajpurkar/squad")
      result["parquet_files"] |> Enum.each(fn f ->
        IO.puts("\#{f["split"]}: \#{f["url"]}")
      end)
  """
  @spec parquet(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def parquet(dataset_id, opts \\ []) do
    base = [{"dataset", dataset_id}]

    query_params =
      case opts[:config] do
        nil -> base
        false -> base
        config_name -> [{"config", config_name} | base]
      end

    get_viewer("/parquet", query_params, opts)
  end

  @doc """
  Returns dataset info (size, number of rows, features schema, etc.) via the
  `/info` endpoint.

  The response body is returned unchanged on success.

  ## Options

  - `:config` — configuration name; added to the query only when given
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, info} = HuggingfaceClient.dataset_viewer_info("rajpurkar/squad")
      IO.inspect(info["dataset_info"])
  """
  @spec info(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def info(dataset_id, opts \\ []) do
    config_param = if name = opts[:config], do: [{"config", name}], else: []

    get_viewer("/info", config_param ++ [{"dataset", dataset_id}], opts)
  end

  @doc """
  Returns the feature schema (column types) for a dataset split.

  Issues a request to the `/dataset-info` endpoint and returns the response
  body unchanged on success.

  ## Options

  - `:config` — configuration name; added to the query only when given
  - `:split` — split name; added to the query only when given
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, features} = HuggingfaceClient.dataset_viewer_features("rajpurkar/squad",
        config: "plain_text",
        split: "train"
      )
  """
  @spec features(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def features(dataset_id, opts \\ []) do
    # NOTE(review): this targets `/dataset-info`, whereas `info/2` targets `/info` —
    # confirm that `/dataset-info` is a published Dataset Viewer endpoint.
    optional =
      for {name, value} <- [{"split", opts[:split]}, {"config", opts[:config]}], value do
        {name, value}
      end

    get_viewer("/dataset-info", optional ++ [{"dataset", dataset_id}], opts)
  end

  @doc """
  Returns dataset size (rows, bytes) per split via the `/size` endpoint.

  The response body is returned unchanged on success.

  ## Options

  - `:config` — configuration name; added to the query only when given
  - `:access_token` — when present, sent as a bearer token in the
    `authorization` header

  ## Example

      {:ok, size} = HuggingfaceClient.dataset_viewer_size("rajpurkar/squad")
      IO.inspect(size["size"])
  """
  @spec size(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
  def size(dataset_id, opts \\ []) do
    dataset_param = {"dataset", dataset_id}

    query_params =
      if config_name = opts[:config] do
        [{"config", config_name}, dataset_param]
      else
        [dataset_param]
      end

    get_viewer("/size", query_params, opts)
  end

  # ── Private ───────────────────────────────────────────────────────────────────

  # Performs a GET request against the Dataset Viewer API and normalizes the result.
  #
  # * `path`   — endpoint path appended to `@viewer_base` (e.g. "/rows")
  # * `params` — list of `{key, value}` pairs encoded into the query string
  # * `opts`   — caller options; only the auth token is read here
  #
  # Returns `{:ok, body}` for any 2xx status. Any other status, or a transport
  # failure, is returned as `{:error, %HubApiError{}}`; transport failures are
  # reported with `status: 0` since no HTTP response was received.
  defp get_viewer(path, params, opts) do
    # `:access_token` takes precedence; `:token` is accepted as a fallback alias so
    # callers using either spelling are authenticated instead of being silently
    # sent as anonymous requests.
    token = opts[:access_token] || opts[:token]
    url = "#{@viewer_base}#{path}?#{URI.encode_query(params)}"

    auth_headers = if token, do: [{"authorization", "Bearer #{token}"}], else: []

    req =
      Req.new(
        url: url,
        headers: auth_headers,
        finch: HuggingfaceClient.Finch,
        retry: false,
        receive_timeout: 30_000
      )

    case Req.get(req) do
      {:ok, %Req.Response{status: status, body: body}} when status in 200..299 ->
        {:ok, body}

      {:ok, %Req.Response{status: status, body: body}} ->
        {:error,
         HubApiError.exception(
           {"Dataset Viewer error (HTTP #{status})", %{url: url},
            %{status: status, request_id: "", body: body}}
         )}

      {:error, reason} ->
        {:error,
         HubApiError.exception(
           {"Dataset Viewer network error: #{inspect(reason)}", %{url: url},
            %{status: 0, request_id: "", body: ""}}
         )}
    end
  end
end