defmodule HuggingfaceClient.Hub.DatasetViewer do
@moduledoc """
HuggingFace Dataset Viewer API.
Provides programmatic access to dataset content, statistics, and structure
without downloading entire datasets locally.
See: https://huggingface.co/docs/dataset-viewer
## Example
# Fetch dataset info (size, number of rows, features schema)
{:ok, info} = HuggingfaceClient.dataset_viewer_info("rajpurkar/squad")
# Get the first 10 rows of a dataset split
{:ok, rows} = HuggingfaceClient.dataset_viewer_rows("rajpurkar/squad",
split: "train",
offset: 0,
length: 10
)
# Get dataset statistics
{:ok, stats} = HuggingfaceClient.dataset_viewer_statistics("rajpurkar/squad",
config: "plain_text",
split: "train"
)
"""
alias HuggingfaceClient.Error.HubApiError
# Base URL of the Dataset Viewer service; the private request helper appends
# the endpoint path and the encoded query string to it.
@viewer_base "https://datasets-server.huggingface.co"
# ── Metadata ──────────────────────────────────────────────────────────────────
@doc """
Reports whether a dataset can be served by the Dataset Viewer.

Queries the `/is-valid` endpoint, passing `dataset_id` as the `dataset`
query parameter.

Despite the trailing `?`, this function does not return a bare boolean. It
returns `{:ok, %{"preview" => bool, "viewer" => bool}}` on success, or
`{:error, exception}` when the request fails.

## Options

  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, info} = HuggingfaceClient.dataset_viewer_valid?("rajpurkar/squad")
"""
@spec valid?(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def valid?(dataset_id, opts \\ []) do
  query_params = [{"dataset", dataset_id}]
  get_viewer("/is-valid", query_params, opts)
end
@doc """
Lists the configurations (subsets) of a dataset.

Queries the `/configs` endpoint. When the response body is a map with a
`"configs"` key, the value under that key is returned. Any other successful
body is wrapped in a list with `List.wrap/1`. Errors are passed through
unchanged.

## Options

  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, configs} = HuggingfaceClient.dataset_viewer_configs("glue")
    Enum.each(configs, fn c -> IO.puts(c["config_name"]) end)
"""
@spec list_configs(String.t(), keyword()) :: {:ok, [map()]} | {:error, Exception.t()}
def list_configs(dataset_id, opts \\ []) do
  with {:ok, body} <- get_viewer("/configs", [{"dataset", dataset_id}], opts) do
    case body do
      %{"configs" => configs} -> {:ok, configs}
      unexpected -> {:ok, List.wrap(unexpected)}
    end
  end
end
@doc """
Lists the splits of a dataset, optionally restricted to one configuration.

Queries the `/splits` endpoint. When the response body is a map with a
`"splits"` key, the value under that key is returned. Any other successful
body is wrapped in a list with `List.wrap/1`. Errors are passed through
unchanged.

## Options

  - `:config` — configuration name; sent as the `config` query parameter when given
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, splits} = HuggingfaceClient.dataset_viewer_splits("rajpurkar/squad",
      config: "plain_text"
    )
    Enum.each(splits, fn s -> IO.puts(s["split"]) end)
"""
@spec list_splits(String.t(), keyword()) :: {:ok, [map()]} | {:error, Exception.t()}
def list_splits(dataset_id, opts \\ []) do
  dataset_param = {"dataset", dataset_id}

  query_params =
    case opts[:config] do
      absent when absent in [nil, false] -> [dataset_param]
      config_name -> [{"config", config_name}, dataset_param]
    end

  with {:ok, body} <- get_viewer("/splits", query_params, opts) do
    case body do
      %{"splits" => splits} -> {:ok, splits}
      unexpected -> {:ok, List.wrap(unexpected)}
    end
  end
end
# ── Content ───────────────────────────────────────────────────────────────────
@doc """
Returns the first 100 rows of a dataset split (as a preview).

Queries the `/first-rows` endpoint and returns the decoded response body
unchanged.

## Options

  - `:config` — configuration/subset name
  - `:split` — split name (e.g. `"train"`, `"test"`)
  - `:access_token` — HF API token for private datasets, sent as a bearer
    token in the `authorization` header
  - `:token` — alias for `:access_token`; ignored when `:access_token` is
    also given

## Example

    {:ok, preview} = HuggingfaceClient.dataset_viewer_first_rows("rajpurkar/squad",
      config: "plain_text",
      split: "train"
    )
    Enum.each(preview["rows"], fn r -> IO.inspect(r["row"]) end)
"""
@spec first_rows(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def first_rows(dataset_id, opts \\ []) do
  # This function has always documented `:token`, but the shared request
  # helper only reads `:access_token`. Forward the documented name so such
  # calls are actually authenticated; an explicit `:access_token` wins.
  legacy_token = opts[:token]

  opts =
    if legacy_token && !opts[:access_token] do
      Keyword.put(opts, :access_token, legacy_token)
    else
      opts
    end

  config = opts[:config]
  split = opts[:split]

  # Optional parameters are prepended, so the final order is split, config, dataset.
  params = [{"dataset", dataset_id}]
  params = if config, do: [{"config", config} | params], else: params
  params = if split, do: [{"split", split} | params], else: params

  get_viewer("/first-rows", params, opts)
end
@doc """
Returns one page of rows from a dataset split.

Queries the `/rows` endpoint and returns the decoded response body unchanged.

## Options

  - `:config` — configuration name
  - `:split` — split name (required)
  - `:offset` — row offset (default: 0)
  - `:length` — number of rows to return (default: 100); values above 100
    are capped at 100 before the request is sent
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, result} = HuggingfaceClient.dataset_viewer_rows("rajpurkar/squad",
      config: "plain_text",
      split: "train",
      offset: 0,
      length: 10
    )
    IO.puts("Total rows: \#{result["num_rows_total"]}")
    Enum.each(result["rows"], fn r -> IO.inspect(r["row"]) end)
"""
@spec rows(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def rows(dataset_id, opts \\ []) do
  page_start = opts[:offset] || 0
  page_size = min(opts[:length] || 100, 100)

  pagination = [{"dataset", dataset_id}, {"offset", page_start}, {"length", page_size}]

  # Only options that were actually supplied are sent; they precede the
  # mandatory parameters in the query string (split first, then config).
  scope =
    for {key, value} <- [{"split", opts[:split]}, {"config", opts[:config]}], value do
      {key, value}
    end

  get_viewer("/rows", scope ++ pagination, opts)
end
@doc """
Searches the rows of a dataset split.

Queries the `/search` endpoint and returns the decoded response body
unchanged.

Raises `HuggingfaceClient.Error.InputError` when `:query` is missing.

## Options

  - `:query` — search query string (required)
  - `:config` — configuration name
  - `:split` — split name (required)
  - `:offset` — result offset (default: 0)
  - `:length` — number of results (default: 100); values above 100 are
    capped at 100 before the request is sent
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, results} = HuggingfaceClient.dataset_viewer_search("rajpurkar/squad",
      query: "Albert Einstein",
      config: "plain_text",
      split: "train"
    )
"""
@spec search(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def search(dataset_id, opts \\ []) do
  # Fail fast, before any request is built, when the caller forgot the query.
  search_text =
    case opts[:query] do
      missing when missing in [nil, false] ->
        raise HuggingfaceClient.Error.InputError, ":query is required"

      given ->
        given
    end

  page_start = opts[:offset] || 0
  page_size = min(opts[:length] || 100, 100)

  required = [
    {"dataset", dataset_id},
    {"query", search_text},
    {"offset", page_start},
    {"length", page_size}
  ]

  # Prepending config and then split keeps the order split, config, required….
  params =
    Enum.reduce([{"config", opts[:config]}, {"split", opts[:split]}], required, fn
      {_name, value}, acc when value in [nil, false] -> acc
      pair, acc -> [pair | acc]
    end)

  get_viewer("/search", params, opts)
end
@doc """
Returns column index/statistics for a dataset split.

Queries the `/column-index` endpoint and returns the decoded response body
unchanged.

## Options

  - `:config` — configuration name
  - `:split` — split name
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, index} = HuggingfaceClient.dataset_viewer_column_index("rajpurkar/squad",
      config: "plain_text",
      split: "train"
    )
"""
@spec column_index(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def column_index(dataset_id, opts \\ []) do
  # Each supplied option is pushed onto the front of the parameter list, so
  # split (pushed last) ends up first in the query string.
  params =
    Enum.reduce([{"config", opts[:config]}, {"split", opts[:split]}], [{"dataset", dataset_id}], fn
      {_name, value}, acc when value in [nil, false] -> acc
      pair, acc -> [pair | acc]
    end)

  get_viewer("/column-index", params, opts)
end
# ── Statistics ────────────────────────────────────────────────────────────────
@doc """
Returns descriptive statistics for each column in a dataset split.

Queries the `/statistics` endpoint and returns the decoded response body
unchanged.

## Options

  - `:config` — configuration name
  - `:split` — split name
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, stats} = HuggingfaceClient.dataset_viewer_statistics("rajpurkar/squad",
      config: "plain_text",
      split: "train"
    )
    stats["statistics"]
    |> Enum.each(fn col ->
      IO.puts("\#{col["column_name"]}: type=\#{col["column_type"]}")
    end)
"""
@spec statistics(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def statistics(dataset_id, opts \\ []) do
  # Keep only the options the caller supplied, in the order split, config.
  optional =
    for {name, value} <- [{"split", opts[:split]}, {"config", opts[:config]}], value do
      {name, value}
    end

  get_viewer("/statistics", optional ++ [{"dataset", dataset_id}], opts)
end
@doc """
Returns the Parquet file URLs for a dataset (if available in Parquet format).

Queries the `/parquet` endpoint and returns the decoded response body
unchanged.

## Options

  - `:config` — configuration name; sent as the `config` query parameter when given
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, result} = HuggingfaceClient.dataset_viewer_parquet("rajpurkar/squad")
    result["parquet_files"] |> Enum.each(fn f ->
      IO.puts("\#{f["split"]}: \#{f["url"]}")
    end)
"""
@spec parquet(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def parquet(dataset_id, opts \\ []) do
  dataset_param = {"dataset", dataset_id}

  query_params =
    case opts[:config] do
      absent when absent in [nil, false] -> [dataset_param]
      config_name -> [{"config", config_name}, dataset_param]
    end

  get_viewer("/parquet", query_params, opts)
end
@doc """
Returns dataset info (size, number of rows, features schema, etc.).

Queries the `/info` endpoint and returns the decoded response body unchanged.

## Options

  - `:config` — configuration name; sent as the `config` query parameter when given
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, info} = HuggingfaceClient.dataset_viewer_info("rajpurkar/squad")
    IO.inspect(info["dataset_info"])
"""
@spec info(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def info(dataset_id, opts \\ []) do
  selected_config = opts[:config]

  config_part = if selected_config, do: [{"config", selected_config}], else: []

  get_viewer("/info", config_part ++ [{"dataset", dataset_id}], opts)
end
@doc """
Returns the feature schema (column types) for a dataset split.

Queries the `/dataset-info` endpoint and returns the decoded response body
unchanged.

NOTE(review): this function requests `/dataset-info`, whereas `info/2`
requests `/info`. Confirm against the Dataset Viewer API that
`/dataset-info` is the intended endpoint.

## Options

  - `:config` — configuration name
  - `:split` — split name
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, features} = HuggingfaceClient.dataset_viewer_features("rajpurkar/squad",
      config: "plain_text",
      split: "train"
    )
"""
@spec features(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def features(dataset_id, opts \\ []) do
  # Drop options that were not supplied; the remaining order is split, config.
  supplied =
    Enum.filter(
      [{"split", opts[:split]}, {"config", opts[:config]}],
      fn {_name, value} -> value end
    )

  get_viewer("/dataset-info", supplied ++ [{"dataset", dataset_id}], opts)
end
@doc """
Returns dataset size (rows, bytes) per split.

Queries the `/size` endpoint and returns the decoded response body unchanged.

## Options

  - `:config` — configuration name; sent as the `config` query parameter when given
  - `:access_token` — bearer token sent in the `authorization` header

## Example

    {:ok, size} = HuggingfaceClient.dataset_viewer_size("rajpurkar/squad")
    IO.inspect(size["size"])
"""
@spec size(String.t(), keyword()) :: {:ok, map()} | {:error, Exception.t()}
def size(dataset_id, opts \\ []) do
  query_params =
    Enum.reduce([{"config", opts[:config]}], [{"dataset", dataset_id}], fn
      {_name, value}, acc when value in [nil, false] -> acc
      pair, acc -> [pair | acc]
    end)

  get_viewer("/size", query_params, opts)
end
# ── Private ───────────────────────────────────────────────────────────────────
# Issues a GET request against one Dataset Viewer endpoint.
#
# `path` is appended to the base URL and `params` (a list of `{name, value}`
# pairs) becomes the query string. The only option read from `opts` is
# `:access_token`, which is sent as a bearer token when present.
#
# Returns `{:ok, body}` for any 2xx response. Every other outcome is returned
# as `{:error, %HubApiError{}}`: non-2xx responses carry the HTTP status and
# body, and transport failures are reported with status 0 and an empty body.
# Retries are disabled and the receive timeout is 30 seconds.
defp get_viewer(path, params, opts) do
  url = "#{@viewer_base}#{path}?#{URI.encode_query(params)}"

  outcome =
    [
      url: url,
      headers: bearer_headers(opts[:access_token]),
      finch: HuggingfaceClient.Finch,
      retry: false,
      receive_timeout: 30_000
    ]
    |> Req.new()
    |> Req.get()

  case outcome do
    {:ok, %Req.Response{status: status, body: body}} when status in 200..299 ->
      {:ok, body}

    {:ok, %Req.Response{status: status, body: body}} ->
      {:error, viewer_error("Dataset Viewer error (HTTP #{status})", url, status, body)}

    {:error, reason} ->
      {:error, viewer_error("Dataset Viewer network error: #{inspect(reason)}", url, 0, "")}
  end
end

# Builds the authorization header list; empty when no token was supplied.
defp bearer_headers(token) when token in [nil, false], do: []
defp bearer_headers(token), do: [{"authorization", "Bearer #{token}"}]

# Wraps a failure in a HubApiError. The request id is not captured here, so it
# is always reported as an empty string.
defp viewer_error(message, url, status, body) do
  HubApiError.exception({message, %{url: url}, %{status: status, request_id: "", body: body}})
end
end