notebooks/micrograd_extras.livemd

# MicrogradEx Extras

This notebook goes beyond the official parity path in `micrograd_demo.livemd`. The examples are bounded so they stay usable with scalar autodiff.

## Setup

```elixir
# Resolve the MicrogradEx project root before Mix.install runs.
#
# Candidates are tried in order: the MICROGRAD_EX_PATH environment variable,
# the parent of this notebook's directory, the notebook's directory, the
# current working directory, and a `micrograd_ex` folder inside the current
# working directory. The first candidate that contains both `mix.exs` and
# `lib/micrograd_ex.ex` is used.
micrograd_ex_candidates =
  [
    System.get_env("MICROGRAD_EX_PATH"),
    Path.expand("..", __DIR__),
    Path.expand(".", __DIR__),
    File.cwd!(),
    Path.expand("micrograd_ex", File.cwd!())
  ]
  # An unset variable yields nil and an empty one yields ""; neither is a path.
  |> Enum.reject(&(&1 in [nil, ""]))
  # Expanding turns `~` and relative values from the environment variable into
  # absolute paths, so the path dependency does not depend on how it was typed.
  |> Enum.map(&Path.expand/1)
  |> Enum.uniq()

micrograd_ex_path =
  Enum.find(micrograd_ex_candidates, fn path ->
    File.exists?(Path.join(path, "mix.exs")) and
      File.exists?(Path.join(path, "lib/micrograd_ex.ex"))
  end) ||
    raise """
    Could not locate the MicrogradEx Mix project.

    Searched:
    #{Enum.map_join(micrograd_ex_candidates, "\n", &("  " <> &1))}

    Set MICROGRAD_EX_PATH to the repository path, for example:
    /path/to/micrograd_ex
    """

# Install the local MicrogradEx checkout (resolved above as a path dependency)
# together with the packages this notebook uses for output: Kino for data
# tables and VegaLite (with its Kino integration) for charts.
Mix.install([
  {:micrograd_ex, path: micrograd_ex_path},
  {:kino, "~> 0.14"},
  {:kino_vega_lite, "~> 0.1"},
  {:vega_lite, "~> 0.1"}
])

alias VegaLite, as: Vl

alias MicrogradEx.NN
alias MicrogradEx.NN.MLP
alias MicrogradEx.Datasets
alias MicrogradEx.Losses
alias MicrogradEx.Trainer
alias MicrogradEx.PlotData
```

## 1. Why this notebook exists

The main notebook stays close to the official micrograd demo. This notebook explores the knobs people naturally ask about: dataset shape, noise, model size, regularization, learning rate, decision-boundary resolution, and controlled failure modes.

The cells use small sample counts and short training runs. Increase them only when you are ready to wait for scalar training.

```elixir
# Shared seed: every dataset generator and `MLP.new/3` call in this notebook
# receives it through the `seed:` option.
# NOTE(review): presumably this makes runs repeatable; confirm against the library.
seed = {1, 2, 3}
```

## 2. Dataset explorer

These static examples cover the same knobs you would expose as bounded controls: dataset type, noise, and sample count.

```elixir
# Each entry pairs a short name with a dataset; all four are built with the
# same size argument (80) and the shared seed, varying only generator and noise.
dataset_examples = [
  moons_clean: Datasets.moons(80, noise: 0.0, seed: seed),
  moons_noisy: Datasets.moons(80, noise: 0.2, seed: seed),
  spiral: Datasets.spiral(80, noise: 0.05, seed: seed),
  blobs: Datasets.blobs(80, noise: 0.25, seed: seed)
]

# Flatten every dataset into plot rows, tagging each row with the name of the
# dataset it came from.
dataset_rows =
  for {name, dataset} <- dataset_examples,
      point <- PlotData.dataset_points(dataset) do
    Map.put(point, :dataset, Atom.to_string(name))
  end

# Colour encodes the "label" field; marker shape encodes the source dataset.
Vl.new(width: 560, height: 360)
|> Vl.data_from_values(dataset_rows)
|> Vl.mark(:point, filled: true, size: 65)
|> Vl.encode_field(:x, "x", type: :quantitative)
|> Vl.encode_field(:y, "y", type: :quantitative)
|> Vl.encode_field(:color, "label", type: :nominal)
|> Vl.encode_field(:shape, "dataset", type: :nominal)
```

## 3. Architecture comparison

This compares model capacity on a small moons dataset. All three models, including the larger `[32, 32, 1]` one, use the same short 8-step run so the comparison stays fast.

```elixir
# Small moons dataset; later cells in this notebook reuse it.
comparison_dataset = Datasets.moons(40, noise: 0.1, seed: seed)

# Name => layer sizes handed to `MLP.new/3`.
architecture_specs = [
  small: [8, 8, 1],
  official: [16, 16, 1],
  large: [32, 32, 1]
]

# Train each architecture with identical settings and collect one summary row
# per model.
architecture_results =
  for {name, layers} <- architecture_specs do
    model = MLP.new(2, layers, seed: seed)

    # Loss of the untrained model, reported next to the post-training loss.
    untrained = Losses.max_margin(model, comparison_dataset.xs, comparison_dataset.ys)

    train_opts = [
      steps: 8,
      alpha: 1.0e-4,
      # Linear schedule: 0.35 at k = 0, reaching 0.15 at k = 8.
      learning_rate: fn k -> 0.35 - 0.2 * k / 8.0 end
    ]

    trained = Trainer.train(model, comparison_dataset, train_opts)

    %{
      name: Atom.to_string(name),
      layers: inspect(layers),
      parameter_count: NN.parameter_count(model),
      initial_loss: untrained.total_loss.data,
      final_loss: trained.final_loss,
      final_accuracy_percent: trained.final_accuracy * 100.0
    }
  end

Kino.DataTable.new(architecture_results)
```

## 4. Regularization comparison

L2 regularization changes the balance between fitting the data and keeping parameters small.

```elixir
# Name => `alpha` value passed to the trainer.
regularization_specs = [
  no_regularization: 0.0,
  default: 1.0e-4,
  stronger: 1.0e-2
]

# Same architecture, seed, step count and learning rate for every run; only
# `alpha` changes.
regularization_results =
  for {name, alpha} <- regularization_specs do
    trained =
      2
      |> MLP.new([8, 8, 1], seed: seed)
      |> Trainer.train(comparison_dataset, steps: 8, alpha: alpha, learning_rate: 0.25)

    # Mean absolute value of the trained parameters.
    magnitudes = for param <- NN.parameters(trained.final_model), do: abs(param.data)
    mean_magnitude = Enum.sum(magnitudes) / length(magnitudes)

    # The last history entry supplies the data/regularization loss split.
    last_step = List.last(trained.history)

    %{
      name: Atom.to_string(name),
      alpha: alpha,
      final_loss: trained.final_loss,
      data_loss: last_step.data_loss,
      reg_loss: last_step.reg_loss,
      final_accuracy_percent: trained.final_accuracy * 100.0,
      mean_abs_parameter: mean_magnitude
    }
  end

Kino.DataTable.new(regularization_results)
```

## 5. Learning-rate comparison

The learning-rate schedule can matter as much as model size.

```elixir
# Name => learning rate; a value is either a constant or a one-argument function.
learning_rate_specs = [
  constant_0_1: 0.1,
  constant_0_5: 0.5,
  official_decay: &Trainer.official_micrograd_learning_rate/1
]

# One training run per schedule, all starting from an identically seeded model.
learning_rate_runs =
  for {name, learning_rate} <- learning_rate_specs do
    fresh_model = MLP.new(2, [8, 8, 1], seed: seed)

    result =
      Trainer.train(fresh_model, comparison_dataset,
        steps: 8,
        alpha: 1.0e-4,
        learning_rate: learning_rate
      )

    {Atom.to_string(name), result}
  end

# Long-format rows: one per (run, step) carrying that step's loss.
learning_rate_rows =
  for {run_name, result} <- learning_rate_runs,
      entry <- result.history do
    %{run: run_name, step: entry.step, metric: "loss", value: entry.loss}
  end

# One loss curve per run, distinguished by colour.
Vl.new(width: 640, height: 280)
|> Vl.data_from_values(learning_rate_rows)
|> Vl.mark(:line)
|> Vl.encode_field(:x, "step", type: :quantitative)
|> Vl.encode_field(:y, "value", type: :quantitative)
|> Vl.encode_field(:color, "run", type: :nominal)
```

## 6. Decision-boundary resolution

Smaller `h` values produce smoother-looking boundaries but require many more scalar forward passes.

```elixir
boundary_model = MLP.new(2, [8, 8, 1], seed: seed)

# Train once; the same trained model is evaluated at every grid resolution.
boundary_run =
  Trainer.train(
    boundary_model,
    comparison_dataset,
    steps: 12,
    alpha: 1.0e-4,
    learning_rate: 0.25
  )

# For each `h`, build the decision-boundary data and record how many entries
# it contains.
resolution_results =
  for {name, h} <- [coarse: 0.5, default: 0.25, fine: 0.15] do
    grid = PlotData.decision_boundary(boundary_run.final_model, comparison_dataset, h: h)

    %{name: Atom.to_string(name), h: h, grid_points: length(grid)}
  end

Kino.DataTable.new(resolution_results)
```

## 7. Spiral dataset challenge

Spirals are harder than moons. Do not expect perfect accuracy from every short scalar run.

```elixir
# Spiral dataset used by this cell and by the plotting cell that follows.
spiral = Datasets.spiral(60, noise: 0.05, turns: 1.5, seed: seed)

spiral_model = MLP.new(2, [16, 16, 1], seed: seed)

spiral_train_opts = [
  steps: 20,
  alpha: 1.0e-4,
  # Linear schedule: 0.4 at k = 0, reaching 0.15 at k = 20.
  learning_rate: fn k -> 0.4 - 0.25 * k / 20.0 end
]

spiral_run = Trainer.train(spiral_model, spiral, spiral_train_opts)

# Summary map shown as the cell result.
%{
  final_loss: spiral_run.final_loss,
  final_accuracy_percent: spiral_run.final_accuracy * 100.0
}
```

```elixir
spiral_boundary = PlotData.decision_boundary(spiral_run.final_model, spiral, h: 0.35)
spiral_points = PlotData.dataset_points(spiral)

# Both layers share the same quantitative x/y encodings.
with_xy_axes = fn chart ->
  chart
  |> Vl.encode_field(:x, "x", type: :quantitative)
  |> Vl.encode_field(:y, "y", type: :quantitative)
end

# Background layer: faint points coloured by the "predicted" field.
spiral_background =
  Vl.new()
  |> Vl.data_from_values(spiral_boundary)
  |> Vl.mark(:point, filled: true, opacity: 0.25, size: 60)
  |> with_xy_axes.()
  |> Vl.encode_field(:color, "predicted", type: :nominal)

# Foreground layer: the dataset points, outlined in black and coloured by "label".
spiral_foreground =
  Vl.new()
  |> Vl.data_from_values(spiral_points)
  |> Vl.mark(:point, filled: true, size: 75, stroke: "black", strokeWidth: 1)
  |> with_xy_axes.()
  |> Vl.encode_field(:color, "label", type: :nominal)

Vl.new(width: 520, height: 420)
|> Vl.layers([spiral_background, spiral_foreground])
```

## 8. Failure modes

With high noise and very little capacity, the model may reduce loss but still draw a poor boundary. This is expected: there is not enough clean signal or model capacity.

```elixir
# Noisy moons data paired with the smallest model used in this notebook.
failure_dataset = Datasets.moons(40, noise: 0.35, seed: seed)
failure_model = MLP.new(2, [1, 1], seed: seed)

# Loss before any training, for comparison with the final loss below.
failure_initial = Losses.max_margin(failure_model, failure_dataset.xs, failure_dataset.ys)

failure_train_opts = [steps: 8, alpha: 1.0e-4, learning_rate: 0.2]
failure_run = Trainer.train(failure_model, failure_dataset, failure_train_opts)

# Summary map shown as the cell result.
%{
  parameter_count: NN.parameter_count(failure_model),
  initial_loss: failure_initial.total_loss.data,
  final_loss: failure_run.final_loss,
  final_accuracy_percent: failure_run.final_accuracy * 100.0
}
```

## 9. Things to try next

Try changing one setting at a time:

* increase `steps` for the spiral challenge;
* compare `[4, 4, 1]` and `[32, 32, 1]`;
* increase `noise` and watch accuracy;
* set `alpha: 0.0`;
* make the decision-boundary grid coarser with `h: 0.5`.