lib/scholar/cluster/dbscan.ex

defmodule Scholar.Cluster.DBSCAN do
  @moduledoc """
  Perform DBSCAN clustering from vector array or distance matrix.

  DBSCAN - Density-Based Spatial Clustering of Applications with Noise.
  Finds core samples of high density and expands clusters from them.
  Good for data that contains clusters of similar density.
  """
  import Nx.Defn
  import Scholar.Shared

  @derive {Nx.Container, containers: [:core_sample_indices, :labels]}
  defstruct [:core_sample_indices, :labels]

  opts = [
    eps: [
      default: 0.5,
      doc: """
      The maximum distance between two samples for them to be considered part of the same neighborhood.
      """,
      type: {:custom, Scholar.Options, :positive_number, []}
    ],
    min_samples: [
      default: 5,
      doc: """
      The number of samples (or total weight) in a neighborhood required for a point to be considered a core point.
      This includes the point itself.
      """,
      type: :integer
    ],
    metric: [
      type: {:custom, Scholar.Options, :metric, []},
      default: {:minkowski, 2},
      doc: ~S"""
      Name of the metric. Possible values:

      * `{:minkowski, p}` - Minkowski metric. By changing the value of the `p` parameter (a positive number or `:infinity`)
        we can obtain the Manhattan (`1`), Euclidean (`2`), Chebyshev (`:infinity`), or any arbitrary $L_p$ metric.

      * `:cosine` - Cosine metric.
      """
    ],
    weights: [
      type: {:custom, Scholar.Options, :weights, []},
      doc: """
      The weights for each observation in `x`. If `nil`, all observations
      are assigned equal weight.
      """
    ]
  ]

  @opts_schema NimbleOptions.new!(opts)

  @doc """
  Perform DBSCAN clustering from vector array or distance matrix.

  ## Options

  #{NimbleOptions.docs(@opts_schema)}

  ## Return Values

  The function returns a struct with the following parameters:

  * `:core_sample_indices` - Indices of core samples represented as a mask.
    The mask is a boolean array of shape `{num_samples}` where `1` indicates
    that the corresponding sample is a core sample and `0` otherwise.

  * `:labels` - Cluster labels for each point in the dataset given to `fit/2`.
    Noisy samples are given the label `-1`.

  ## Examples

      iex> x = Nx.tensor([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]])
      iex> Scholar.Cluster.DBSCAN.fit(x, eps: 3, min_samples: 2)
      %Scholar.Cluster.DBSCAN{
        core_sample_indices: Nx.tensor(
          [1, 1, 1, 1, 1, 0], type: :u8
        ),
        labels: Nx.tensor(
          [0, 0, 0, 1, 1, -1]
        )
      }
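
  Per-observation weights and alternative metrics are configured through the
  `:weights` and `:metric` options. The call below is only an illustrative
  sketch (the weight values are arbitrary) and its result is not shown:

      Scholar.Cluster.DBSCAN.fit(x,
        eps: 3,
        min_samples: 2,
        metric: {:minkowski, 1},
        weights: Nx.tensor([1.0, 1.0, 1.0, 1.0, 1.0, 2.0])
      )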
  """
  deftransform fit(x, opts \\ []) do
    fit_n(x, NimbleOptions.validate!(opts, @opts_schema))
  end

  defnp fit_n(x, opts) do
    num_samples = Nx.axis_size(x, 0)
    weights = validate_weights(opts[:weights], num_samples, type: to_float_type(x))
    y_dummy = Nx.broadcast(Nx.tensor(0), {num_samples})

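    # Fit a radius-based neighbor model; the target is never used here, so a
    # dummy tensor of zeros is passed only to satisfy the fit signature.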
    neighbor_model =
      Scholar.Neighbors.RadiusNearestNeighbors.fit(x, y_dummy,
        num_classes: 1,
        radius: opts[:eps],
        metric: opts[:metric]
      )

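    # `indices` is a {num_samples, num_samples} 0/1 mask in which entry {i, j}
    # is 1 when sample j lies within `eps` of sample i (a point is always in
    # its own neighborhood).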
    {_dist, indices} =
      Scholar.Neighbors.RadiusNearestNeighbors.radius_neighbors(neighbor_model, x)

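    # A sample is a core sample when the (weighted) number of points within
    # `eps` of it, including the sample itself, is at least `min_samples`.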
    n_neighbors = Nx.sum(indices * weights, axes: [1])
    core_samples = n_neighbors >= opts[:min_samples]
    labels = dbscan_inner(core_samples, indices)

    %__MODULE__{
      core_sample_indices: core_samples,
      labels: labels
    }
  end

  defnp dbscan_inner(is_core?, indices) do
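    # Stack-based cluster expansion over the neighborhood mask: every unlabeled
    # core sample seeds a new cluster, which is then grown by repeatedly popping
    # points from an explicit stack and pushing their unlabeled neighbors
    # whenever the popped point is itself a core sample.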
    {labels, _} =
      while {labels = Nx.broadcast(0, {Nx.axis_size(indices, 0)}),
             {indices, is_core?, label_num = 1, i = 0}},
            i < Nx.axis_size(indices, 0) do
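        # The stack holds indices of points queued for expansion. A point may be
        # pushed once by each of its core neighbors before it gets labeled, so
        # n * n is a safe upper bound on the stack size.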
        stack = Nx.broadcast(0, {Nx.axis_size(indices, 0) ** 2})
        stack_ptr = 0

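        # Only unlabeled core samples seed a new cluster; points that already
        # belong to a cluster or are not core samples are skipped here.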
        if Nx.take(labels, i) != 0 or not Nx.take(is_core?, i) do
          {labels, {indices, is_core?, label_num, i + 1}}
        else
          {labels, _} =
            while {labels, {k = i, label_num, indices, is_core?, stack, stack_ptr}},
                  stack_ptr >= 0 do
              {labels, stack, stack_ptr} =
                if Nx.take(labels, k) == 0 do
                  labels =
                    Nx.indexed_put(
                      labels,
                      Nx.new_axis(Nx.new_axis(k, 0), 0),
                      Nx.new_axis(label_num, 0)
                    )

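                  # Only core samples expand the cluster further: scan the
                  # neighborhood of `k` and queue its unlabeled neighbors.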
                  {stack, stack_ptr} =
                    if Nx.take(is_core?, k) do
                      neighb = Nx.take(indices, k)
                      mask = neighb * (labels == 0)

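                      # Push the index of every neighbor that is still
                      # unlabeled onto the stack.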
                      {stack, stack_ptr, _} =
                        while {stack, stack_ptr, {mask, j = 0}}, j < Nx.axis_size(mask, 0) do
                          if Nx.take(mask, j) != 0 do
                            stack =
                              Nx.indexed_put(
                                stack,
                                Nx.new_axis(Nx.new_axis(stack_ptr, 0), 0),
                                Nx.new_axis(j, 0)
                              )

                            {stack, stack_ptr + 1, {mask, j + 1}}
                          else
                            {stack, stack_ptr, {mask, j + 1}}
                          end
                        end

                      {stack, stack_ptr}
                    else
                      {stack, stack_ptr}
                    end

                  {labels, stack, stack_ptr}
                else
                  {labels, stack, stack_ptr}
                end

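              # Pop the next point to visit; once the stack is empty, stack_ptr
              # drops below zero and the expansion loop terminates.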
              k = if stack_ptr > 0, do: Nx.take(stack, stack_ptr - 1), else: -1
              stack_ptr = stack_ptr - 1
              {labels, {k, label_num, indices, is_core?, stack, stack_ptr}}
            end

          {labels, {indices, is_core?, label_num + 1, i + 1}}
        end
      end

    # Subtract 1 so cluster labels start at 0 and points that were never labeled
    # (noise) become -1. Internally the numbering starts at label_num = 1, with 0
    # meaning "unlabeled", which simplifies the loops above.
    labels - 1
  end
end