lib/archeometer/analysis/dsm.ex

defmodule Archeometer.Analysis.DSM do
  @moduledoc """
  Defines algorithms for Design Structure Matrix (DSM) analysis.

  A DSM represents a graph of dependencies between components or processes,
  in this case, between Elixir modules.

  See [DSM Web](https://dsmweb.org/introduction-to-dsm/).

  In this implementation, the DSM is represented by the `DSM` struct,
  with the following attributes:

  - `nodes`. A List of of nodes that besides enumerating the nodes, defines at the same
    time the order of rows and columns. They have always the same order.
  - `edges`. The edges of the graph, in the form of a map, with a key representing existing
    edges within the matrix in the form of `{row, col}`, and values can be one of `:+` or `:*`.
    `:+` is used when `row` and `col` have the same value, and `:*` is used whenever the module
    represented by `col` has a dependency on module represented by `row`.
  - `groups`. Its a map of identified cyclic groups within the graph, every key is just an
    identifier for the gruop, and the value is a list of the nodes conforming that gruop. The
    value of `groups` is calculated by the algorithm and not expected to be set by the user.
  - `n_groups`. This is used for internal use of the algorithms only and not expected to be
    of any use to final users.

  Functions `gen_dsm/1` and `gen_dsm/4` are used to create a `DSM` respectively from an
  adjacency list or from information in an `Archeometer` database.

  The only analysis algorith currently implemented is called `triangularization`, and you can
  use it with the function `triangularize/1`. It helps you find groups of nodes that form cycles.

  """
  alias Archeometer.Analysis.DSM

  defstruct nodes: [], edges: %{}, groups: %{}, n_groups: 0

  @doc """
  Creates a DSM from an adjacency list

  ## Examples

      iex> adj_list = %{
      ...>   1 => [2, 3],
      ...>   2 => [3, 4],
      ...>   3 => [4],
      ...>   4 => [3]
      ...> }
      %{1 => [2, 3], 2 => [3, 4], 3 => [4], 4 => [3]}
      iex> Archeometer.Analysis.DSM.gen_dsm(adj_list)
      %Archeometer.Analysis.DSM{
        edges: %{
          {1, 1} => :+,
          {2, 1} => :*,
          {2, 2} => :+,
          {3, 1} => :*,
          {3, 2} => :*,
          {3, 3} => :+,
          {3, 4} => :*,
          {4, 2} => :*,
          {4, 3} => :*,
          {4, 4} => :+
        },
        groups: %{},
        n_groups: 0,
        nodes: [1, 2, 3, 4]
      }

  """
  def gen_dsm(xrefs) do
    edges = gen_edges(xrefs)
    nodes = gen_nodes(xrefs)

    %__MODULE__{nodes: nodes, edges: edges}
  end

  @doc """
  Generates a DSM using module xref information from an Archeometer DB.

  ## Parameters

  - `app` is an application name, in the case of an umbrella project, it's the name of
    one of the child applications.
  - `namespace` is a "namespace" to narrow the set of modules considered for createing the
    DSM. The term "namespace" is meant very broadly since Elixir doesn't have that concept.
    Namespace can be interpreted as a string matching the beginning of the module names to
    be considered, such as "Foo" or "Foo.Bar", after which a last part of the name must
    be given to create a full module name. For example the "namespace" `Foo` will include
    in the analysis modules such as `Foo.Bar` and `Foo.Baz` but not `FooBar` or `Food`.
  - `db_name` is the name of the database file to be used.
  - `skip_tests` is a boolean indicating if test related modules should be skipped.

  ## Examples

  Using Archeometer provided test database.

      iex> Archeometer.Analysis.DSM.gen_dsm(
      ...>  "jissai",
      ...>  "Jissai.Reports",
      ...>  "./test/resources/db/archeometer_jissai.db",
      ...>  true
      ...>)
      {:ok,
       %Archeometer.Analysis.DSM{
         edges: %{
           {21, 21} => :+,
           {22, 21} => :*,
           {22, 22} => :+,
           {23, 21} => :*,
           {23, 23} => :+,
           {24, 21} => :*,
           {24, 24} => :+
         },
         groups: %{},
         n_groups: 0,
         nodes: [21, 22, 23, 24]
       },
       %{
         21 => "Jissai.Reports",
         22 => "Jissai.Reports.Attendance",
         23 => "Jissai.Reports.ClassDynamic",
         24 => "Jissai.Reports.Participant"
       }}
  """
  def gen_dsm(app, namespace, db_name, skip_tests) do
    {module_names, xrefs} = __MODULE__.Data.get_data(app, namespace, db_name, skip_tests)

    if map_size(module_names) == 0 do
      {:error, :no_modules_matched}
    else
      dsm = gen_dsm(xrefs)

      {:ok, dsm, module_names}
    end
  end

  @doc """
  Reorders a DSM to find dependency cycles.

  The cycles are identified in the `groups` attribute of the resulting DSM.

  ## Examples

  A DSM with one cycle

      iex> dsm = %Archeometer.Analysis.DSM{
      ...>   edges: %{
      ...>     {1, 1} => :+,
      ...>     {2, 1} => :*,
      ...>     {2, 2} => :+,
      ...>     {3, 1} => :*,
      ...>     {3, 2} => :*,
      ...>     {3, 3} => :+,
      ...>     {3, 4} => :*,
      ...>     {4, 2} => :*,
      ...>     {4, 3} => :*,
      ...>     {4, 4} => :+
      ...>   },
      ...>   groups: %{},
      ...>   n_groups: 0,
      ...>   nodes: [1, 2, 3, 4]
      ...> }
      iex> Archeometer.Analysis.DSM.triangularize(dsm)
      %Archeometer.Analysis.DSM{
        edges: %{
          {1, 1} => :+,
          {2, 1} => :*,
          {2, 2} => :+,
          {3, 1} => :*,
          {3, 2} => :*,
          {3, 3} => :+,
          {3, 4} => :*,
          {4, 2} => :*,
          {4, 3} => :*,
          {4, 4} => :+
        },
        groups: %{"g1" => [3, 4]},
        n_groups: 1,
        nodes: [1, 2, 3, 4]
      }

  As you can see, it detected the only existing cycle, formed by nodes `[3, 4]`. The nodes
  of the DSM are reordered such that the only nodes above the diagonal, are the ones
  forming part of a cycle.

  Now a DSM with two cycles

      iex> dsm = %Archeometer.Analysis.DSM{
      ...>   edges: %{
      ...>     {1, 1} => :+,
      ...>     {2, 1} => :*,
      ...>     {2, 2} => :+,
      ...>     {2, 3} => :*,
      ...>     {3, 1} => :*,
      ...>     {3, 2} => :*,
      ...>     {3, 3} => :+,
      ...>     {4, 4} => :+,
      ...>     {4, 5} => :*,
      ...>     {5, 4} => :*,
      ...>     {5, 5} => :+
      ...>   },
      ...>   groups: %{},
      ...>   n_groups: 0,
      ...>   nodes: [1, 2, 3, 4, 5]
      ...> }
      iex> Archeometer.Analysis.DSM.triangularize(dsm)
      %Archeometer.Analysis.DSM{
        edges: %{
          {1, 1} => :+,
          {2, 1} => :*,
          {2, 2} => :+,
          {2, 3} => :*,
          {3, 1} => :*,
          {3, 2} => :*,
          {3, 3} => :+,
          {4, 4} => :+,
          {4, 5} => :*,
          {5, 4} => :*,
          {5, 5} => :+
        },
        groups: %{"g1" => [2, 3], "g2" => [4, 5]},
        n_groups: 2,
        nodes: [1, 2, 3, 4, 5]
      }

  It correctly detects the two cycles: `[2, 3]` and `[4, 5]`.

  Now a DSM with a cycle of 3 nodes

      iex> dsm = %Archeometer.Analysis.DSM{
      ...>   edges: %{
      ...>     {1, 1} => :+,
      ...>     {2, 1} => :*,
      ...>     {2, 2} => :+,
      ...>     {2, 4} => :*,
      ...>     {3, 2} => :*,
      ...>     {3, 3} => :+,
      ...>     {4, 3} => :*,
      ...>     {4, 4} => :+
      ...>   },
      ...>   groups: %{},
      ...>   n_groups: 0,
      ...>   nodes: [1, 2, 3, 4]
      ...> }
      iex> Archeometer.Analysis.DSM.triangularize(dsm)
      %Archeometer.Analysis.DSM{
        edges: %{
          {1, 1} => :+,
          {2, 1} => :*,
          {2, 2} => :+,
          {2, 4} => :*,
          {3, 2} => :*,
          {3, 3} => :+,
          {4, 3} => :*,
          {4, 4} => :+
        },
        groups: %{"g1" => [2, 3, 4]},
        n_groups: 1,
        nodes: [1, 2, 3, 4]
      }

  The cycle is correctly detected: `[2, 3, 4]`.

  """
  def triangularize(%__MODULE__{} = mtx) do
    mtx
    |> triangularize(left: [], right: [])
    |> Map.put(:edges, mtx.edges)
    |> cyclic_deps_groups()
  end

  defp gen_edges(xrefs) do
    Enum.reduce(xrefs, %{}, fn {module, callees}, edges ->
      Enum.reduce(callees, edges, fn callee, edges_acc ->
        Map.put(edges_acc, {callee, module}, :*)
      end)
      |> Map.put({module, module}, :+)
    end)
  end

  defp gen_nodes(xrefs) do
    xrefs
    |> Enum.flat_map(fn {k, v} -> [k | v] end)
    |> Enum.uniq()
    |> Enum.sort()
  end

  defp triangularize(
         %__MODULE__{nodes: [], edges: _, groups: groups} = dsm,
         left: left_nodes,
         right: right_nodes
       ) do
    {node_seq, groups} =
      case map_size(groups) do
        0 ->
          {left_nodes ++ right_nodes, groups}

        _ ->
          (left_nodes ++ right_nodes) |> expand_groups(groups)
      end

    %{dsm | nodes: node_seq, groups: groups}
  end

  defp triangularize(mtx, left: left_nodes, right: right_nodes) do
    # IO.inspect(mtx, label: "mtx", charlists: :as_lists)
    # IO.inspect(left_nodes, label: "left_nodes", charlists: :as_lists)
    # IO.inspect(right_nodes, label: "right_nodes", charlists: :as_lists)
    # IO.puts("-------------------------------")

    case {roots(mtx), leaves(mtx)} do
      {[], []} ->
        cycle = find_cycle(mtx)
        new_mtx = group_nodes(mtx, cycle)

        triangularize(
          new_mtx,
          left: left_nodes,
          right: right_nodes
        )

      {origins, []} ->
        left_nodes = left_nodes ++ origins
        nodes = mtx.nodes -- origins
        edges = remove_nodes(origins, mtx.edges)

        triangularize(
          %{mtx | nodes: nodes, edges: edges, groups: mtx.groups},
          left: left_nodes,
          right: right_nodes
        )

      {[], destinations} ->
        right_nodes = destinations ++ right_nodes
        nodes = mtx.nodes -- destinations
        edges = remove_nodes(destinations, mtx.edges)

        triangularize(
          %{mtx | nodes: nodes, edges: edges, groups: mtx.groups},
          left: left_nodes,
          right: right_nodes
        )

      {origins, destinations} ->
        # Avoid duplication between origins and destinations
        destinations = destinations -- origins

        left_nodes = left_nodes ++ origins
        right_nodes = destinations ++ right_nodes
        marked_nodes = origins ++ destinations
        nodes = mtx.nodes -- marked_nodes
        edges = remove_nodes(marked_nodes, mtx.edges)

        triangularize(
          %{mtx | nodes: nodes, edges: edges, groups: mtx.groups},
          left: left_nodes,
          right: right_nodes
        )
    end
  end

  defp expand_groups(seq, groups) do
    expanded_seq = expand_seq_once(seq, groups)
    expanded_groups = expand_groups(groups)

    if Enum.any?(expanded_seq, &is_group?/1) do
      expand_groups(expanded_seq, expanded_groups)
    else
      {expanded_seq, expanded_groups}
    end
  end

  defp expand_groups(groups) do
    groups
    |> Enum.map(fn {g, nodes} -> {g, expand_seq_once(nodes, groups)} end)
    |> Enum.into(%{})
  end

  defp expand_seq_once(seq, groups) do
    seq
    |> Enum.map(fn n -> if Map.has_key?(groups, n), do: Map.get(groups, n), else: n end)
    |> List.flatten()
  end

  defp is_group?(n), do: is_binary(n) and String.starts_with?(n, "g")

  defp group_nodes(mtx, cycle) do
    new_group_id = "g" <> to_string(mtx.n_groups + 1)
    groups = Map.put(mtx.groups, new_group_id, cycle)

    to_substitute =
      mtx.edges
      |> Enum.filter(fn {{r, c}, _v} -> Enum.member?(cycle, r) or Enum.member?(cycle, c) end)
      |> Enum.into(%{})

    edges =
      to_substitute
      |> Map.keys()
      |> Enum.reduce(mtx.edges, fn edge, acc -> Map.delete(acc, edge) end)

    edges =
      to_substitute
      |> Enum.map(fn {{r, c}, _v} ->
        row = if Enum.member?(cycle, r), do: new_group_id, else: r
        col = if Enum.member?(cycle, c), do: new_group_id, else: c
        {{row, col}, if(r == c, do: :+, else: :*)}
      end)
      |> Enum.into(edges)
      |> Map.put({new_group_id, new_group_id}, :+)

    nodes = [new_group_id | mtx.nodes -- cycle]

    %__MODULE__{nodes: nodes, edges: edges, groups: groups, n_groups: mtx.n_groups + 1}
  end

  def find_cycle(%__MODULE__{nodes: [n | _], edges: _edges} = mtx) do
    find_cycle(mtx, [[n]])
  end

  defp find_cycle(mtx, paths) do
    next_paths = Enum.flat_map(paths, &expand_path(mtx, &1))

    case Enum.find(next_paths, :none, &contains_cycle?/1) do
      :none ->
        find_cycle(mtx, next_paths)

      path ->
        path |> get_cycle() |> tl() |> Enum.reverse()
    end
  end

  defp expand_path(mtx, [n | _] = path) do
    mtx.edges
    |> Map.keys()
    |> Enum.filter(fn {r, c} -> r != c and c == n end)
    |> Enum.map(fn {r, _c} -> [r | path] end)
  end

  defp get_cycle(path) do
    anchor =
      path
      |> Enum.frequencies()
      |> Enum.find(fn {_k, v} -> v == 2 end)
      |> elem(0)

    [from, to] =
      path
      |> Enum.with_index()
      |> Enum.filter(fn {x, _idx} -> x == anchor end)
      |> Enum.map(&elem(&1, 1))

    Enum.slice(path, Range.new(from, to))
  end

  defp contains_cycle?(path) do
    unique_nodes = Enum.uniq(path)
    length(unique_nodes) != length(path)
  end

  defp remove_nodes(nodes, edges) do
    edges
    |> Enum.filter(fn {{r, c}, _v} ->
      not (Enum.member?(nodes, r) or Enum.member?(nodes, c))
    end)
    |> Enum.into(%{})
  end

  @doc """
  Finds the root nodes in a DSM.

  Root nodes are the ones that no othe node has dependencies upon.
  """
  def roots(mtx), do: Enum.filter(mtx.nodes, &(count_ingoing(mtx, &1) == 0))

  @doc """
  Finds the leave nodes in a DSM.

  Leave nodes are the ones that no dependencies upon other nodes.
  """
  def leaves(mtx), do: Enum.filter(mtx.nodes, &(count_outgoing(mtx, &1) == 0))

  defp count_ingoing(mtx, node) do
    Enum.count(mtx.edges, fn {{r, _c}, v} -> r == node and v != :+ end)
  end

  defp count_outgoing(mtx, node) do
    Enum.count(mtx.edges, fn {{_r, c}, v} -> c == node and v != :+ end)
  end

  @doc """
  Gets a list of module groups with cyclic dependencies.
  """
  def cyclic_deps_groups(dsm) do
    coords = dsm |> DSM.Utils.to_coord_mtx() |> Map.keys()

    cyclic_groups =
      dsm
      |> DSM.Utils.upper_right_group_corners()
      |> Enum.map(fn {r, c} -> corner_to_group(coords, dsm.nodes, r, c) end)
      |> Enum.with_index(1)
      |> Enum.map(fn {group, idx} -> {"g" <> to_string(idx), group} end)
      |> Enum.into(%{})

    %{dsm | groups: cyclic_groups, n_groups: map_size(cyclic_groups)}
  end

  defp corner_to_group(mtx_coords, nodes, r, c) do
    mtx_coords
    |> Enum.filter(fn {mtx_row, mtx_col} ->
      mtx_col >= r and mtx_col <= c and mtx_col != mtx_row
    end)
    |> Enum.map(fn {_r, c} -> c end)
    |> Enum.uniq()
    |> Enum.map(fn column -> Enum.at(nodes, column) end)
    |> Enum.sort()
  end

  @doc """
  Gets the largest group of modules with cyclical dependencies from a
  triangularized DSM.

  If both arguments are maps, then it is assumed they are the DSM and the
  modules.

  Otherwise there are treated as the namespace and the database name, and will
  perfom an extra query to get the DSM and the modules.
  """
  def largest_cyclic_deps_group(a, b)

  def largest_cyclic_deps_group(dsm, mod_names) when is_map(dsm) and is_map(mod_names) do
    dsm =
      dsm
      |> triangularize()
      |> cyclic_deps_groups()

    dsm.groups
    |> Map.values()
    |> Enum.max_by(&length/1, fn -> [] end)
    |> group_to_names(mod_names)
  end

  def largest_cyclic_deps_group(app, db) do
    {_, dsm, mod_names} = gen_dsm(app, "*", db, true)
    largest_cyclic_deps_group(dsm, mod_names)
  end

  def group_to_names(group, mod_names) do
    Enum.map(group, fn mod -> Map.get(mod_names, mod) end)
  end

  def groups_to_names(groups, mod_names) do
    Enum.map(groups, &group_to_names(&1, mod_names))
  end
end