lib/analyzer_module.ex

# Copyright (C) 2020 by the Georgia Tech Research Institute (GTRI)
# This software may be modified and distributed under the terms of
# the BSD 3-Clause license. See the LICENSE file for details.

defmodule AnalyzerModule do
  @moduledoc """
  Analyzer takes in a valid repo URL and coordinates the analysis,
  returning a simple JSON report.  The URL scheme can be one of "https",
  "http", or "file".  Note that the latter scheme will only work on an
  existing clone and won't remove the directory structure upon completion
  of the analysis.
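
  A minimal usage sketch (the repo URL is illustrative):
    ```
    {:ok, report} = AnalyzerModule.analyze("https://github.com/kitplummer/xmpp4rails", "iex", %{})
    report[:data][:risk]
    ```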
  """
  require Logger

  @spec analyze(binary | maybe_improper_list, any, any) :: {:ok, map}
  def analyze(url, source, options) when is_binary(url) do
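    # Optional progress counter, set by the multi-repo analyze/4 below and
    # used only in the "Cloned" log line.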
    count =
      if Map.has_key?(options, :counter) do
        CounterAgent.click(options[:counter])
        CounterAgent.get(options[:counter])
      end

    Temp.track!()
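    # Track temp artifacts so the `after` clause's Temp.cleanup/0 removes
    # anything created during this run.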

    start_time = DateTime.utc_now()

    # Return summary report as JSON.
    # Workaround to allow `mix analyze` to work even when the :application
    # info isn't available (e.g. the app isn't loaded/started).
    library_version =
      if :application.get_application() != :undefined,
        do: elem(:application.get_key(:lowendinsight, :vsn), 1) |> List.to_string(),
        else: ""

    try do
      url = URI.decode(url)

      # Prevent a clone if configuration isn't found, forces an ArgumentError
      # "could not fetch application environment :critical_contributor_level
      #  for application :lowendinsight because the application was not loaded/started.
      #  If your application depends on :lowendinsight at runtime, make sure to
      #  load/start it or list it under :extra_applications in your mix.exs file"
      # _config = Application.fetch_env!(:lowendinsight, :critical_contributor_level)

      uri = URI.parse(url)

      {:ok, repo} =
        cond do
          uri.scheme == "file" ->
            GitModule.get_repo(uri.path)

          uri.scheme == "https" or uri.scheme == "http" or uri.scheme == "git+https" or
              uri.scheme == "git" ->
            url = Helpers.remove_git_prefix(url)

            # if Helpers.count_forward_slashes(url) > 4 do
            #   Logger.error("Not a Git repo URL, is a subdirectory")
            #   raise ArgumentError, message: "Not a Git repo URL, is a subdirectory"
            # end

            tmp =
              Temp.mkdir(%{
                prefix: "lei",
                # get_env (not fetch_env!) so the "/tmp" default applies when
                # :base_temp_dir isn't configured
                basedir: Application.get_env(:lowendinsight, :base_temp_dir) || "/tmp"
              })

            tmp_path =
              case tmp do
                {:ok, tmp_path} ->
                  tmp_path

                {:error, :enospc} ->
                  raise ArgumentError, message: "Failed to create a temp path for clone, no space"

                {:error, :enoent} ->
                  raise ArgumentError, message: "Failed to create a temp path for clone"
              end

            GitModule.clone_repo(url, tmp_path)

          true ->
            # raise ArgumentError, message: "Not a public Git repo URL"
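            # A bare {:ok} fails the {:ok, repo} match above; the resulting
            # MatchError is rescued below and reported as an invalid repo URL.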
            {:ok}
        end

      Logger.info("Cloned -> #{count}: #{url}")

      # Get Files analysis
      files_analysis = Lowendinsight.Files.analyze_files(repo.path)

      # Get SBOM risk
      sbom_risk =
        if SbomModule.has_sbom?(repo) do
          "low"
        else
          {:ok, sbom_risk_level} = RiskLogic.sbom_risk()
          sbom_risk_level
        end

      # Get unique contributors count
      {:ok, count} = GitModule.get_contributor_count(repo)

      # Get risk rating for count
      {:ok, count_risk} = RiskLogic.contributor_risk(count)

      # Get last commit in weeks
      {:ok, date} = GitModule.get_last_commit_date(repo)
      weeks = TimeHelper.get_commit_delta(date) |> TimeHelper.sec_to_weeks()

      # Get risk rating for last commit
      {:ok, delta_risk} = RiskLogic.commit_currency_risk(weeks)

      # Get risk rating for size of last commit
      {:ok, lines_percent, _file_percent} = GitModule.get_recent_changes(repo)
      {:ok, changes_risk} = RiskLogic.commit_change_size_risk(lines_percent)

      # get risk rating for number of contributors with over a certain percentage of commits
      {:ok, num_filtered_contributors, functional_contributors} =
        GitModule.get_functional_contributors(repo)

      {:ok, filtered_contributors_risk} =
        RiskLogic.functional_contributors_risk(num_filtered_contributors)

      {:ok, top10_contributors} = GitModule.get_top10_contributors_map(repo)

      project_types_identified =
        if Map.get(options, :types) == true do
          ProjectIdent.get_project_types_identified(repo)
        else
          []
        end

      {:ok, repo_size} = GitModule.get_repo_size(repo)
      {:ok, git_hash} = GitModule.get_hash(repo)
      {:ok, default_branch} = GitModule.get_default_branch(repo)
      {:ok, last_commit} = GitModule.get_last_commit_date(repo)
      {:ok, total_commits} = GitModule.get_total_commit_count(repo)

      # The "file" scheme analyzes an existing clone in place; only delete
      # repos this run cloned itself (https, http, git+https, git).
      if uri.scheme != "file" do
        GitModule.delete_repo(repo)
      end

      end_time = DateTime.utc_now()
      duration = DateTime.diff(end_time, start_time)

      config =
        if Application.get_all_env(:lowendinsight) == [],
          do: %{info: "no config loaded, defaults in use"},
          else: Application.get_all_env(:lowendinsight)

      report = %{
        header: %{
          repo: url,
          start_time: DateTime.to_iso8601(start_time),
          end_time: DateTime.to_iso8601(end_time),
          duration: duration,
          uuid: UUID.uuid1(),
          source_client: source,
          library_version: library_version
        },
        data: %{
          config: Helpers.convert_config_to_list(config),
          repo: url,
          files: files_analysis,
          git: %{
            hash: git_hash,
            default_branch: default_branch,
            last_commit_date: last_commit,
            total_commits_on_default_branch: total_commits
          },
          project_types: project_types_identified,
          repo_size: repo_size,
          results: %{
            contributor_count: count,
            contributor_risk: count_risk,
            commit_currency_weeks: weeks,
            commit_currency_risk: delta_risk,
            large_recent_commit_risk: changes_risk,
            recent_commit_size_in_percent_of_codebase: lines_percent,
            functional_contributors_risk: filtered_contributors_risk,
            functional_contributors: num_filtered_contributors,
            functional_contributor_names: functional_contributors,
            top10_contributors: top10_contributors,
            sbom_risk: sbom_risk
          }
        }
      }

      # The `after` clause below handles Temp.cleanup/0.
      {:ok, determine_toplevel_risk(report)}
    rescue
      MatchError ->
        end_time = DateTime.utc_now()
        duration = DateTime.diff(end_time, start_time)

        {:ok,
         %{
           header: %{
             repo: url,
             start_time: DateTime.to_iso8601(start_time),
             end_time: DateTime.to_iso8601(end_time),
             duration: duration,
             uuid: UUID.uuid1(),
             source_client: source,
             library_version: library_version
           },
           data: %{
             # config: Helpers.convert_config_to_list(Application.get_all_env(:lowendinsight)),
             error: "Unable to analyze the repo (#{url}), is this a valid Git repo URL?",
             repo: url,
             git: %{},
             risk: "undetermined",
             project_types: %{"undetermined" => "undetermined"},
             repo_size: "undetermined"
           }
         }}

      e in ArgumentError ->
        end_time = DateTime.utc_now()
        duration = DateTime.diff(end_time, start_time)

        {:ok,
         %{
           header: %{
             repo: url,
             start_time: DateTime.to_iso8601(start_time),
             end_time: DateTime.to_iso8601(end_time),
             duration: duration,
             uuid: UUID.uuid1(),
             source_client: source,
             library_version: library_version
           },
           data: %{
             # config: Helpers.convert_config_to_list(Application.get_all_env(:lowendinsight)),
             error: "Unable to analyze the repo (#{url}). #{e.message}",
             repo: url,
             git: %{},
             risk: "undetermined",
             project_types: %{"undetermined" => "undetermined"},
             repo_size: "undetermined"
           }
         }}
    after
      Temp.cleanup()
    end
  end

  @doc """
  analyze/4: returns the LowEndInsight report as JSON for multiple repos.
  Takes in a list of URLs, a source id for the calling client, and an
  optional start_time to capture when analysis actually began at the
  client (e.g. an async API); source, start_time, and options all have
  defaults.

  Returns {:ok, map}.

  ## Examples
    ```
    iex> {:ok, report} = AnalyzerModule.analyze(["https://github.com/kitplummer/xmpp4rails","https://github.com/kitplummer/lita-cron"], "iex")
    iex> _count = report[:metadata][:repo_count]
    2
    ```
  """
  # @defaults %{start_time: DateTime.utc_now()}
  @spec analyze([binary], any, any, any) :: {:ok, map}
  def analyze(urls, source \\ "lei", start_time \\ DateTime.utc_now(), options \\ %{})
      when is_list(urls) do
    ## Concurrency for parallelizing the analysis. This is the magic.
    ## Runs up to :jobs_per_core_max jobs per scheduler (defaults to 1)...

    {:ok, counter} = CounterAgent.new()
    options = Map.put(options, :counter, counter)

    max_concurrency =
      System.schedulers_online() *
        (Application.get_env(:lowendinsight, :jobs_per_core_max) || 1)
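    # e.g. `config :lowendinsight, jobs_per_core_max: 2` allows two
    # concurrent analyses per scheduler.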

    l =
      urls
      |> Task.async_stream(__MODULE__, :analyze, [source, options],
        timeout: :infinity,
        max_concurrency: max_concurrency
      )
      |> Enum.map(fn {:ok, report} -> elem(report, 1) end)

    report = %{
      state: "complete",
      report: %{uuid: UUID.uuid1(), repos: l},
      metadata: %{repo_count: length(l)}
    }

    report = determine_risk_counts(report)
    end_time = DateTime.utc_now()
    duration = DateTime.diff(end_time, start_time)

    times = %{
      start_time: DateTime.to_iso8601(start_time),
      end_time: DateTime.to_iso8601(end_time),
      duration: duration
    }

    metadata = Map.put_new(report[:metadata], :times, times)
    report = report |> Map.put(:metadata, metadata)

    {:ok, report}
  end

  @doc """
  create_empty_report/3: takes in a uuid, a list of urls, and a start time,
  and produces the repo report object to be returned immediately to
  asynchronous requestors (e.g. the LowEndInsight-Get HTTP endpoint).
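
  ## Examples
  The initial, incomplete report shape (placeholder uuid shown):
    ```
    iex> report = AnalyzerModule.create_empty_report("00000000-0000-0000-0000-000000000000", ["https://github.com/kitplummer/xmpp4rails"])
    iex> report[:state]
    "incomplete"
    ```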
  """
  @spec create_empty_report(String.t(), [String.t()], any) :: map
  def create_empty_report(uuid, urls, start_time \\ DateTime.utc_now()) do
    %{
      :metadata => %{
        :times => %{
          :duration => 0,
          :start_time => DateTime.to_iso8601(start_time),
          :end_time => ""
        }
      },
      :uuid => uuid,
      :state => "incomplete",
      :report => %{
        :repos => urls |> Enum.map(fn url -> %{:data => %{:repo => url}} end)
      }
    }
  end

  @doc """
  determine_risk_counts/1: takes in a full report of n repo reports and
  calculates the number of each risk rating across those repos.  It returns
  a new report with the risk_counts object populated with the count table.
  It has to accommodate both atom and string elements, because cached
  reports are stored as JSON, which parses back into the string form.
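
  For example, a three-repo report rated one "critical" and two "low"
  would gain (a sketch of the added metadata):
    ```
    metadata: %{risk_counts: %{"critical" => 1, "low" => 2}}
    ```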
  """
  @spec determine_risk_counts(RepoReport.t()) :: map
  def determine_risk_counts(report) do
    count_map =
      report[:report][:repos]
      |> Enum.map(fn repo -> repo.data.risk end)
      |> Enum.reduce(%{}, fn x, acc -> Map.update(acc, x, 1, &(&1 + 1)) end)

    metadata = Map.put_new(report[:metadata], :risk_counts, count_map)
    report |> Map.put(:metadata, metadata)
  end

  @doc """
  determine_toplevel_risk/1: takes in a report and determines the highest
  criticality, and assigns it to the "risk" element for the repo report.
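
  For example, given a minimal report shape:
    ```
    iex> report = %{header: %{}, data: %{results: %{contributor_risk: "high", sbom_risk: "low"}}}
    iex> AnalyzerModule.determine_toplevel_risk(report)[:data][:risk]
    "high"
    ```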
  """
  @spec determine_toplevel_risk(RepoReport.t()) :: map
  def determine_toplevel_risk(report) do
    values = Map.values(report[:data][:results])

    risk =
      cond do
        Enum.member?(values, "critical") -> "critical"
        Enum.member?(values, "high") -> "high"
        Enum.member?(values, "medium") -> "medium"
        true -> "low"
      end

    data = Map.put_new(report[:data], :risk, risk)
    Map.put(report, :data, data)
  end
end