lib/dna/dna.ex

defmodule Bio.Sequence.Dna do
  @moduledoc """
  A module for working with DNA.

  This module doesn't contain a representative struct, as with `Bio.Sequence.Rna`.
  This is because there are multiple ways to interpret a string as DNA. Namely, it
  can either be single or double stranded. This is why the
  `Bio.Sequence.DnaStrand` and `Bio.Sequence.DnaDoubleStrand` modules exist.

  However, this is the interface for dealing with things like `complement/1` and
  `reverse_complement!/1`.

  Additionally, this module handles defining default conversions for the DNA
  sequence types into RNA sequence types (`Bio.Sequence.RnaStrand` and
  `Bio.Sequence.RnaDoubleStrand`). Conversions defined here are used by the
  `Bio.Sequence.DnaStrand` and `Bio.Sequence.DnaDoubleStrand` modules.

  The default conversions use conventional nucleotides and map them to their
  relevant RNA nucleotides:

  ```
  a -> a
  t -> u
  g -> g
  c -> c
  ```

  Casing is preserved, so mixed case sequences will not be altered.

  # Example

      iex>DnaStrand.new("taTTg")
      ...>|> Bio.Polymer.convert(RnaStrand)
      {:ok, %RnaStrand{sequence: "uaUUg", length: 5}}

  This is guaranteed, so you may encode these with intention and assume that
  they are preserved across conversions.
  """
  alias Bio.Sequence.{DnaStrand, RnaStrand, RnaDoubleStrand}
  alias Bio.Enum, as: Bnum
  alias Bio.Sequence.Alphabets.Dna, as: Alpha

  @type complementable :: struct() | String.t()
  @type msg :: atom()

  defmodule Conversions do
    @moduledoc false
    use Bio.Convertible do
      def to(RnaStrand), do: {:ok, &to_rna/2, 1}
      def to(RnaDoubleStrand), do: {:ok, &to_rna/2, 1}
    end

    defp to_rna({:error, err}, _) do
      {:error, err}
    end

    defp to_rna({:ok, knumeration, data}, module) do
      knumeration
      |> Enum.map(fn base ->
        case base do
          "A" -> "A"
          "T" -> "U"
          "G" -> "G"
          "C" -> "C"
          "a" -> "a"
          "t" -> "u"
          "g" -> "g"
          "c" -> "c"
        end
      end)
      |> Enum.join()
      |> new_sequence(data, module)
    end

    defp new_sequence(seq, data, module) do
      apply(module, :new, [seq, Map.to_list(data)])
    end
  end

  @doc """
  Provide the DNA complement to a sequence.

  Given a sequence that is either a binary or a `Bio.Sequence.DnaStrand`,
  returns the DNA complement as defined by the standard nucleotide complements.

  # Examples
      iex>Dna.complement("attgacgt")
      {:ok, "taactgca"}

      iex>DnaStrand.new("attgacgt")
      ...>|> Dna.complement()
      {:ok, %DnaStrand{sequence: "taactgca", length: 8, alphabet: Bio.Sequence.Alphabets.Dna.common()}}
  """
  @spec complement(complementable) ::
          complementable | {:error, [{atom(), String.t(), integer(), String.t()}]}
  def complement(sequence, opts \\ [])

  def complement(%DnaStrand{alphabet: alpha} = sequence, opts) do
    alphabet = get_alpha({alpha, Keyword.get(opts, :alphabet)})

    comps =
      sequence
      |> Enum.with_index()
      |> Enum.map(fn {base, index} ->
        {Alpha.complement(base, alphabet), index}
      end)

    cond do
      Enum.any?(comps, fn {{status, _}, _} -> status == :error end) ->
        {:error,
         Enum.reduce(comps, [], fn {{status, result}, index}, acc ->
           case status do
             :ok ->
               acc

             :error ->
               {_, char, alpha} = result
               List.insert_at(acc, -1, {:mismatch_alpha, char, index, alpha})
           end
         end)}

      true ->
        {:ok,
         Enum.reduce(comps, "", fn {{_, result}, _}, acc ->
           acc <> result
         end)
         |> DnaStrand.new(alphabet: alphabet)}
    end
  end

  def complement(sequence, opts) when is_binary(sequence) do
    alphabet = get_alpha({nil, Keyword.get(opts, :alphabet)})

    comps =
      sequence
      |> String.graphemes()
      |> Enum.with_index()
      |> Enum.map(fn {base, index} ->
        {Alpha.complement(base, alphabet), index}
      end)

    cond do
      Enum.any?(comps, fn {{status, _}, _} -> status == :error end) ->
        {:error,
         Enum.reduce(comps, [], fn {{status, result}, index}, acc ->
           case status do
             :ok ->
               acc

             :error ->
               {_, char, alpha} = result
               List.insert_at(acc, -1, {:mismatch_alpha, char, index, alpha})
           end
         end)}

      true ->
        {:ok,
         Enum.reduce(comps, "", fn {{_, result}, _}, acc ->
           acc <> result
         end)}
    end
  end

  @doc """
  Provide the DNA reverse complement to a sequence.

  Given a sequence that is either a binary or a `Bio.Sequence.DnaStrand`,
  returns the DNA reverse complement as defined by the standard nucleotide
  complements.

  # Examples
      iex>Dna.reverse_complement!("attgacgt")
      "acgtcaat"

      iex>DnaStrand.new("attgacgt")
      ...>|> Dna.reverse_complement!()
      %DnaStrand{sequence: "acgtcaat", length: 8}
  """
  @spec reverse_complement!(sequence :: complementable) :: complementable
  def reverse_complement!(sequence, opts \\ [])

  def reverse_complement!(%DnaStrand{alphabet: alpha} = sequence, opts) do
    alphabet = get_alpha({alpha, Keyword.get(opts, :alphabet)})

    sequence
    |> Bnum.map(fn base ->
      {:ok, complementary} = Alpha.complement(base, alphabet)
      complementary
    end)
    |> Bnum.reverse()
  end

  def reverse_complement!(sequence, opts) when is_binary(sequence) do
    alphabet = get_alpha({nil, Keyword.get(opts, :alphabet)})

    sequence
    |> String.graphemes()
    |> Enum.map(fn base ->
      {:ok, complementary} = Alpha.complement(base, alphabet)
      complementary
    end)
    |> Enum.reverse()
    |> Enum.join()
  end

  defp get_alpha(opts) do
    case opts do
      {nil, nil} -> Alpha.common()
      {nil, opted} -> opted
      {built, nil} -> built
      {_built, opted} -> opted
    end
  end
end