lib/alphabets.ex

# TODO: add some docs about complementation
defmodule Bio.Sequence.Alphabets do
  @moduledoc """
  Alphabets relevant to the sequences, coding schemes are expressed in
  essentially [BNF](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form).
  Values and interpretations for the scheme were accessed from
  [here](https://www.insdc.org/submitting-standards/feature-table/).

  Also exposes the complementary elements for DNA/RNA, allowing strands to be
  complemented. These functions are not meant to be called directly; see
  `Bio.Sequence.Dna.complement/2` and `Bio.Sequence.Rna.complement/1` for more
  information.

  Alphabets may be used in the declaration of `Bio.BaseSequence` structs to
  define how they should be validated. In case one is not supplied, a default
  may be preferred. See `Bio.Sequence.Dna`, `Bio.Sequence.Rna`,
  `Bio.Sequence.AminoAcid`, and `Bio.Polymer.valid?/2` for more information.

  - `Bio.Sequence.Dna`
    The DNA alphabets provided are:
    - `common` - The standard bases `ATGCatgc`
    - `with_n` - The standard alphabet, but with the ambiguous "any" character
    `Nn`
    - `iupac` - The IUPAC standard values `ACGTRYSWKMBDHVNacgtryswkmbdhvn`

  - `Bio.Sequence.Rna`
    - `common` - The standard bases `ACGUacgu`
    - `with_n` - The standard alphabet, but with the ambiguous "any" character
    `Nn`
    - `iupac` - The IUPAC standard values `ACGURYSWKMBDHVNacguryswkmbdhvn`

  - `Bio.Sequence.AminoAcid`
    - `common` - The standard 20 amino acid codes `ARNDCEQGHILKMFPSTWYVarndceqghilkmfpstwyv`
    - `iupac` - `ABCDEFGHJIKLMNPQRSTVWXYZabcdefghjiklmnpqrstvwxyz`

  # Coding Schemes
  ## Deoxyribonucleic Acid codes
  ```
  A ::= Adenine
  C ::= Cytosine
  G ::= Guanine
  T ::= Thymine

  R ::= A | G
  Y ::= C | T
  S ::= G | C
  W ::= A | T
  K ::= G | T
  M ::= A | C

  B ::= S | T (¬A)
  D ::= R | T (¬C)
  H ::= M | T (¬G)
  V ::= M | G (¬T)
  N ::= ANY
  ```

  ## Ribonucleic Acid codes
  ```
  A ::= Adenine
  C ::= Cytosine
  G ::= Guanine
  U ::= Uracil

  R ::= A | G
  Y ::= C | U
  S ::= G | C
  W ::= A | U
  K ::= G | U
  M ::= A | C

  B ::= S | U (¬A)
  D ::= R | U (¬C)
  H ::= M | U (¬G)
  V ::= M | G (¬U)
  N ::= ANY
  ```

  ## Amino Acid codes
  ```
  A ::= Alanine
  C ::= Cysteine
  D ::= Aspartic Acid
  E ::= Glutamic Acid
  F ::= Phenylalanine
  G ::= Glycine
  H ::= Histidine
  I ::= Isoleucine
  K ::= Lysine
  L ::= Leucine
  M ::= Methionine
  N ::= Asparagine
  P ::= Proline
  Q ::= Glutamine
  R ::= Arginine
  S ::= Serine
  T ::= Threonine
  V ::= Valine
  W ::= Tryptophan
  Y ::= Tyrosine

  B ::= D | N
  Z ::= Q | E
  J ::= I | L
  X ::=  ANY
  ```
  """
  defmodule Dna do
    @moduledoc """
    DNA alphabets, together with the per-base complement tables for each one.
    """

    @common "ATGCatgc"
    @with_n "ACGTNacgtn"
    @iupac "ACGTRYSWKMBDHVNacgtryswkmbdhvn"

    # Complement pairs for the four standard bases, in both cases.
    @common_complement %{
      "A" => "T",
      "T" => "A",
      "G" => "C",
      "C" => "G",
      "a" => "t",
      "t" => "a",
      "g" => "c",
      "c" => "g"
    }

    # "N" stands for any base, so it maps onto itself.
    @with_n_complement Map.merge(@common_complement, %{"N" => "N", "n" => "n"})

    # Complements of the remaining ambiguity codes, in both cases.
    @ambiguity_complement %{
      "R" => "Y",
      "Y" => "R",
      "K" => "M",
      "M" => "K",
      "S" => "S",
      "W" => "W",
      "B" => "V",
      "V" => "B",
      "D" => "H",
      "H" => "D",
      "r" => "y",
      "y" => "r",
      "k" => "m",
      "m" => "k",
      "s" => "s",
      "w" => "w",
      "b" => "v",
      "v" => "b",
      "d" => "h",
      "h" => "d"
    }

    @iupac_complement Map.merge(@with_n_complement, @ambiguity_complement)

    @doc """
    Returns the alphabet of the four standard bases in both cases: `#{@common}`.
    """
    @spec common() :: String.t()
    def common, do: @common

    @doc """
    Returns the standard alphabet extended with the "any" code: `#{@with_n}`.
    """
    @spec with_n() :: String.t()
    def with_n, do: @with_n

    @doc """
    Returns the alphabet including the ambiguity codes: `#{@iupac}`.
    """
    @spec iupac() :: String.t()
    def iupac, do: @iupac

    @doc """
    Looks up the complement of a single `base` within the alphabet `alpha`.

    `alpha` must be one of the `Bio.Sequence.Alphabets.Dna` alphabets, i.e. the
    value returned by `common/0`, `with_n/0` or `iupac/0`.

    Returns `{:ok, complement}` when `base` has an entry in the complement table
    of that alphabet, and `{:error, {:unknown_code, base, alpha}}` when it does
    not.
    """
    @spec complement(String.t(), String.t()) ::
            {:error, {:unknown_code, String.t(), String.t()}} | {:ok, String.t()}
    def complement(base, alpha)

    def complement(base, @common), do: lookup(@common_complement, base, @common)

    def complement(base, @with_n), do: lookup(@with_n_complement, base, @with_n)

    def complement(base, @iupac), do: lookup(@iupac_complement, base, @iupac)

    # Fetches `base` from `table`, tagging a miss with the offending code and
    # the alphabet it was checked against.
    defp lookup(table, base, alpha) do
      case Map.fetch(table, base) do
        {:ok, _paired} = found -> found
        :error -> {:error, {:unknown_code, base, alpha}}
      end
    end
  end

  defmodule Rna do
    @moduledoc """
    RNA alphabets, together with the per-base complement tables for each one.
    """
    @common "ACGUacgu"
    @with_n "ACGUNacgun"
    # "Z" is not an IUPAC nucleotide code and has no entry in
    # @iupac_complement, so it must not be part of this alphabet: every
    # character listed here can be complemented.
    @iupac "ACGURYSWKMBDHVNacguryswkmbdhvn"

    # Complement pairs for the four standard bases, in both cases.
    @common_complement %{
      "a" => "u",
      "A" => "U",
      "u" => "a",
      "U" => "A",
      "g" => "c",
      "G" => "C",
      "c" => "g",
      "C" => "G"
    }
    # "N" stands for any base, so it maps onto itself.
    @with_n_complement Map.merge(@common_complement, %{"N" => "N", "n" => "n"})
    # Adds the complements of the remaining ambiguity codes, in both cases.
    @iupac_complement Map.merge(@with_n_complement, %{
                        "R" => "Y",
                        "Y" => "R",
                        "W" => "W",
                        "S" => "S",
                        "K" => "M",
                        "M" => "K",
                        "D" => "H",
                        "V" => "B",
                        "H" => "D",
                        "B" => "V",
                        "r" => "y",
                        "y" => "r",
                        "w" => "w",
                        "s" => "s",
                        "k" => "m",
                        "m" => "k",
                        "d" => "h",
                        "v" => "b",
                        "h" => "d",
                        "b" => "v"
                      })

    @doc """
    Returns the alphabet of the four standard bases in both cases: `#{@common}`.
    """
    @spec common() :: String.t()
    def common, do: @common

    @doc """
    Returns the standard alphabet extended with the "any" code: `#{@with_n}`.
    """
    @spec with_n() :: String.t()
    def with_n, do: @with_n

    @doc """
    Returns the alphabet including the ambiguity codes: `#{@iupac}`.
    """
    @spec iupac() :: String.t()
    def iupac, do: @iupac

    @doc """
    Complements a given character according to the supplied alphabet.

    Alphabet must be one of the valid `Bio.Sequence.Alphabets.Rna` options, i.e.
    the value returned by `common/0`, `with_n/0` or `iupac/0`. Any other value
    matches no clause and raises `FunctionClauseError`.

    Returns `{:ok, complement}` when `base` has an entry in the complement table
    of that alphabet, and `{:error, {:unknown_code, base, alpha}}` when it does
    not.
    """
    @spec complement(String.t(), String.t()) ::
            {:error, {:unknown_code, String.t(), String.t()}} | {:ok, String.t()}
    def complement(base, alpha)

    def complement(base, @common) do
      gets(base, @common_complement, @common)
    end

    def complement(base, @with_n) do
      gets(base, @with_n_complement, @with_n)
    end

    def complement(base, @iupac) do
      gets(base, @iupac_complement, @iupac)
    end

    # Looks `base` up in `map`; a miss is reported together with the alphabet
    # the base was checked against.
    defp gets(base, map, alpha) do
      case Map.get(map, base) do
        nil -> {:error, {:unknown_code, base, alpha}}
        char -> {:ok, char}
      end
    end
  end

  defmodule AminoAcid do
    @moduledoc """
    Amino acid alphabets.
    """
    @common "ARNDCEQGHILKMFPSTWYVarndceqghilkmfpstwyv"
    # The lowercase half mirrors the uppercase half exactly, including "j"
    # (the I | L ambiguity code), so both cases of every code are accepted.
    @iupac "ABCDEFGHJIKLMNPQRSTVWXYZabcdefghjiklmnpqrstvwxyz"

    @doc """
    Returns the 20 standard amino acid codes in both cases: `#{@common}`.
    """
    @spec common() :: String.t()
    def common, do: @common

    @doc """
    Returns the standard codes plus the ambiguity codes `B`, `J`, `X` and `Z`
    in both cases: `#{@iupac}`.
    """
    @spec iupac() :: String.t()
    def iupac, do: @iupac
  end
end