diff --git a/.travis/script.sh b/.travis/script.sh
index bb65b45c006..338bf2e0bf8 100755
--- a/.travis/script.sh
+++ b/.travis/script.sh
@@ -37,7 +37,8 @@ git diff --exit-code
# check auto-gen files up-to-date
python onnx/defs/gen_doc.py
-python onnx/gen_proto.py
+python onnx/gen_proto.py -l
+python onnx/gen_proto.py -l --ml
python onnx/backend/test/stat_coverage.py
backend-test-tools generate-data
git status
diff --git a/appveyor.yml b/appveyor.yml
index 5fee9eb7e6f..174e8d8cd31 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -63,7 +63,8 @@ build_script:
- cmd: pip install %_wheel%
- cmd: pytest
- cmd: python onnx/defs/gen_doc.py
-- cmd: python onnx/gen_proto.py
+- cmd: python onnx/gen_proto.py -l
+- cmd: python onnx/gen_proto.py -l --ml
# Run type checks
- cmd: pip uninstall -y %_wheel%
- cmd: rm -rf .setuptools-cmake-build
diff --git a/docs/Changelog.md b/docs/Changelog.md
index d1ef66fe776..8477a130d1c 100644
--- a/docs/Changelog.md
+++ b/docs/Changelog.md
@@ -14044,8 +14044,12 @@ This version of the operator has been available since version 12 of the default
### **Celu-12**
Continuously Differentiable Exponential Linear Units:
- Perform the linear unit element-wise on the input tensor X
- using formula:
``` max(0,x) + min(0,alpha*(exp(x/alpha)−1)) ```
+ Perform the linear unit element-wise on the input tensor X
+ using formula:
+
+ ```
+ max(0,x) + min(0,alpha*(exp(x/alpha)-1))
+ ```
#### Version
@@ -14526,3 +14530,292 @@ This version of the operator has been available since version 12 of the default
Constrain input and output types to high-precision and 8 bit numeric tensors.
+# ai.onnx.training
+## Version 1 of the 'ai.onnx.training' operator set
+### **ai.onnx.training.Gradient-1**
+
+ Gradient operator computes the partial derivatives of a specific tensor w.r.t.
+ some other tensors. This operator is widely used in gradient-based training
+ algorithms. To illustrate its use, let's consider a computation graph,
+
+ ```
+ X -----.
+ |
+ v
+ W --> Conv --> H --> Gemm --> Y
+ ^
+ |
+ Z
+ ```
+
+  where W and Z are trainable tensors. Note that operators' attributes are
+ omitted for the sake of simplicity. Let dY/dW (dY/dZ) be the gradient of
+  Y with respect to W (Z). The user can compute gradients by inserting a Gradient
+  operator to form the graph shown below.
+
+ ```
+ W --> Conv --> H --> Gemm --> Y
+ | ^ ^
+ | | |
+ | X Z
+ | | |
+ | | .----------'
+ | | | (W/Z/X is the 1st/2nd/3rd input of Gradient as shown in
+ | | | "xs" followed by "zs")
+ | v v
+ '---> Gradient(xs=["W", "Z"], zs=["X"], y="Y")
+ | |
+ | '-----------------------------------> dY/dW (1st output of Gradient)
+ |
+ '---------------------------------------> dY/dZ (2nd output of Gradient)
+ ```
+
+ By definition, the tensor "y" is a function of independent variables in "xs"
+ and "zs". Since we only compute the gradient of "y" w.r.t. the differentiable
+ variables in "xs", this Gradient only outputs dY/dW and dY/dZ. Note that "H"
+ cannot appear in "xs" and "zs". The reason is that "H" can be determined by
+ tensors "W" and "X" and therefore "H" is not an independent variable.
+
+  All outputs are optional. For example, the user can assign an empty string to
+  the 1st output name of that Gradient node to skip the generation of dY/dW.
+ Note that the concept of optional outputs can also be found in ONNX's RNN, GRU,
+ and LSTM.
+
+  The Gradient operator can also compute derivatives with respect to intermediate
+  tensors. For example, the gradient of Y with respect to H can be computed via
+
+ ```
+ W --> Conv --> H --> Gemm --> Y
+ ^ | ^
+ | | |
+ X | Z
+ .-------' |
+ | .----------'
+ | | (H/Z is the 1st/2nd input of Gradient as shown in "xs")
+ v v
+ Gradient(xs=["H", "Z"], y="Y")
+ | |
+ | '-----------------------------------> dY/dH (1st output of Gradient)
+ |
+ '---------------------------------------> dY/dZ (2nd output of Gradient)
+ ```
+
+ It is possible to represent high-order differentiation using Gradient operators.
+ For example, given the following linear model:
+
+ ```
+ W --> Gemm --> Y --> Loss --> O
+ ^ ^
+ | |
+ X L
+ ```
+
+ To compute the 2nd order derivative of O with respect to W (denoted by
+ d^2O/dW^2), one can do
+
+ ```
+ W --> Gemm --> Y --> Loss --> O
+ | ^ ^
+ | | |
+ | X .------------L
+ | | | |
+ | | | v
+ +------+-+> Gradient(xs=["X", "W"], zs=["L"], y="O") ---> dO/dX (1st output of Gradient)
+ | | | |
+ | | | '---> dO/dW (2nd output of Gradient)
+ | v v
+  '---> Gradient(xs=["X", "W"], zs=["L"], y="dO/dW") ---> d(dO/dW)/dX (1st output of
+ | Gradient)
+ |
+ |
+ '---> d^2O/dW^2 (2nd output of Gradient)
+ ```
+
+ The tensors named in attributes "xs", "zs", and "y" define the differentiated
+  computation graph, and the inputs to the Gradient node define the values at
+ which the gradient is computed. We can feed different tensors to the identified
+ graph. For example, one can compute the gradient of Y with respect to H at
+ a specific value of H, H_1, by providing that value as an input to the Gradient
+ node.
+
+ ```
+ W --> Conv --> H --> Gemm --> Y
+ ^ ^
+ | |
+ X Z
+
+ Z_1 (2nd input of Gradient)
+ |
+ v
+ H_1 --> Gradient(xs=["H", "Z"], y="Y") ---> dY/dH when H = H_1 and Y = Y_1.
+ |
+ '------------------------------> dY/dZ (2nd output of Gradient)
+ ```
+
+ When the inputs of Gradient are the tensors named in "xs" and "zs", the
+  computation can be optimized. More specifically, intermediate variables in the
+ forward pass can be reused if the gradient is computed via reverse-mode
+ auto-differentiation.
+
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'ai.onnx.training' operator set.
+
+#### Attributes
+
+
+- xs : list of strings (required)
+- Input tensor names of the differentiated sub-graph. It contains only the necessary differentiated inputs of a (sub-)graph. Variables (usually called intermediate variables) that can be generated from inputs cannot be included in this attribute.
+- y : string (required)
+- The targeted tensor. It can be viewed as the output of the differentiated function. The attribute "xs" and attribute "zs" are the minimal independent variable set that determines the value of "y".
+- zs : list of strings
+- Input tensor names of the differentiated sub-graph. It contains only the necessary non-differentiated inputs of a (sub-)graph. Variables (usually called intermediate variables) that can be generated from inputs cannot be included in this attribute.
+
+
+#### Inputs (1 - ∞)
+
+
+- Inputs (variadic, heterogeneous) : T1
+- The values fed into the graph identified by the attributes. The i-th input is the value of the i-th tensor specified in the concatenated list of the attribute "xs" and the attribute "zs". For example, if xs=["A", "B"] and zs=["C"], the first input is used as the value of symbol "A" and the 3rd input is substituted for all the occurrences of "C".
+
+
+#### Outputs (1 - ∞)
+
+
+- Outputs (variadic, heterogeneous) : T2
+- The gradient of the tensor specified by the attribute "y" with respect to each of the tensors specified in the attribute "xs". The i-th output is the gradient of "y" with respect to the i-th tensor specified in the attribute "xs".
+
+
+#### Type Constraints
+
+
+- T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+- Allow inputs to be any kind of tensor.
+- T2 : tensor(float16), tensor(float), tensor(double)
+- Allow outputs to be any kind of floating-point tensor.
+
+
+### **ai.onnx.training.GraphCall-1**
+
+ The GraphCall operator invokes a graph inside TrainingInfoProto's
+  algorithm field. The GraphCall inputs and outputs are bound to those of the
+ invoked graph by position. If a graph input has an initializer, that input
+ is considered optional. All graph outputs are optional.
+
+  Python syntax is used below to describe dictionaries and lists.
+
+ Assume that ModelProto's graph field has
+ - name: "MyInferenceGraph"
+ - input: ["X", "W", "Z"]
+ - initializer: [W]
+ - output: ["Y"]
+
+ as visualized below for inference.
+
+ ```
+ X -----.
+ |
+ v
+ W --> Conv --> H --> Gemm --> Y
+ ^
+ |
+ Z
+ ```
+
+ Assume that the training algorithm contains
+
+ - inputs: ["X_1", "Z_1", "C"]
+ - initializer: [T]
+ - outputs: ["W_new"]
+
+ with a dictionary
+
+ - update_binding: {"W": "W_new", "T": "T_new"}
+
+ Inside the training algorithm graph, one can invoke the inference
+  graph by adding a GraphCall node with
+
+  - inputs: ["X_1", "W", "Z_1"]
+ - outputs: ["Y_1"]
+  - an attribute graph_name="MyInferenceGraph".
+
+  The initializers in update_binding ("W" and "T" in this case) are
+  considered globally-visible and mutable variables, which
+  can be used as inputs of operators in the training graph.
+
+ An example training algorithm graph may look like
+
+ ```
+ .-------- W (a global and mutable variable from
+ | | the inference graph)
+ | |
+ | .-----'-----------.
+ | | |
+ | | v
+ | | .-- X_1 --> GraphCall(graph_name="MyInferenceGraph")
+ | | | | |
+ | | | | |
+ | | | Z_1 -----' |
+ | | | | V
+ | | | | Y_1 ---> Loss ---> O
+ | | | | ^
+ | | | | |
+ | | `--. | C
+ | | | | |
+ | | | | .----------------'
+ | | | | |
+ | | v v v
+ | `--> Gradient(xs=["W"], zs=["X_1", "Z_1", "C"], y="O")
+ | |
+ | v
+ | dO_dW (gradient of W) 1 (a scalar one)
+ | | |
+ | V v
+ | Div <--- T ------------> Add ---> T_new
+ | | (T is the number of training iterations.
+ | | T is also globally visible and mutable.)
+ | v
+ `-----> Sub ----> W_new
+ ```
+
+  where Loss is a dummy node that computes the objective function being minimized.
+
+ The variable "W" is an optional input in the called graph.
+ If the user omits it, the input list of GraphCall becomes ["X_1", "", "Z_1"].
+  In this case, from the view of the computation graph, the Conv operator invoked by
+  GraphCall may still be connected to the global "W" variable and therefore the
+  structure of the computation graph is unchanged.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'ai.onnx.training' operator set.
+
+#### Attributes
+
+
+- graph_name : string (required)
+- The invoked graph's name. The only allowed value is the name of the inference graph, which is stored in "ModelProto.graph.name" in the ONNX model format.
+
+
+#### Inputs (1 - ∞)
+
+
+- Inputs (variadic, heterogeneous) : T
+- Inputs fed to the invoked graph. The i-th input here goes to the i-th input of the invoked graph. To omit an optional input in this field, the user can drop it or use an empty string.
+
+
+#### Outputs (1 - ∞)
+
+
+- Outputs (variadic, heterogeneous) : T
+- The outputs generated by the called graph. Its i-th value is bound to the i-th output of the called graph. Similar to the inputs, all outputs are optional.
+
+
+#### Type Constraints
+
+
+- T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+- Allow inputs and outputs to be any kind of tensor.
+
+
diff --git a/docs/Operators.md b/docs/Operators.md
index 328dfd20cd6..858422fd763 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -165,6 +165,9 @@
* MeanVarianceNormalization
* NegativeLogLikelihoodLoss
* Range
+* ai.onnx.training
+ * ai.onnx.training.Gradient
+ * ai.onnx.training.GraphCall
## ai.onnx (default)
### **Abs**
@@ -2390,8 +2393,12 @@ expect(node, inputs=[x], outputs=[y],
### **Celu**
Continuously Differentiable Exponential Linear Units:
- Perform the linear unit element-wise on the input tensor X
- using formula:
``` max(0,x) + min(0,alpha*(exp(x/alpha)−1)) ```
+ Perform the linear unit element-wise on the input tensor X
+ using formula:
+
+ ```
+ max(0,x) + min(0,alpha*(exp(x/alpha)-1))
+ ```
#### Version
@@ -19945,3 +19952,397 @@ expect(node, inputs=[x, y], outputs=[z],
+## ai.onnx.training
+### **ai.onnx.training.Gradient**
+
+ Gradient operator computes the partial derivatives of a specific tensor w.r.t.
+ some other tensors. This operator is widely used in gradient-based training
+ algorithms. To illustrate its use, let's consider a computation graph,
+
+ ```
+ X -----.
+ |
+ v
+ W --> Conv --> H --> Gemm --> Y
+ ^
+ |
+ Z
+ ```
+
+  where W and Z are trainable tensors. Note that operators' attributes are
+ omitted for the sake of simplicity. Let dY/dW (dY/dZ) be the gradient of
+  Y with respect to W (Z). The user can compute gradients by inserting a Gradient
+  operator to form the graph shown below.
+
+ ```
+ W --> Conv --> H --> Gemm --> Y
+ | ^ ^
+ | | |
+ | X Z
+ | | |
+ | | .----------'
+ | | | (W/Z/X is the 1st/2nd/3rd input of Gradient as shown in
+ | | | "xs" followed by "zs")
+ | v v
+ '---> Gradient(xs=["W", "Z"], zs=["X"], y="Y")
+ | |
+ | '-----------------------------------> dY/dW (1st output of Gradient)
+ |
+ '---------------------------------------> dY/dZ (2nd output of Gradient)
+ ```
+
+ By definition, the tensor "y" is a function of independent variables in "xs"
+ and "zs". Since we only compute the gradient of "y" w.r.t. the differentiable
+ variables in "xs", this Gradient only outputs dY/dW and dY/dZ. Note that "H"
+ cannot appear in "xs" and "zs". The reason is that "H" can be determined by
+ tensors "W" and "X" and therefore "H" is not an independent variable.
+
+  All outputs are optional. For example, the user can assign an empty string to
+  the 1st output name of that Gradient node to skip the generation of dY/dW.
+ Note that the concept of optional outputs can also be found in ONNX's RNN, GRU,
+ and LSTM.
+
+  The Gradient operator can also compute derivatives with respect to intermediate
+  tensors. For example, the gradient of Y with respect to H can be computed via
+
+ ```
+ W --> Conv --> H --> Gemm --> Y
+ ^ | ^
+ | | |
+ X | Z
+ .-------' |
+ | .----------'
+ | | (H/Z is the 1st/2nd input of Gradient as shown in "xs")
+ v v
+ Gradient(xs=["H", "Z"], y="Y")
+ | |
+ | '-----------------------------------> dY/dH (1st output of Gradient)
+ |
+ '---------------------------------------> dY/dZ (2nd output of Gradient)
+ ```
+
+ It is possible to represent high-order differentiation using Gradient operators.
+ For example, given the following linear model:
+
+ ```
+ W --> Gemm --> Y --> Loss --> O
+ ^ ^
+ | |
+ X L
+ ```
+
+ To compute the 2nd order derivative of O with respect to W (denoted by
+ d^2O/dW^2), one can do
+
+ ```
+ W --> Gemm --> Y --> Loss --> O
+ | ^ ^
+ | | |
+ | X .------------L
+ | | | |
+ | | | v
+ +------+-+> Gradient(xs=["X", "W"], zs=["L"], y="O") ---> dO/dX (1st output of Gradient)
+ | | | |
+ | | | '---> dO/dW (2nd output of Gradient)
+ | v v
+  '---> Gradient(xs=["X", "W"], zs=["L"], y="dO/dW") ---> d(dO/dW)/dX (1st output of
+ | Gradient)
+ |
+ |
+ '---> d^2O/dW^2 (2nd output of Gradient)
+ ```
+
+ The tensors named in attributes "xs", "zs", and "y" define the differentiated
+  computation graph, and the inputs to the Gradient node define the values at
+ which the gradient is computed. We can feed different tensors to the identified
+ graph. For example, one can compute the gradient of Y with respect to H at
+ a specific value of H, H_1, by providing that value as an input to the Gradient
+ node.
+
+ ```
+ W --> Conv --> H --> Gemm --> Y
+ ^ ^
+ | |
+ X Z
+
+ Z_1 (2nd input of Gradient)
+ |
+ v
+ H_1 --> Gradient(xs=["H", "Z"], y="Y") ---> dY/dH when H = H_1 and Y = Y_1.
+ |
+ '------------------------------> dY/dZ (2nd output of Gradient)
+ ```
+
+ When the inputs of Gradient are the tensors named in "xs" and "zs", the
+  computation can be optimized. More specifically, intermediate variables in the
+ forward pass can be reused if the gradient is computed via reverse-mode
+ auto-differentiation.
+
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'ai.onnx.training' operator set.
+
+#### Attributes
+
+
+- xs : list of strings (required)
+- Input tensor names of the differentiated sub-graph. It contains only the necessary differentiated inputs of a (sub-)graph. Variables (usually called intermediate variables) that can be generated from inputs cannot be included in this attribute.
+- y : string (required)
+- The targeted tensor. It can be viewed as the output of the differentiated function. The attribute "xs" and attribute "zs" are the minimal independent variable set that determines the value of "y".
+- zs : list of strings
+- Input tensor names of the differentiated sub-graph. It contains only the necessary non-differentiated inputs of a (sub-)graph. Variables (usually called intermediate variables) that can be generated from inputs cannot be included in this attribute.
+
+
+#### Inputs (1 - ∞)
+
+
+- Inputs (variadic, heterogeneous) : T1
+- The values fed into the graph identified by the attributes. The i-th input is the value of the i-th tensor specified in the concatenated list of the attribute "xs" and the attribute "zs". For example, if xs=["A", "B"] and zs=["C"], the first input is used as the value of symbol "A" and the 3rd input is substituted for all the occurrences of "C".
+
+
+#### Outputs (1 - ∞)
+
+
+- Outputs (variadic, heterogeneous) : T2
+- The gradient of the tensor specified by the attribute "y" with respect to each of the tensors specified in the attribute "xs". The i-th output is the gradient of "y" with respect to the i-th tensor specified in the attribute "xs".
+
+
+#### Type Constraints
+
+
+- T1 : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+- Allow inputs to be any kind of tensor.
+- T2 : tensor(float16), tensor(float), tensor(double)
+- Allow outputs to be any kind of floating-point tensor.
+
+
+
+#### Examples
+
+
+gradient_scalar_add
+
+```python
+add_node = onnx.helper.make_node('Add',
+ ['a', 'b'], ['c'], name='my_add')
+gradient_node = onnx.helper.make_node(
+ 'Gradient', ['a', 'b'],
+ ['dc_da', 'dc_db'], name='my_gradient',
+ domain='ai.onnx.training',
+ xs=['a', 'b'], y='c')
+
+a = np.array(1.0).astype(np.float32)
+b = np.array(2.0).astype(np.float32)
+c = a + b
+# dc / da = d(a+b) / da = 1
+dc_da = np.array(1).astype(np.float32)
+# dc / db = d(a+b) / db = 1
+dc_db = np.array(1).astype(np.float32)
+
+graph = onnx.helper.make_graph(
+ nodes=[add_node, gradient_node],
+ name='GradientOfAdd',
+ inputs=[
+ onnx.helper.make_tensor_value_info('a', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('b', onnx.TensorProto.FLOAT,
+ [])],
+ outputs=[
+ onnx.helper.make_tensor_value_info('c', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('dc_da',
+ onnx.TensorProto.FLOAT, []),
+ onnx.helper.make_tensor_value_info('dc_db',
+ onnx.TensorProto.FLOAT, [])])
+opsets = [
+ onnx.helper.make_operatorsetid('', 12),
+ onnx.helper.make_operatorsetid('ai.onnx.training', 1)]
+model = onnx.helper.make_model(
+ graph,
+ producer_name='backend-test',
+ opset_imports=opsets)
+expect(model, inputs=[a, b], outputs=[c, dc_da, dc_db],
+ name='test_gradient_of_add')
+```
+
+
+
+
+
+gradient_scalar_add_and_mul
+
+```python
+add_node = onnx.helper.make_node('Add',
+ ['a', 'b'], ['c'], name='my_add')
+mul_node = onnx.helper.make_node('Mul',
+ ['c', 'a'], ['d'], name='my_mul')
+gradient_node = onnx.helper.make_node(
+ 'Gradient', ['a', 'b'],
+ ['dd_da', 'dd_db'], name='my_gradient',
+ domain='ai.onnx.training',
+ xs=['a', 'b'], y='d')
+
+a = np.array(1.0).astype(np.float32)
+b = np.array(2.0).astype(np.float32)
+c = a + b
+# d = a * c = a * (a + b)
+d = a * c
+# dd / da = d(a*a+a*b) / da = 2 * a + b
+dd_da = 2 * a + b
+# dd / db = d(a*a+a*b) / db = a
+dd_db = a
+
+graph = onnx.helper.make_graph(
+ nodes=[add_node, mul_node, gradient_node],
+ name='GradientOfTwoOperators',
+ inputs=[
+ onnx.helper.make_tensor_value_info('a', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('b', onnx.TensorProto.FLOAT,
+ [])],
+ outputs=[
+ onnx.helper.make_tensor_value_info('d', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('dd_da',
+ onnx.TensorProto.FLOAT, []),
+ onnx.helper.make_tensor_value_info('dd_db',
+ onnx.TensorProto.FLOAT, [])])
+
+opsets = [
+ onnx.helper.make_operatorsetid('', 12),
+ onnx.helper.make_operatorsetid('ai.onnx.training', 1)]
+model = onnx.helper.make_model(graph,
+ producer_name='backend-test',
+ opset_imports=opsets)
+expect(model, inputs=[a, b], outputs=[d, dd_da, dd_db],
+ name='test_gradient_of_add_and_mul')
+```
+
+
+
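+As a further illustrative sketch (hypothetical, not one of the generated test
+cases), the optional 1st output can be skipped by giving it an empty name, as
+described in the operator documentation above:
+
+```python
+import onnx.helper
+
+# Skip dc/da by assigning an empty string to the 1st output name;
+# only dc/db is produced by this Gradient node.
+gradient_node = onnx.helper.make_node(
+    'Gradient', ['a', 'b'], ['', 'dc_db'],
+    name='my_gradient',
+    domain='ai.onnx.training',
+    xs=['a', 'b'], y='c')
+```
+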
+
+### **ai.onnx.training.GraphCall**
+
+ The GraphCall operator invokes a graph inside TrainingInfoProto's
+  algorithm field. The GraphCall inputs and outputs are bound to those of the
+ invoked graph by position. If a graph input has an initializer, that input
+ is considered optional. All graph outputs are optional.
+
+  Python syntax is used below to describe dictionaries and lists.
+
+ Assume that ModelProto's graph field has
+ - name: "MyInferenceGraph"
+ - input: ["X", "W", "Z"]
+ - initializer: [W]
+ - output: ["Y"]
+
+ as visualized below for inference.
+
+ ```
+ X -----.
+ |
+ v
+ W --> Conv --> H --> Gemm --> Y
+ ^
+ |
+ Z
+ ```
+
+ Assume that the training algorithm contains
+
+ - inputs: ["X_1", "Z_1", "C"]
+ - initializer: [T]
+ - outputs: ["W_new"]
+
+ with a dictionary
+
+ - update_binding: {"W": "W_new", "T": "T_new"}
+
+ Inside the training algorithm graph, one can invoke the inference
+  graph by adding a GraphCall node with
+
+  - inputs: ["X_1", "W", "Z_1"]
+ - outputs: ["Y_1"]
+  - an attribute graph_name="MyInferenceGraph".
+
+  The initializers in update_binding ("W" and "T" in this case) are
+  considered globally-visible and mutable variables, which
+  can be used as inputs of operators in the training graph.
+
+ An example training algorithm graph may look like
+
+ ```
+ .-------- W (a global and mutable variable from
+ | | the inference graph)
+ | |
+ | .-----'-----------.
+ | | |
+ | | v
+ | | .-- X_1 --> GraphCall(graph_name="MyInferenceGraph")
+ | | | | |
+ | | | | |
+ | | | Z_1 -----' |
+ | | | | V
+ | | | | Y_1 ---> Loss ---> O
+ | | | | ^
+ | | | | |
+ | | `--. | C
+ | | | | |
+ | | | | .----------------'
+ | | | | |
+ | | v v v
+ | `--> Gradient(xs=["W"], zs=["X_1", "Z_1", "C"], y="O")
+ | |
+ | v
+ | dO_dW (gradient of W) 1 (a scalar one)
+ | | |
+ | V v
+ | Div <--- T ------------> Add ---> T_new
+ | | (T is the number of training iterations.
+ | | T is also globally visible and mutable.)
+ | v
+ `-----> Sub ----> W_new
+ ```
+
+  where Loss is a dummy node that computes the objective function being minimized.
+
+ The variable "W" is an optional input in the called graph.
+ If the user omits it, the input list of GraphCall becomes ["X_1", "", "Z_1"].
+  In this case, from the view of the computation graph, the Conv operator invoked by
+  GraphCall may still be connected to the global "W" variable and therefore the
+  structure of the computation graph is unchanged.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'ai.onnx.training' operator set.
+
+#### Attributes
+
+
+- graph_name : string (required)
+- The invoked graph's name. The only allowed value is the name of the inference graph, which is stored in "ModelProto.graph.name" in the ONNX model format.
+
+
+#### Inputs (1 - ∞)
+
+
+- Inputs (variadic, heterogeneous) : T
+- Inputs fed to the invoked graph. The i-th input here goes to the i-th input of the invoked graph. To omit an optional input in this field, the user can drop it or use an empty string.
+
+
+#### Outputs (1 - ∞)
+
+
+- Outputs (variadic, heterogeneous) : T
+- The outputs generated by the called graph. Its i-th value is bound to the i-th output of the called graph. Similar to the inputs, all outputs are optional.
+
+
+#### Type Constraints
+
+
+- T : tensor(uint8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(int8), tensor(int16), tensor(int32), tensor(int64), tensor(float16), tensor(float), tensor(double), tensor(string), tensor(bool), tensor(complex64), tensor(complex128)
+- Allow inputs and outputs to be any kind of tensor.
+
+
+
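+A minimal usage sketch (hypothetical, not one of the generated test cases; the
+names `X_1`, `Z_1`, `Y_1`, and `MyInferenceGraph` follow the illustration above)
+of how a GraphCall node might be constructed:
+
+```python
+import onnx.helper
+
+# Invoke the inference graph "MyInferenceGraph" from a training graph.
+# The empty string omits the optional "W" input, so the globally-visible
+# "W" initializer is used instead.
+graph_call_node = onnx.helper.make_node(
+    'GraphCall',
+    inputs=['X_1', '', 'Z_1'],
+    outputs=['Y_1'],
+    name='my_graph_call',
+    domain='ai.onnx.training',
+    graph_name='MyInferenceGraph')
+```
+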
diff --git a/docs/TestCoverage.md b/docs/TestCoverage.md
index 7f0604de90d..9f560097fd2 100644
--- a/docs/TestCoverage.md
+++ b/docs/TestCoverage.md
@@ -5,7 +5,7 @@
* [Overall Test Coverage](#overall-test-coverage)
# Node Test Coverage
## Summary
-Node tests have covered 139/154 (90.26%, 5 generators excluded) common operators.
+Node tests have covered 140/156 (89.74%, 5 generators excluded) common operators.
Node tests have covered 0/0 (N/A) experimental operators.
@@ -3763,6 +3763,108 @@ expect(node, inputs=[x], outputs=[y], name='test_globalmaxpool_precomputed')
+### Gradient
+There are 2 test cases, listed as following:
+
+gradient_scalar_add
+
+```python
+add_node = onnx.helper.make_node('Add',
+ ['a', 'b'], ['c'], name='my_add')
+gradient_node = onnx.helper.make_node(
+ 'Gradient', ['a', 'b'],
+ ['dc_da', 'dc_db'], name='my_gradient',
+ domain='ai.onnx.training',
+ xs=['a', 'b'], y='c')
+
+a = np.array(1.0).astype(np.float32)
+b = np.array(2.0).astype(np.float32)
+c = a + b
+# dc / da = d(a+b) / da = 1
+dc_da = np.array(1).astype(np.float32)
+# dc / db = d(a+b) / db = 1
+dc_db = np.array(1).astype(np.float32)
+
+graph = onnx.helper.make_graph(
+ nodes=[add_node, gradient_node],
+ name='GradientOfAdd',
+ inputs=[
+ onnx.helper.make_tensor_value_info('a', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('b', onnx.TensorProto.FLOAT,
+ [])],
+ outputs=[
+ onnx.helper.make_tensor_value_info('c', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('dc_da',
+ onnx.TensorProto.FLOAT, []),
+ onnx.helper.make_tensor_value_info('dc_db',
+ onnx.TensorProto.FLOAT, [])])
+opsets = [
+ onnx.helper.make_operatorsetid('', 12),
+ onnx.helper.make_operatorsetid('ai.onnx.training', 1)]
+model = onnx.helper.make_model(
+ graph,
+ producer_name='backend-test',
+ opset_imports=opsets)
+expect(model, inputs=[a, b], outputs=[c, dc_da, dc_db],
+ name='test_gradient_of_add')
+```
+
+
+
+gradient_scalar_add_and_mul
+
+```python
+add_node = onnx.helper.make_node('Add',
+ ['a', 'b'], ['c'], name='my_add')
+mul_node = onnx.helper.make_node('Mul',
+ ['c', 'a'], ['d'], name='my_mul')
+gradient_node = onnx.helper.make_node(
+ 'Gradient', ['a', 'b'],
+ ['dd_da', 'dd_db'], name='my_gradient',
+ domain='ai.onnx.training',
+ xs=['a', 'b'], y='d')
+
+a = np.array(1.0).astype(np.float32)
+b = np.array(2.0).astype(np.float32)
+c = a + b
+# d = a * c = a * (a + b)
+d = a * c
+# dd / da = d(a*a+a*b) / da = 2 * a + b
+dd_da = 2 * a + b
+# dd / db = d(a*a+a*b) / db = a
+dd_db = a
+
+graph = onnx.helper.make_graph(
+ nodes=[add_node, mul_node, gradient_node],
+ name='GradientOfTwoOperators',
+ inputs=[
+ onnx.helper.make_tensor_value_info('a', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('b', onnx.TensorProto.FLOAT,
+ [])],
+ outputs=[
+ onnx.helper.make_tensor_value_info('d', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('dd_da',
+ onnx.TensorProto.FLOAT, []),
+ onnx.helper.make_tensor_value_info('dd_db',
+ onnx.TensorProto.FLOAT, [])])
+
+opsets = [
+ onnx.helper.make_operatorsetid('', 12),
+ onnx.helper.make_operatorsetid('ai.onnx.training', 1)]
+model = onnx.helper.make_model(graph,
+ producer_name='backend-test',
+ opset_imports=opsets)
+expect(model, inputs=[a, b], outputs=[d, dd_da, dd_db],
+ name='test_gradient_of_add_and_mul')
+```
+
+
+
+
### Greater
There are 2 test cases, listed as following:
@@ -11290,6 +11392,9 @@ expect(node, inputs=[x, y], outputs=[z],
### GlobalLpPool (call for test cases)
+### GraphCall (call for test cases)
+
+
### If (call for test cases)
diff --git a/onnx/backend/test/case/model/gradient.py b/onnx/backend/test/case/model/gradient.py
new file mode 100644
index 00000000000..f4704f447b0
--- /dev/null
+++ b/onnx/backend/test/case/model/gradient.py
@@ -0,0 +1,103 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np # type: ignore
+
+import onnx
+from ..base import Base
+from . import expect
+
+
+class Gradient(Base):
+
+ @staticmethod
+ def export_gradient_scalar_add(): # type: () -> None
+ add_node = onnx.helper.make_node('Add',
+ ['a', 'b'], ['c'], name='my_add')
+ gradient_node = onnx.helper.make_node(
+ 'Gradient', ['a', 'b'],
+ ['dc_da', 'dc_db'], name='my_gradient',
+ domain='ai.onnx.training',
+ xs=['a', 'b'], y='c')
+
+ a = np.array(1.0).astype(np.float32)
+ b = np.array(2.0).astype(np.float32)
+ c = a + b
+ # dc / da = d(a+b) / da = 1
+ dc_da = np.array(1).astype(np.float32)
+        # dc / db = d(a+b) / db = 1
+ dc_db = np.array(1).astype(np.float32)
+
+ graph = onnx.helper.make_graph(
+ nodes=[add_node, gradient_node],
+ name='GradientOfAdd',
+ inputs=[
+ onnx.helper.make_tensor_value_info('a', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('b', onnx.TensorProto.FLOAT,
+ [])],
+ outputs=[
+ onnx.helper.make_tensor_value_info('c', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('dc_da',
+ onnx.TensorProto.FLOAT, []),
+ onnx.helper.make_tensor_value_info('dc_db',
+ onnx.TensorProto.FLOAT, [])])
+ opsets = [
+ onnx.helper.make_operatorsetid('', 12),
+ onnx.helper.make_operatorsetid('ai.onnx.training', 1)]
+ model = onnx.helper.make_model(
+ graph,
+ producer_name='backend-test',
+ opset_imports=opsets)
+ expect(model, inputs=[a, b], outputs=[c, dc_da, dc_db],
+ name='test_gradient_of_add')
+
+ @staticmethod
+ def export_gradient_scalar_add_and_mul(): # type: () -> None
+ add_node = onnx.helper.make_node('Add',
+ ['a', 'b'], ['c'], name='my_add')
+ mul_node = onnx.helper.make_node('Mul',
+ ['c', 'a'], ['d'], name='my_mul')
+ gradient_node = onnx.helper.make_node(
+ 'Gradient', ['a', 'b'],
+ ['dd_da', 'dd_db'], name='my_gradient',
+ domain='ai.onnx.training',
+ xs=['a', 'b'], y='d')
+
+ a = np.array(1.0).astype(np.float32)
+ b = np.array(2.0).astype(np.float32)
+ c = a + b
+ # d = a * c = a * (a + b)
+ d = a * c
+ # dd / da = d(a*a+a*b) / da = 2 * a + b
+ dd_da = 2 * a + b
+ # dd / db = d(a*a+a*b) / db = a
+ dd_db = a
+
+ graph = onnx.helper.make_graph(
+ nodes=[add_node, mul_node, gradient_node],
+ name='GradientOfTwoOperators',
+ inputs=[
+ onnx.helper.make_tensor_value_info('a', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('b', onnx.TensorProto.FLOAT,
+ [])],
+ outputs=[
+ onnx.helper.make_tensor_value_info('d', onnx.TensorProto.FLOAT,
+ []),
+ onnx.helper.make_tensor_value_info('dd_da',
+ onnx.TensorProto.FLOAT, []),
+ onnx.helper.make_tensor_value_info('dd_db',
+ onnx.TensorProto.FLOAT, [])])
+
+ opsets = [
+ onnx.helper.make_operatorsetid('', 12),
+ onnx.helper.make_operatorsetid('ai.onnx.training', 1)]
+ model = onnx.helper.make_model(graph,
+ producer_name='backend-test',
+ opset_imports=opsets)
+ expect(model, inputs=[a, b], outputs=[d, dd_da, dd_db],
+ name='test_gradient_of_add_and_mul')
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add/model.onnx b/onnx/backend/test/data/simple/test_gradient_of_add/model.onnx
new file mode 100644
index 00000000000..9bd1af01001
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add/model.onnx differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/input_0.pb b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..f44504237ac
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/input_0.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/input_1.pb b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/input_1.pb
new file mode 100644
index 00000000000..cbb94868f3f
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/input_1.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_0.pb b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..ceda7e2f64c
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_0.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_1.pb b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_1.pb
new file mode 100644
index 00000000000..01af23643f3
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_1.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_2.pb b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_2.pb
new file mode 100644
index 00000000000..d2945a7a05c
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add/test_data_set_0/output_2.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/model.onnx b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/model.onnx
new file mode 100644
index 00000000000..026821d73e4
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/model.onnx differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/input_0.pb b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/input_0.pb
new file mode 100644
index 00000000000..f44504237ac
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/input_0.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/input_1.pb b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/input_1.pb
new file mode 100644
index 00000000000..cbb94868f3f
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/input_1.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_0.pb b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_0.pb
new file mode 100644
index 00000000000..660fb53bb9c
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_0.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_1.pb b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_1.pb
new file mode 100644
index 00000000000..19ce55b972c
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_1.pb differ
diff --git a/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_2.pb b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_2.pb
new file mode 100644
index 00000000000..c71c2418df2
Binary files /dev/null and b/onnx/backend/test/data/simple/test_gradient_of_add_and_mul/test_data_set_0/output_2.pb differ
diff --git a/onnx/checker.cc b/onnx/checker.cc
index 0056278257a..ee830600518 100644
--- a/onnx/checker.cc
+++ b/onnx/checker.cc
@@ -534,7 +534,7 @@ void check_node(
node.op_type(), domain_version, node.domain());
if (!schema) {
if (node.domain() == ONNX_DOMAIN || node.domain() == AI_ONNX_ML_DOMAIN ||
- node.domain() == "ai.onnx") {
+ node.domain() == "ai.onnx" || node.domain() == AI_ONNX_TRAINING_DOMAIN) {
// fail the checker if op in built-in domains has no schema
fail_check(
"No Op registered for " + node.op_type() +
diff --git a/onnx/common/constants.h b/onnx/common/constants.h
index 47c7a3a5092..fc2a2212280 100644
--- a/onnx/common/constants.h
+++ b/onnx/common/constants.h
@@ -10,6 +10,7 @@ namespace ONNX_NAMESPACE {
// ONNX domains.
constexpr const char* AI_ONNX_ML_DOMAIN = "ai.onnx.ml";
+constexpr const char* AI_ONNX_TRAINING_DOMAIN = "ai.onnx.training";
constexpr const char* ONNX_DOMAIN = "";
constexpr bool OPTIONAL = false;
diff --git a/onnx/defs/controlflow/defs.cc b/onnx/defs/controlflow/defs.cc
index 93648d66389..cfaaa3b31c2 100644
--- a/onnx/defs/controlflow/defs.cc
+++ b/onnx/defs/controlflow/defs.cc
@@ -809,4 +809,5 @@ ONNX_OPERATOR_SET_SCHEMA(
.TypeConstraint("I", {"tensor(int64)"}, "Int64 tensor")
.TypeConstraint("V", OpSchema::all_tensor_types(), "All Tensor types")
.TypeAndShapeInferenceFunction(ScanInferenceFunction));
+
} // namespace ONNX_NAMESPACE
diff --git a/onnx/defs/math/defs.cc b/onnx/defs/math/defs.cc
index 17909a8bf31..2aa83237603 100644
--- a/onnx/defs/math/defs.cc
+++ b/onnx/defs/math/defs.cc
@@ -404,9 +404,13 @@ ONNX_OPERATOR_SET_SCHEMA(
.TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput));
static const char* celu_ver12_doc = R"DOC(
- Continuously Differentiable Exponential Linear Units:
- Perform the linear unit element-wise on the input tensor X
- using formula:
``` max(0,x) + min(0,alpha*(exp(x/alpha)−1)) ```
+Continuously Differentiable Exponential Linear Units:
+Perform the linear unit element-wise on the input tensor X
+using formula:
+
+```
+max(0,x) + min(0,alpha*(exp(x/alpha)-1))
+```
)DOC";
static float celu_default_alpha = 1.0;
diff --git a/onnx/defs/operator_sets-training.h b/onnx/defs/operator_sets-training.h
new file mode 100644
index 00000000000..f654e48394f
--- /dev/null
+++ b/onnx/defs/operator_sets-training.h
@@ -0,0 +1,28 @@
+// Copyright (c) ONNX Project Contributors.
+// Licensed under the MIT license.
+
+#pragma once
+
+#include "onnx/defs/schema.h"
+
+namespace ONNX_NAMESPACE {
+
+// Declare training operators.
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(OnnxTraining, 1, Gradient);
+class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(OnnxTraining, 1, GraphCall);
+
+// Iterate over schema from ai.onnx.training version 1
+class OpSet_OnnxTraining_ver1 {
+ public:
+  static void ForEachSchema(std::function<void(OpSchema&&)> fn) {
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(
+        OnnxTraining, 1, Gradient)>());
+    fn(GetOpSchema<ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(
+        OnnxTraining, 1, GraphCall)>());
+ }
+};
+
+// Register training operators.
+inline void RegisterOnnxTrainingOperatorSetSchema() {
+  RegisterOpSetSchema<OpSet_OnnxTraining_ver1>();
+}
+
+} // namespace ONNX_NAMESPACE
\ No newline at end of file
diff --git a/onnx/defs/operator_sets.h b/onnx/defs/operator_sets.h
index f95ccf9776f..10e5dfd32d4 100644
--- a/onnx/defs/operator_sets.h
+++ b/onnx/defs/operator_sets.h
@@ -568,7 +568,8 @@ class OpSet_Onnx_ver10 {
Onnx, 10, NonMaxSuppression)>());
fn(GetOpSchema());
- fn(GetOpSchema());
+ fn(GetOpSchema());
}
};
@@ -740,7 +741,7 @@ class OpSet_Onnx_ver12 {
}
};
- inline void RegisterOnnxOperatorSetSchema() {
+inline void RegisterOnnxOperatorSetSchema() {
RegisterOpSetSchema();
RegisterOpSetSchema();
RegisterOpSetSchema();
diff --git a/onnx/defs/schema.cc b/onnx/defs/schema.cc
index 96101fccbfd..01d0829b23e 100644
--- a/onnx/defs/schema.cc
+++ b/onnx/defs/schema.cc
@@ -6,6 +6,7 @@
#include
#include "onnx/checker.h"
#include "onnx/defs/operator_sets.h"
+#include "onnx/defs/operator_sets-training.h"
#ifdef ONNX_ML
#include "onnx/defs/operator_sets-ml.h"
@@ -950,6 +951,9 @@ OpName_Domain_Version_Schema_Map& OpSchemaRegistry::map() {
RegisterOnnxMLOperatorSetSchema();
#endif
+ // Invoke register of training operators.
+ RegisterOnnxTrainingOperatorSetSchema();
+
#ifndef NDEBUG
size_t dbg_registered_schema_count =
GetRegisteredSchemaCount() - dbg_initial_schema_count;
diff --git a/onnx/defs/schema.h b/onnx/defs/schema.h
index afe0349721f..3a2dec8c20e 100644
--- a/onnx/defs/schema.h
+++ b/onnx/defs/schema.h
@@ -745,6 +745,7 @@ class OpSchemaRegistry final : public ISchemaRegistry {
// determined to remove too old version history.
map_[ONNX_DOMAIN] = std::make_pair(1, 12);
map_[AI_ONNX_ML_DOMAIN] = std::make_pair(1, 2);
+ map_[AI_ONNX_TRAINING_DOMAIN] = std::make_pair(1, 1);
}
   const std::unordered_map<std::string, std::pair<int, int>>& Map() const {
@@ -944,6 +945,9 @@ OpSchema GetOpSchema();
#define ONNX_ML_OPERATOR_SET_SCHEMA(name, ver, impl) \
ONNX_OPERATOR_SET_SCHEMA_EX(name, OnnxML, AI_ONNX_ML_DOMAIN, ver, true, impl)
+#define ONNX_TRAINING_OPERATOR_SET_SCHEMA(name, ver, impl) \
+ ONNX_OPERATOR_SET_SCHEMA_EX(name, OnnxTraining, AI_ONNX_TRAINING_DOMAIN, ver, true, impl)
+
// Defines specialization of GetOpSchema for a class whose name is determined
// based on a convention using name, domain, and version. Operator schema are
// normally included in operator sets and registered in OpSchemaRegistry::map().
diff --git a/onnx/defs/training/defs.cc b/onnx/defs/training/defs.cc
new file mode 100644
index 00000000000..19febf7eaec
--- /dev/null
+++ b/onnx/defs/training/defs.cc
@@ -0,0 +1,327 @@
+// Copyright (c) ONNX Project Contributors.
+// Licensed under the MIT license.
+
+#include
+#include
+#include "onnx/defs/function.h"
+#include "onnx/defs/schema.h"
+
+namespace ONNX_NAMESPACE {
+
+static const char* Gradient_ver1_doc = R"DOC(
+Gradient operator computes the partial derivatives of a specific tensor w.r.t.
+some other tensors. This operator is widely used in gradient-based training
+algorithms. To illustrate its use, let's consider a computation graph,
+
+```
+X -----.
+ |
+ v
+W --> Conv --> H --> Gemm --> Y
+ ^
+ |
+ Z
+```
+
+where W and Z are trainable tensors. Note that operators' attributes are
+omitted for the sake of simplicity. Let dY/dW (dY/dZ) be the gradient of
+Y with respect to W (Z). The user can compute gradients by inserting a Gradient
+operator to form the graph shown below.
+
+```
+W --> Conv --> H --> Gemm --> Y
+| ^ ^
+| | |
+| X Z
+| | |
+| | .----------'
+| | | (W/Z/X is the 1st/2nd/3rd input of Gradient as shown in
+| | | "xs" followed by "zs")
+| v v
+'---> Gradient(xs=["W", "Z"], zs=["X"], y="Y")
+ | |
+ | '-----------------------------------> dY/dW (1st output of Gradient)
+ |
+ '---------------------------------------> dY/dZ (2nd output of Gradient)
+```
+
+By definition, the tensor "y" is a function of independent variables in "xs"
+and "zs". Since we only compute the gradient of "y" w.r.t. the differentiable
+variables in "xs", this Gradient only outputs dY/dW and dY/dZ. Note that "H"
+cannot appear in "xs" and "zs". The reason is that "H" can be determined by
+tensors "W" and "X" and therefore "H" is not an independent variable.
+
+All outputs are optional. For example, the user can assign an empty string to
+the 1st output name of that Gradient node to skip the generation of dY/dW.
+Note that the concept of optional outputs can also be found in ONNX's RNN, GRU,
+and LSTM.
+
+The Gradient operator can also compute derivatives with respect to intermediate
+tensors. For example, the gradient of Y with respect to H can be computed via
+
+```
+W --> Conv --> H --> Gemm --> Y
+ ^ | ^
+ | | |
+ X | Z
+ .-------' |
+ | .----------'
+ | | (H/Z is the 1st/2nd input of Gradient as shown in "xs")
+ v v
+ Gradient(xs=["H", "Z"], y="Y")
+ | |
+ | '-----------------------------------> dY/dH (1st output of Gradient)
+ |
+ '---------------------------------------> dY/dZ (2nd output of Gradient)
+```
+
+It is possible to represent high-order differentiation using Gradient operators.
+For example, given the following linear model:
+
+```
+W --> Gemm --> Y --> Loss --> O
+ ^ ^
+ | |
+ X L
+```
+
+To compute the 2nd order derivative of O with respect to W (denoted by
+d^2O/dW^2), one can do
+
+```
+W --> Gemm --> Y --> Loss --> O
+| ^ ^
+| | |
+| X .------------L
+| | | |
+| | | v
++------+-+> Gradient(xs=["X", "W"], zs=["L"], y="O") ---> dO/dX (1st output of Gradient)
+| | | |
+| | | '---> dO/dW (2nd output of Gradient)
+| v v
+'---> Gradient(xs=["X", "W"], zs=["L"], y="dO/dW") ---> d(dO/dW)/dX (1st output of
+ | Gradient)
+ |
+ |
+ '---> d^2O/dW^2 (2nd output of Gradient)
+```
+
+The tensors named in attributes "xs", "zs", and "y" define the differentiated
+computation graph, and the inputs to the Gradient node define the values at
+which the gradient is computed. We can feed different tensors to the identified
+graph. For example, one can compute the gradient of Y with respect to H at
+a specific value of H, H_1, by providing that value as an input to the Gradient
+node.
+
+```
+W --> Conv --> H --> Gemm --> Y
+ ^ ^
+ | |
+ X Z
+
+ Z_1 (2nd input of Gradient)
+ |
+ v
+H_1 --> Gradient(xs=["H", "Z"], y="Y") ---> dY/dH when H = H_1 and Y = Y_1.
+ |
+ '------------------------------> dY/dZ (2nd output of Gradient)
+```
+
+When the inputs of Gradient are the tensors named in "xs" and "zs", the
+computation can be optimized. More specifically, intermediate variables in the
+forward pass can be reused if the gradient is computed via reverse-mode
+auto-differentiation.
+
+)DOC";
+
+ONNX_TRAINING_OPERATOR_SET_SCHEMA(
+ Gradient,
+ 1,
+ OpSchema()
+ .SetDoc(Gradient_ver1_doc)
+ .Input(
+ 0,
+ "Inputs",
+      "The values fed into the graph identified by the attributes. "
+ "The i-th input is the value of the i-th tensor specified in the "
+ "concatenated list of the attribute \"xs\" and the attribute "
+ " \"zs\". For example, if xs=[\"A\", \"B\"] and zs=[\"C\"], the "
+ "first input is used as the value of symbol \"A\" and the 3rd "
+ "input is substituted for all the occurrences of \"C\".",
+ "T1",
+ OpSchema::Variadic,
+ false)
+ .Output(
+ 0,
+ "Outputs",
+ "The gradient of the tensor specified by the attribute \"y\" "
+      "with respect to each of the tensors specified in the "
+ "attribute \"xs\". The i-th output is the gradient of \"y\" with "
+ "respect to the i-th tensor specified in the attribute \"xs\".",
+ "T2",
+ OpSchema::Variadic,
+ false)
+ .Attr(
+ "xs",
+ "Input tensor names of the differentiated sub-graph. It "
+ "contains only the necessary differentiated "
+ "inputs of a (sub-)graph. Variables (usually called "
+ "intermediate variables) that can be generated from inputs "
+ "cannot be included in this attribute.",
+ AttributeProto::STRINGS)
+ .Attr(
+ "zs",
+ "Input tensor names of the differentiated sub-graph. It "
+ "contains only the necessary non-differentiated "
+ "inputs of a (sub-)graph. Variables (usually called "
+ "intermediate variables) that can be generated from inputs "
+ "cannot be included in this attribute.",
+ AttributeProto::STRINGS,
+ OPTIONAL)
+ .Attr(
+ "y",
+ "The targeted tensor. It can be viewed as the output of the "
+ "differentiated function. The attribute \"xs\" and attribute "
+ "\"zs\" are the minimal independent variable set that determines "
+ "the value of \"y\".",
+ AttributeProto::STRING)
+ .TypeConstraint(
+ "T1",
+ OpSchema::all_tensor_types(),
+          "Allow inputs to be any kind of tensor.")
+ .TypeConstraint(
+ "T2",
+ {"tensor(float16)",
+ "tensor(float)",
+ "tensor(double)"},
+          "Allow outputs to be any kind of floating-point tensor."));
+
+
+static const char* GraphCall_ver1_doc = R"DOC(
+The GraphCall operator invokes a graph inside TrainingInfoProto's
+algorithm field. The GraphCall inputs and outputs are bound to those of the
+invoked graph by position. If a graph input has an initializer, that input
+is considered optional. All graph outputs are optional.
+
+Python syntax is used below to describe dictionaries and lists.
+
+Assume that ModelProto's graph field has
+- name: "MyInferenceGraph"
+- input: ["X", "W", "Z"]
+- initializer: [W]
+- output: ["Y"]
+
+as visualized below for inference.
+
+```
+X -----.
+ |
+ v
+W --> Conv --> H --> Gemm --> Y
+ ^
+ |
+ Z
+```
+
+Assume that the training algorithm contains
+
+- inputs: ["X_1", "Z_1", "C"]
+- initializer: [T]
+- outputs: ["W_new"]
+
+with a dictionary
+
+- update_binding: {"W": "W_new", "T": "T_new"}
+
+Inside the training algorithm graph, one can invoke the inference
+graph by adding a GraphCall node with
+
+- inputs: ["X_1", "W", "Z_1"]
+- outputs: ["Y_1"]
+- an attribute graph_name="MyInferenceGraph".
+
+The initializers in update_binding ("W" and "T" in this case) are
+considered globally-visible and mutable variables, which
+can be used as inputs of operators in the training graph.
+
+An example training algorithm graph may look like
+
+```
+.-------- W (a global and mutable variable from
+| | the inference graph)
+| |
+| .-----'-----------.
+| | |
+| | v
+| | .-- X_1 --> GraphCall(graph_name="MyInferenceGraph")
+| | | | |
+| | | | |
+| | | Z_1 -----' |
+| | | | V
+| | | | Y_1 ---> Loss ---> O
+| | | | ^
+| | | | |
+| | `--. | C
+| | | | |
+| | | | .----------------'
+| | | | |
+| | v v v
+| `--> Gradient(xs=["W"], zs=["X_1", "Z_1", "C"], y="O")
+| |
+| v
+| dO_dW (gradient of W) 1 (a scalar one)
+| | |
+| V v
+| Div <--- T ------------> Add ---> T_new
+| | (T is the number of training iterations.
+| | T is also globally visible and mutable.)
+| v
+`-----> Sub ----> W_new
+```
+
+where Loss is a dummy node that computes the objective function being minimized.
+
+The variable "W" is an optional input in the called graph.
+If the user omits it, the input list of GraphCall becomes ["X_1", "", "Z_1"].
+In this case, from the view of the computation graph, the Conv operator invoked by
+GraphCall may still be connected to the global "W" variable and therefore the
+structure of the computation graph is unchanged.
+)DOC";
+
+ONNX_TRAINING_OPERATOR_SET_SCHEMA(
+ GraphCall,
+ 1,
+ OpSchema()
+ .SetDoc(GraphCall_ver1_doc)
+ .Input(
+ 0,
+ "Inputs",
+ "Inputs fed to the invoked graph. "
+ "The i-th input here goes to the i-th input of the invoked graph. "
+ "To omit an optional input in this field, "
+ "the user can drop it or use an empty string.",
+ "T",
+ OpSchema::Variadic,
+ false)
+ .Output(
+ 0,
+ "Outputs",
+ "The outputs generated by the called graph. "
+ "Its i-th value is bound to the i-th output of the called graph. "
+ "Similar to the inputs, all outputs are optional.",
+ "T",
+ OpSchema::Variadic,
+ false)
+ .Attr(
+ "graph_name",
+ "The invoked graph's name. "
+ "The only allowed value is the name of the inference graph, "
+ "which is stored in \"ModelProto.graph.name\" "
+ "in the ONNX model format.",
+ AttributeProto::STRING)
+ .TypeConstraint(
+ "T",
+ OpSchema::all_tensor_types(),
+ "Allow inputs and outputs to be any kind of tensor."));
+
+} // namespace ONNX_NAMESPACE
diff --git a/onnx/examples/make_model.ipynb b/onnx/examples/make_model.ipynb
index 5517aaec2d4..dc64db4375d 100644
--- a/onnx/examples/make_model.ipynb
+++ b/onnx/examples/make_model.ipynb
@@ -9,7 +9,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "The ir_version in model: 6\n",
+ "The ir_version in model: 7\n",
"\n",
"The producer_name in model: onnx-example\n",
"\n",
diff --git a/onnx/onnx-ml.proto b/onnx/onnx-ml.proto
index dc2adee4950..9a2c6c1cf7e 100644
--- a/onnx/onnx-ml.proto
+++ b/onnx/onnx-ml.proto
@@ -92,7 +92,17 @@ enum Version {
// - Add support for sparse tensor constants stored in model.
// - Add message SparseTensorProto
// - Add sparse initializers
- IR_VERSION = 0x0000000000000006;
+ IR_VERSION_2019_9_19 = 0x0000000000000006;
+
+ // IR VERSION 7 published on
+ // - Add a list to promote inference graph's initializers to global and
+ // mutable variables. Global variables are visible in all graphs of the
+ // stored models.
+ // - Add message TrainingInfoProto to store initialization
+ // method and training algorithm. The execution of TrainingInfoProto
+ // can modify the values of mutable variables.
+ // - Make inference graph callable from TrainingInfoProto via GraphCall operator.
+ IR_VERSION = 0x0000000000000007;
}
// Attributes
@@ -199,12 +209,119 @@ message NodeProto {
optional string doc_string = 6;
}
+// Training information
+// TrainingInfoProto stores information for training a model.
+// In particular, this defines two functionalities: an initialization-step
+// and a training-algorithm-step. Initialization resets the model
+// back to its original state as if no training has been performed.
+// Training algorithm improves the model based on input data.
+//
+// The semantics of the initialization-step is that the initializers
+// in ModelProto.graph and in TrainingInfoProto.algorithm are first
+// initialized as specified by the initializers in the graph, and then
+// updated by the "initialization_binding" in every instance in
+// ModelProto.training_info.
+//
+// The field "algorithm" defines a computation graph which represents a
+// training algorithm's step. After the execution of a
+// TrainingInfoProto.algorithm, the initializers specified by "update_binding"
+// may be immediately updated. If the targeted training algorithm contains
+// consecutive update stages (such as block coordinate descent methods),
+// the user needs to create a TrainingInfoProto for each stage.
+message TrainingInfoProto {
+ // This field describes a graph to compute the initial tensors
+ // upon starting the training process. Initialization graph has no input
+ // and can have multiple outputs. Usually, trainable tensors in neural
+ // networks are randomly initialized. To achieve that, for each tensor,
+ // the user can put a random number operator such as RandomNormal or
+ // RandomUniform in TrainingInfoProto.initialization.node and assign its
+ // random output to the specific tensor using "initialization_binding".
+ // This graph can also set the initializers in "algorithm" in the same
+ // TrainingInfoProto; a use case is resetting the number of training
+  // iterations to zero.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ optional GraphProto initialization = 1;
+
+ // This field represents a training algorithm step. Given required inputs,
+ // it computes outputs to update initializers in its own or inference graph's
+  // initializer lists. In general, this graph contains a loss node, a gradient
+  // node, an optimizer node, an increment of the iteration count, and some calls
+  // to the inference graph.
+ //
+ // The field algorithm.node is the only place the user can use GraphCall
+ // operator. The only callable graph is the one stored in ModelProto.graph.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ optional GraphProto algorithm = 2;
+
+ // This field specifies the bindings from the outputs of "initialization" to
+ // some initializers in "ModelProto.graph.initializer" and
+ // the "algorithm.initializer" in the same TrainingInfoProto.
+ // See "update_binding" below for details.
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "initialization".
+ repeated StringStringEntryProto initialization_binding = 3;
+
+ // Gradient-based training is usually an iterative procedure. In one gradient
+ // descent iteration, we apply
+ //
+ // x = x - r * g
+ //
+ // where "x" is the optimized tensor, "r" stands for learning rate, and "g" is
+ // gradient of "x" with respect to a chosen loss. To avoid adding assignments
+ // into the training graph, we split the update equation into
+ //
+ // y = x - r * g
+ // x = y
+ //
+ // The user needs to save "y = x - r * g" into TrainingInfoProto.algorithm. To
+ // tell that "y" should be assigned to "x", the field "update_binding" may
+ // contain a key-value pair of strings, "x" (key of StringStringEntryProto)
+ // and "y" (value of StringStringEntryProto).
+ // For a neural network with multiple trainable (mutable) tensors, there can
+ // be multiple key-value pairs in "update_binding".
+ //
+  // The initializers appearing as keys in "update_binding" are considered
+ // mutable and globally-visible variables. This implies some behaviors
+ // as described below.
+ //
+  // 1. Keys are unique across all "update_binding"s, so no two global
+  //    variables may have the same name. This ensures that each
+  //    global variable is assigned at most once.
+ // 2. The keys must appear in names of "ModelProto.graph.initializer" or
+ // "TrainingInfoProto.algorithm.initializer".
+ // 3. The values must be output names of "algorithm".
+ // 4. If an optional input of a graph is omitted when using GraphCall, the
+ // global variable with the same name may be used.
+  // 5. When using GraphCall, the user can always pass values to optional
+  //    inputs of the called graph even if the associated initializers appear
+ // as keys in "update_binding"s.
+ // 6. The graphs in TrainingInfoProto's can use global variables as
+ // their operator inputs.
+ // 7. Mutable variables are initialized to the value specified by the
+ // corresponding initializer, and then potentially updated by
+  //    "initialization_binding"s and "update_binding"s in "TrainingInfoProto"s.
+ //
+ // This field usually contains names of trainable tensors
+ // (in ModelProto.graph), optimizer states such as momentums in advanced
+  // stochastic gradient methods (in TrainingInfoProto.algorithm),
+  // and the number of training iterations (in TrainingInfoProto.algorithm).
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "algorithm".
+ repeated StringStringEntryProto update_binding = 4;
+}
+
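+// As an illustrative sketch (hypothetical names, using the Python protobuf API
+// once this message is part of the onnx package), a TrainingInfoProto whose
+// "algorithm" graph produces "W_new" to update the inference initializer "W"
+// could be assembled as:
+//
+//   ti = onnx.TrainingInfoProto()
+//   ti.algorithm.CopyFrom(algorithm_graph)   # a GraphProto with output "W_new"
+//   kv = ti.update_binding.add()             # a StringStringEntryProto
+//   kv.key, kv.value = "W", "W_new"
+//   model.training_info.extend([ti])
+//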
// Models
//
// ModelProto is a top-level file/container format for bundling a ML model and
// associating its computation graph with metadata.
//
-// The semantics of the model are described by the associated GraphProto.
+// The semantics of the model are described by the associated GraphProtos.
message ModelProto {
// The version of the IR this model targets. See Version enum above.
// This field MUST be present.
@@ -249,6 +366,17 @@ message ModelProto {
// Named metadata values; keys should be distinct.
repeated StringStringEntryProto metadata_props = 14;
+
+ // Training-specific information. Sequentially executing all stored
+ // `TrainingInfoProto.algorithm`s and assigning their outputs following
+ // the corresponding `TrainingInfoProto.update_binding`s is one training
+ // iteration. Similarly, to initialize the model
+ // (as if training hasn't happened), the user should sequentially execute
+ // all stored `TrainingInfoProto.initialization`s and assign their outputs
+ // using `TrainingInfoProto.initialization_binding`s.
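+ //
+ // As an illustrative sketch only (not part of the format), a runtime could
+ // implement one training iteration roughly as:
+ //
+ //   for t in model.training_info:
+ //     outputs = run(t.algorithm, user_inputs)
+ //     for binding in t.update_binding:
+ //       global_state[binding.key] = outputs[binding.value]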
+ //
+ // If this field is empty, the training behavior of the model is undefined.
+ repeated TrainingInfoProto training_info = 20;
};
// StringStringEntryProto follows the pattern for cross-proto-version maps.
@@ -594,6 +722,7 @@ message OperatorSetIdProto {
optional int64 version = 2;
}
+
// For using protobuf-lite
option optimize_for = LITE_RUNTIME;
diff --git a/onnx/onnx-ml.proto3 b/onnx/onnx-ml.proto3
index ba2670ea9fa..2a24c4a6ac3 100644
--- a/onnx/onnx-ml.proto3
+++ b/onnx/onnx-ml.proto3
@@ -92,7 +92,17 @@ enum Version {
// - Add support for sparse tensor constants stored in model.
// - Add message SparseTensorProto
// - Add sparse initializers
- IR_VERSION = 0x0000000000000006;
+ IR_VERSION_2019_9_19 = 0x0000000000000006;
+
+ // IR VERSION 7 published on
+ // - Add a list to promote inference graph's initializers to global and
+ // mutable variables. Global variables are visible in all graphs of the
+ // stored models.
+ // - Add message TrainingInfoProto to store initialization
+ // method and training algorithm. The execution of TrainingInfoProto
+ // can modify the values of mutable variables.
+ // - Make inference graph callable from TrainingInfoProto via GraphCall operator.
+ IR_VERSION = 0x0000000000000007;
}
// Attributes
@@ -199,12 +209,119 @@ message NodeProto {
string doc_string = 6;
}
+// Training information
+// TrainingInfoProto stores information for training a model.
+// In particular, this defines two functionalities: an initialization-step
+// and a training-algorithm-step. Initialization resets the model
+// back to its original state as if no training has been performed.
+// The training algorithm improves the model based on input data.
+//
+// The semantics of the initialization-step is that the initializers
+// in ModelProto.graph and in TrainingInfoProto.algorithm are first
+// initialized as specified by the initializers in the graph, and then
+// updated by the "initialization_binding" in every instance in
+// ModelProto.training_info.
+//
+// The field "algorithm" defines a computation graph which represents a
+// training algorithm's step. After the execution of a
+// TrainingInfoProto.algorithm, the initializers specified by "update_binding"
+// may be immediately updated. If the targeted training algorithm contains
+// consecutive update stages (such as block coordinate descent methods),
+// the user needs to create a TrainingInfoProto for each stage.
+message TrainingInfoProto {
+ // This field describes a graph to compute the initial tensors
+ // upon starting the training process. The initialization graph has no input
+ // and can have multiple outputs. Usually, trainable tensors in neural
+ // networks are randomly initialized. To achieve that, for each tensor,
+ // the user can put a random number operator such as RandomNormal or
+ // RandomUniform in TrainingInfoProto.initialization.node and assign its
+ // random output to the specific tensor using "initialization_binding".
+ // This graph can also set the initializers in "algorithm" in the same
+ // TrainingInfoProto; a use case is resetting the number of training
+ // iterations to zero.
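+ //
+ // For example (with hypothetical names), "initialization" could contain a
+ // RandomNormal node whose output is named "W_init", and the binding
+ //
+ //   initialization_binding { key: "W" value: "W_init" }
+ //
+ // would then copy that random value into the initializer "W" of
+ // ModelProto.graph.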
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ GraphProto initialization = 1;
+
+ // This field represents a training algorithm step. Given required inputs,
+ // it computes outputs to update initializers in its own or the inference
+ // graph's initializer lists. In general, this graph contains a loss node,
+ // gradient nodes, optimizer nodes, an increment of the iteration count, and
+ // some calls to the inference graph.
+ //
+ // The field algorithm.node is the only place where the user can use the
+ // GraphCall operator. The only callable graph is the one stored in
+ // ModelProto.graph.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ GraphProto algorithm = 2;
+
+ // This field specifies the bindings from the outputs of "initialization" to
+ // some initializers in "ModelProto.graph.initializer" and
+ // the "algorithm.initializer" in the same TrainingInfoProto.
+ // See "update_binding" below for details.
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "initialization".
+ repeated StringStringEntryProto initialization_binding = 3;
+
+ // Gradient-based training is usually an iterative procedure. In one gradient
+ // descent iteration, we apply
+ //
+ // x = x - r * g
+ //
+ // where "x" is the optimized tensor, "r" stands for learning rate, and "g" is
+ // gradient of "x" with respect to a chosen loss. To avoid adding assignments
+ // into the training graph, we split the update equation into
+ //
+ // y = x - r * g
+ // x = y
+ //
+ // The user needs to save "y = x - r * g" into TrainingInfoProto.algorithm.
+ // To indicate that "y" should be assigned to "x", the field "update_binding" may
+ // contain a key-value pair of strings, "x" (key of StringStringEntryProto)
+ // and "y" (value of StringStringEntryProto).
+ // For a neural network with multiple trainable (mutable) tensors, there can
+ // be multiple key-value pairs in "update_binding".
+ //
+ // The initializers that appear as keys in "update_binding" are considered
+ // mutable and globally-visible variables. This implies the behaviors
+ // described below.
+ //
+ // 1. Keys are unique across all "update_binding"s, so no two global
+ // variables may have the same name. This ensures that each
+ // global variable is assigned at most once.
+ // 2. The keys must appear in names of "ModelProto.graph.initializer" or
+ // "TrainingInfoProto.algorithm.initializer".
+ // 3. The values must be output names of "algorithm".
+ // 4. If an optional input of a graph is omitted when using GraphCall, the
+ // global variable with the same name may be used.
+ // 5. When using GraphCall, users can always pass values to optional
+ // inputs of the called graph even if the associated initializers appear
+ // as keys in "update_binding"s.
+ // 6. The graphs in "TrainingInfoProto"s can use global variables as
+ // their operator inputs.
+ // 7. Mutable variables are initialized to the value specified by the
+ // corresponding initializer, and then potentially updated by
+ // "initialization_binding"s and "update_binding"s in "TrainingInfoProto"s.
+ //
+ // This field usually contains the names of trainable tensors
+ // (in ModelProto.graph), optimizer states such as momentums in advanced
+ // stochastic gradient methods (in TrainingInfoProto.algorithm),
+ // and the number of training iterations (in TrainingInfoProto.algorithm).
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "algorithm".
+ repeated StringStringEntryProto update_binding = 4;
+}
+
// Models
//
// ModelProto is a top-level file/container format for bundling a ML model and
// associating its computation graph with metadata.
//
-// The semantics of the model are described by the associated GraphProto.
+// The semantics of the model are described by the associated GraphProtos.
message ModelProto {
// The version of the IR this model targets. See Version enum above.
// This field MUST be present.
@@ -249,6 +366,17 @@ message ModelProto {
// Named metadata values; keys should be distinct.
repeated StringStringEntryProto metadata_props = 14;
+
+ // Training-specific information. Sequentially executing all stored
+ // `TrainingInfoProto.algorithm`s and assigning their outputs following
+ // the corresponding `TrainingInfoProto.update_binding`s is one training
+ // iteration. Similarly, to initialize the model
+ // (as if training hasn't happened), the user should sequentially execute
+ // all stored `TrainingInfoProto.initialization`s and assign their outputs
+ // using `TrainingInfoProto.initialization_binding`s.
+ //
+ // If this field is empty, the training behavior of the model is undefined.
+ repeated TrainingInfoProto training_info = 20;
};
// StringStringEntryProto follows the pattern for cross-proto-version maps.
@@ -594,6 +722,7 @@ message OperatorSetIdProto {
int64 version = 2;
}
+
// For using protobuf-lite
option optimize_for = LITE_RUNTIME;
diff --git a/onnx/onnx-operators-ml.proto b/onnx/onnx-operators-ml.proto
index 67d229ea6c0..6ad6a254538 100644
--- a/onnx/onnx-operators-ml.proto
+++ b/onnx/onnx-operators-ml.proto
@@ -96,7 +96,6 @@ message FunctionProto {
// this operator was initially declared in.
//
message OperatorProto {
-
// The name of the operator within a domain.
// This field MUST be present in this version of the IR.
optional string op_type = 1;
@@ -181,6 +180,7 @@ message OperatorSetProto {
repeated FunctionProto functions = 9;
}
+
// For using protobuf-lite
option optimize_for = LITE_RUNTIME;
diff --git a/onnx/onnx-operators-ml.proto3 b/onnx/onnx-operators-ml.proto3
index 892a6e4ef11..f9a1e950e97 100644
--- a/onnx/onnx-operators-ml.proto3
+++ b/onnx/onnx-operators-ml.proto3
@@ -96,7 +96,6 @@ message FunctionProto {
// this operator was initially declared in.
//
message OperatorProto {
-
// The name of the operator within a domain.
// This field MUST be present in this version of the IR.
string op_type = 1;
@@ -181,6 +180,7 @@ message OperatorSetProto {
repeated FunctionProto functions = 9;
}
+
// For using protobuf-lite
option optimize_for = LITE_RUNTIME;
diff --git a/onnx/onnx-operators.in.proto b/onnx/onnx-operators.in.proto
index 90effddf09c..923fac9205e 100644
--- a/onnx/onnx-operators.in.proto
+++ b/onnx/onnx-operators.in.proto
@@ -95,7 +95,6 @@ message FunctionProto {
// this operator was initially declared in.
//
message OperatorProto {
-
// The name of the operator within a domain.
// This field MUST be present in this version of the IR.
optional string op_type = 1;
@@ -179,3 +178,4 @@ message OperatorSetProto {
// The (name, version) MUST be unique across all OperatorProtos/FunctionProtos in operator/functions
repeated FunctionProto functions = 9;
}
+
diff --git a/onnx/onnx-operators.proto b/onnx/onnx-operators.proto
index a7a566cf998..ac33b92531d 100644
--- a/onnx/onnx-operators.proto
+++ b/onnx/onnx-operators.proto
@@ -96,7 +96,6 @@ message FunctionProto {
// this operator was initially declared in.
//
message OperatorProto {
-
// The name of the operator within a domain.
// This field MUST be present in this version of the IR.
optional string op_type = 1;
@@ -179,4 +178,9 @@ message OperatorSetProto {
// The functions specified by this operator set.
// The (name, version) MUST be unique across all OperatorProtos/FunctionProtos in operator/functions
repeated FunctionProto functions = 9;
-}
\ No newline at end of file
+}
+
+
+// For using protobuf-lite
+option optimize_for = LITE_RUNTIME;
+
diff --git a/onnx/onnx-operators.proto3 b/onnx/onnx-operators.proto3
index 859f9de0013..42d9e0523ee 100644
--- a/onnx/onnx-operators.proto3
+++ b/onnx/onnx-operators.proto3
@@ -96,7 +96,6 @@ message FunctionProto {
// this operator was initially declared in.
//
message OperatorProto {
-
// The name of the operator within a domain.
// This field MUST be present in this version of the IR.
string op_type = 1;
@@ -179,4 +178,9 @@ message OperatorSetProto {
// The functions specified by this operator set.
// The (name, version) MUST be unique across all OperatorProtos/FunctionProtos in operator/functions
repeated FunctionProto functions = 9;
-}
\ No newline at end of file
+}
+
+
+// For using protobuf-lite
+option optimize_for = LITE_RUNTIME;
+
diff --git a/onnx/onnx.in.proto b/onnx/onnx.in.proto
index 2cc16494c30..3be3886c550 100644
--- a/onnx/onnx.in.proto
+++ b/onnx/onnx.in.proto
@@ -89,7 +89,17 @@ enum Version {
// - Add support for sparse tensor constants stored in model.
// - Add message SparseTensorProto
// - Add sparse initializers
- IR_VERSION = 0x0000000000000006;
+ IR_VERSION_2019_9_19 = 0x0000000000000006;
+
+ // IR VERSION 7 published on
+ // - Add a list to promote inference graph's initializers to global and
+ // mutable variables. Global variables are visible in all graphs of the
+ // stored models.
+ // - Add message TrainingInfoProto to store initialization
+ // method and training algorithm. The execution of TrainingInfoProto
+ // can modify the values of mutable variables.
+ // - Make inference graph callable from TrainingInfoProto via GraphCall operator.
+ IR_VERSION = 0x0000000000000007;
}
// Attributes
@@ -196,12 +206,119 @@ message NodeProto {
optional string doc_string = 6;
}
+// Training information
+// TrainingInfoProto stores information for training a model.
+// In particular, this defines two functionalities: an initialization-step
+// and a training-algorithm-step. Initialization resets the model
+// back to its original state as if no training has been performed.
+// The training algorithm improves the model based on input data.
+//
+// The semantics of the initialization-step is that the initializers
+// in ModelProto.graph and in TrainingInfoProto.algorithm are first
+// initialized as specified by the initializers in the graph, and then
+// updated by the "initialization_binding" in every instance in
+// ModelProto.training_info.
+//
+// The field "algorithm" defines a computation graph which represents a
+// training algorithm's step. After the execution of a
+// TrainingInfoProto.algorithm, the initializers specified by "update_binding"
+// may be immediately updated. If the targeted training algorithm contains
+// consecutive update stages (such as block coordinate descent methods),
+// the user needs to create a TrainingInfoProto for each stage.
+message TrainingInfoProto {
+ // This field describes a graph to compute the initial tensors
+ // upon starting the training process. The initialization graph has no input
+ // and can have multiple outputs. Usually, trainable tensors in neural
+ // networks are randomly initialized. To achieve that, for each tensor,
+ // the user can put a random number operator such as RandomNormal or
+ // RandomUniform in TrainingInfoProto.initialization.node and assign its
+ // random output to the specific tensor using "initialization_binding".
+ // This graph can also set the initializers in "algorithm" in the same
+ // TrainingInfoProto; a use case is resetting the number of training
+ // iterations to zero.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ optional GraphProto initialization = 1;
+
+ // This field represents a training algorithm step. Given required inputs,
+ // it computes outputs to update initializers in its own or the inference
+ // graph's initializer lists. In general, this graph contains a loss node,
+ // gradient nodes, optimizer nodes, an increment of the iteration count, and
+ // some calls to the inference graph.
+ //
+ // The field algorithm.node is the only place where the user can use the
+ // GraphCall operator. The only callable graph is the one stored in
+ // ModelProto.graph.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ optional GraphProto algorithm = 2;
+
+ // This field specifies the bindings from the outputs of "initialization" to
+ // some initializers in "ModelProto.graph.initializer" and
+ // the "algorithm.initializer" in the same TrainingInfoProto.
+ // See "update_binding" below for details.
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "initialization".
+ repeated StringStringEntryProto initialization_binding = 3;
+
+ // Gradient-based training is usually an iterative procedure. In one gradient
+ // descent iteration, we apply
+ //
+ // x = x - r * g
+ //
+ // where "x" is the optimized tensor, "r" stands for learning rate, and "g" is
+ // gradient of "x" with respect to a chosen loss. To avoid adding assignments
+ // into the training graph, we split the update equation into
+ //
+ // y = x - r * g
+ // x = y
+ //
+ // The user needs to save "y = x - r * g" into TrainingInfoProto.algorithm.
+ // To indicate that "y" should be assigned to "x", the field "update_binding" may
+ // contain a key-value pair of strings, "x" (key of StringStringEntryProto)
+ // and "y" (value of StringStringEntryProto).
+ // For a neural network with multiple trainable (mutable) tensors, there can
+ // be multiple key-value pairs in "update_binding".
+ //
+ // The initializers that appear as keys in "update_binding" are considered
+ // mutable and globally-visible variables. This implies the behaviors
+ // described below.
+ //
+ // 1. Keys are unique across all "update_binding"s, so no two global
+ // variables may have the same name. This ensures that each
+ // global variable is assigned at most once.
+ // 2. The keys must appear in names of "ModelProto.graph.initializer" or
+ // "TrainingInfoProto.algorithm.initializer".
+ // 3. The values must be output names of "algorithm".
+ // 4. If an optional input of a graph is omitted when using GraphCall, the
+ // global variable with the same name may be used.
+ // 5. When using GraphCall, users can always pass values to optional
+ // inputs of the called graph even if the associated initializers appear
+ // as keys in "update_binding"s.
+ // 6. The graphs in "TrainingInfoProto"s can use global variables as
+ // their operator inputs.
+ // 7. Mutable variables are initialized to the value specified by the
+ // corresponding initializer, and then potentially updated by
+ // "initialization_binding"s and "update_binding"s in "TrainingInfoProto"s.
+ //
+ // This field usually contains the names of trainable tensors
+ // (in ModelProto.graph), optimizer states such as momentums in advanced
+ // stochastic gradient methods (in TrainingInfoProto.algorithm),
+ // and the number of training iterations (in TrainingInfoProto.algorithm).
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "algorithm".
+ repeated StringStringEntryProto update_binding = 4;
+}
+
// Models
//
// ModelProto is a top-level file/container format for bundling a ML model and
// associating its computation graph with metadata.
//
-// The semantics of the model are described by the associated GraphProto.
+// The semantics of the model are described by the associated GraphProtos.
message ModelProto {
// The version of the IR this model targets. See Version enum above.
// This field MUST be present.
@@ -246,6 +363,17 @@ message ModelProto {
// Named metadata values; keys should be distinct.
repeated StringStringEntryProto metadata_props = 14;
+
+ // Training-specific information. Sequentially executing all stored
+ // `TrainingInfoProto.algorithm`s and assigning their outputs following
+ // the corresponding `TrainingInfoProto.update_binding`s is one training
+ // iteration. Similarly, to initialize the model
+ // (as if training hasn't happened), the user should sequentially execute
+ // all stored `TrainingInfoProto.initialization`s and assign their outputs
+ // using `TrainingInfoProto.initialization_binding`s.
+ //
+ // If this field is empty, the training behavior of the model is undefined.
+ repeated TrainingInfoProto training_info = 20;
};
// StringStringEntryProto follows the pattern for cross-proto-version maps.
@@ -594,3 +722,4 @@ message OperatorSetIdProto {
// This field MUST be present in this version of the IR.
optional int64 version = 2;
}
+
diff --git a/onnx/onnx.proto b/onnx/onnx.proto
index f259410aeaf..5688f725ae1 100644
--- a/onnx/onnx.proto
+++ b/onnx/onnx.proto
@@ -90,7 +90,17 @@ enum Version {
// - Add support for sparse tensor constants stored in model.
// - Add message SparseTensorProto
// - Add sparse initializers
- IR_VERSION = 0x0000000000000006;
+ IR_VERSION_2019_9_19 = 0x0000000000000006;
+
+ // IR VERSION 7 published on
+ // - Add a list to promote inference graph's initializers to global and
+ // mutable variables. Global variables are visible in all graphs of the
+ // stored models.
+ // - Add message TrainingInfoProto to store initialization
+ // method and training algorithm. The execution of TrainingInfoProto
+ // can modify the values of mutable variables.
+ // - Make inference graph callable from TrainingInfoProto via GraphCall operator.
+ IR_VERSION = 0x0000000000000007;
}
// Attributes
@@ -197,12 +207,119 @@ message NodeProto {
optional string doc_string = 6;
}
+// Training information
+// TrainingInfoProto stores information for training a model.
+// In particular, this defines two functionalities: an initialization-step
+// and a training-algorithm-step. Initialization resets the model
+// back to its original state as if no training has been performed.
+// The training algorithm improves the model based on input data.
+//
+// The semantics of the initialization-step is that the initializers
+// in ModelProto.graph and in TrainingInfoProto.algorithm are first
+// initialized as specified by the initializers in the graph, and then
+// updated by the "initialization_binding" in every instance in
+// ModelProto.training_info.
+//
+// The field "algorithm" defines a computation graph which represents a
+// training algorithm's step. After the execution of a
+// TrainingInfoProto.algorithm, the initializers specified by "update_binding"
+// may be immediately updated. If the targeted training algorithm contains
+// consecutive update stages (such as block coordinate descent methods),
+// the user needs to create a TrainingInfoProto for each stage.
+message TrainingInfoProto {
+ // This field describes a graph to compute the initial tensors
+ // upon starting the training process. The initialization graph has no input
+ // and can have multiple outputs. Usually, trainable tensors in neural
+ // networks are randomly initialized. To achieve that, for each tensor,
+ // the user can put a random number operator such as RandomNormal or
+ // RandomUniform in TrainingInfoProto.initialization.node and assign its
+ // random output to the specific tensor using "initialization_binding".
+ // This graph can also set the initializers in "algorithm" in the same
+ // TrainingInfoProto; a use case is resetting the number of training
+ // iterations to zero.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ optional GraphProto initialization = 1;
+
+ // This field represents a training algorithm step. Given required inputs,
+ // it computes outputs to update initializers in its own or the inference
+ // graph's initializer lists. In general, this graph contains a loss node,
+ // gradient nodes, optimizer nodes, an increment of the iteration count, and
+ // some calls to the inference graph.
+ //
+ // The field algorithm.node is the only place where the user can use the
+ // GraphCall operator. The only callable graph is the one stored in
+ // ModelProto.graph.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ optional GraphProto algorithm = 2;
+
+ // This field specifies the bindings from the outputs of "initialization" to
+ // some initializers in "ModelProto.graph.initializer" and
+ // the "algorithm.initializer" in the same TrainingInfoProto.
+ // See "update_binding" below for details.
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "initialization".
+ repeated StringStringEntryProto initialization_binding = 3;
+
+ // Gradient-based training is usually an iterative procedure. In one gradient
+ // descent iteration, we apply
+ //
+ // x = x - r * g
+ //
+ // where "x" is the optimized tensor, "r" stands for learning rate, and "g" is
+ // gradient of "x" with respect to a chosen loss. To avoid adding assignments
+ // into the training graph, we split the update equation into
+ //
+ // y = x - r * g
+ // x = y
+ //
+ // The user needs to save "y = x - r * g" into TrainingInfoProto.algorithm.
+ // To indicate that "y" should be assigned to "x", the field "update_binding" may
+ // contain a key-value pair of strings, "x" (key of StringStringEntryProto)
+ // and "y" (value of StringStringEntryProto).
+ // For a neural network with multiple trainable (mutable) tensors, there can
+ // be multiple key-value pairs in "update_binding".
+ //
+ // The initializers that appear as keys in "update_binding" are considered
+ // mutable and globally-visible variables. This implies the behaviors
+ // described below.
+ //
+ // 1. Keys are unique across all "update_binding"s, so no two global
+ // variables may have the same name. This ensures that each
+ // global variable is assigned at most once.
+ // 2. The keys must appear in names of "ModelProto.graph.initializer" or
+ // "TrainingInfoProto.algorithm.initializer".
+ // 3. The values must be output names of "algorithm".
+ // 4. If an optional input of a graph is omitted when using GraphCall, the
+ // global variable with the same name may be used.
+ // 5. When using GraphCall, users can always pass values to optional
+ // inputs of the called graph even if the associated initializers appear
+ // as keys in "update_binding"s.
+ // 6. The graphs in "TrainingInfoProto"s can use global variables as
+ // their operator inputs.
+ // 7. Mutable variables are initialized to the value specified by the
+ // corresponding initializer, and then potentially updated by
+ // "initialization_binding"s and "update_binding"s in "TrainingInfoProto"s.
+ //
+ // This field usually contains the names of trainable tensors
+ // (in ModelProto.graph), optimizer states such as momentums in advanced
+ // stochastic gradient methods (in TrainingInfoProto.algorithm),
+ // and the number of training iterations (in TrainingInfoProto.algorithm).
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "algorithm".
+ repeated StringStringEntryProto update_binding = 4;
+}
+
// Models
//
// ModelProto is a top-level file/container format for bundling a ML model and
// associating its computation graph with metadata.
//
-// The semantics of the model are described by the associated GraphProto.
+// The semantics of the model are described by the associated GraphProtos.
message ModelProto {
// The version of the IR this model targets. See Version enum above.
// This field MUST be present.
@@ -247,6 +364,17 @@ message ModelProto {
// Named metadata values; keys should be distinct.
repeated StringStringEntryProto metadata_props = 14;
+
+ // Training-specific information. Sequentially executing all stored
+ // `TrainingInfoProto.algorithm`s and assigning their outputs following
+ // the corresponding `TrainingInfoProto.update_binding`s is one training
+ // iteration. Similarly, to initialize the model
+ // (as if training hasn't happened), the user should sequentially execute
+ // all stored `TrainingInfoProto.initialization`s and assign their outputs
+ // using `TrainingInfoProto.initialization_binding`s.
+ //
+ // If this field is empty, the training behavior of the model is undefined.
+ repeated TrainingInfoProto training_info = 20;
};
// StringStringEntryProto follows the pattern for cross-proto-version maps.
@@ -566,4 +694,9 @@ message OperatorSetIdProto {
// The version of the operator set being identified.
// This field MUST be present in this version of the IR.
optional int64 version = 2;
-}
\ No newline at end of file
+}
+
+
+// For using protobuf-lite
+option optimize_for = LITE_RUNTIME;
+
diff --git a/onnx/onnx.proto3 b/onnx/onnx.proto3
index 34e335725e2..f20a023c74b 100644
--- a/onnx/onnx.proto3
+++ b/onnx/onnx.proto3
@@ -90,7 +90,17 @@ enum Version {
// - Add support for sparse tensor constants stored in model.
// - Add message SparseTensorProto
// - Add sparse initializers
- IR_VERSION = 0x0000000000000006;
+ IR_VERSION_2019_9_19 = 0x0000000000000006;
+
+ // IR VERSION 7 published on
+ // - Add a list to promote inference graph's initializers to global and
+ // mutable variables. Global variables are visible in all graphs of the
+ // stored models.
+ // - Add message TrainingInfoProto to store initialization
+ // method and training algorithm. The execution of TrainingInfoProto
+ // can modify the values of mutable variables.
+ // - Make inference graph callable from TrainingInfoProto via GraphCall operator.
+ IR_VERSION = 0x0000000000000007;
}
// Attributes
@@ -197,12 +207,119 @@ message NodeProto {
string doc_string = 6;
}
+// Training information
+// TrainingInfoProto stores information for training a model.
+// In particular, this defines two functionalities: an initialization-step
+// and a training-algorithm-step. Initialization resets the model
+// back to its original state as if no training has been performed.
+// The training algorithm improves the model based on input data.
+//
+// The semantics of the initialization-step is that the initializers
+// in ModelProto.graph and in TrainingInfoProto.algorithm are first
+// initialized as specified by the initializers in the graph, and then
+// updated by the "initialization_binding" in every instance in
+// ModelProto.training_info.
+//
+// The field "algorithm" defines a computation graph which represents a
+// training algorithm's step. After the execution of a
+// TrainingInfoProto.algorithm, the initializers specified by "update_binding"
+// may be immediately updated. If the targeted training algorithm contains
+// consecutive update stages (such as block coordinate descent methods),
+// the user needs to create a TrainingInfoProto for each stage.
+message TrainingInfoProto {
+ // This field describes a graph to compute the initial tensors
+ // upon starting the training process. The initialization graph has no input
+ // and can have multiple outputs. Usually, trainable tensors in neural
+ // networks are randomly initialized. To achieve that, for each tensor,
+ // the user can put a random number operator such as RandomNormal or
+ // RandomUniform in TrainingInfoProto.initialization.node and assign its
+ // random output to the specific tensor using "initialization_binding".
+ // This graph can also set the initializers in "algorithm" in the same
+ // TrainingInfoProto; a use case is resetting the number of training
+ // iterations to zero.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ GraphProto initialization = 1;
+
+ // This field represents a training algorithm step. Given required inputs,
+ // it computes outputs to update initializers in its own or the inference
+ // graph's initializer lists. In general, this graph contains a loss node,
+ // gradient nodes, optimizer nodes, an increment of the iteration count, and
+ // some calls to the inference graph.
+ //
+ // The field algorithm.node is the only place where the user can use the
+ // GraphCall operator. The only callable graph is the one stored in
+ // ModelProto.graph.
+ //
+ // By default, this field is an empty graph and its evaluation does not
+ // produce any output.
+ GraphProto algorithm = 2;
+
+ // This field specifies the bindings from the outputs of "initialization" to
+ // some initializers in "ModelProto.graph.initializer" and
+ // the "algorithm.initializer" in the same TrainingInfoProto.
+ // See "update_binding" below for details.
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "initialization".
+ repeated StringStringEntryProto initialization_binding = 3;
+
+ // Gradient-based training is usually an iterative procedure. In one gradient
+ // descent iteration, we apply
+ //
+ // x = x - r * g
+ //
+ // where "x" is the optimized tensor, "r" stands for learning rate, and "g" is
+ // gradient of "x" with respect to a chosen loss. To avoid adding assignments
+ // into the training graph, we split the update equation into
+ //
+ // y = x - r * g
+ // x = y
+ //
+ // The user needs to save "y = x - r * g" into TrainingInfoProto.algorithm.
+ // To indicate that "y" should be assigned to "x", the field "update_binding" may
+ // contain a key-value pair of strings, "x" (key of StringStringEntryProto)
+ // and "y" (value of StringStringEntryProto).
+ // For a neural network with multiple trainable (mutable) tensors, there can
+ // be multiple key-value pairs in "update_binding".
+ //
+ // The initializers that appear as keys in "update_binding" are considered
+ // mutable and globally-visible variables. This implies the behaviors
+ // described below.
+ //
+ // 1. Keys are unique across all "update_binding"s, so no two global
+ // variables may have the same name. This ensures that each
+ // global variable is assigned at most once.
+ // 2. The keys must appear in names of "ModelProto.graph.initializer" or
+ // "TrainingInfoProto.algorithm.initializer".
+ // 3. The values must be output names of "algorithm".
+ // 4. If an optional input of a graph is omitted when using GraphCall, the
+ // global variable with the same name may be used.
+ // 5. When using GraphCall, users can always pass values to optional
+ // inputs of the called graph even if the associated initializers appear
+ // as keys in "update_binding"s.
+ // 6. The graphs in "TrainingInfoProto"s can use global variables as
+ // their operator inputs.
+ // 7. Mutable variables are initialized to the value specified by the
+ // corresponding initializer, and then potentially updated by
+ // "initialization_binding"s and "update_binding"s in "TrainingInfoProto"s.
+ //
+ // This field usually contains the names of trainable tensors
+ // (in ModelProto.graph), optimizer states such as momentums in advanced
+ // stochastic gradient methods (in TrainingInfoProto.algorithm),
+ // and the number of training iterations (in TrainingInfoProto.algorithm).
+ //
+ // By default, this field is empty and no initializer would be changed
+ // by the execution of "algorithm".
+ repeated StringStringEntryProto update_binding = 4;
+}
+
// Models
//
// ModelProto is a top-level file/container format for bundling a ML model and
// associating its computation graph with metadata.
//
-// The semantics of the model are described by the associated GraphProto.
+// The semantics of the model are described by the associated GraphProtos.
message ModelProto {
// The version of the IR this model targets. See Version enum above.
// This field MUST be present.
@@ -247,6 +364,17 @@ message ModelProto {
// Named metadata values; keys should be distinct.
repeated StringStringEntryProto metadata_props = 14;
+
+ // Training-specific information. Sequentially executing all stored
+ // `TrainingInfoProto.algorithm`s and assigning their outputs following
+ // the corresponding `TrainingInfoProto.update_binding`s is one training
+ // iteration. Similarly, to initialize the model
+ // (as if training hasn't happened), the user should sequentially execute
+ // all stored `TrainingInfoProto.initialization`s and assign their outputs
+ // using `TrainingInfoProto.initialization_binding`s.
+ //
+ // If this field is empty, the training behavior of the model is undefined.
+ repeated TrainingInfoProto training_info = 20;
};
// StringStringEntryProto follows the pattern for cross-proto-version maps.
@@ -566,4 +694,9 @@ message OperatorSetIdProto {
// The version of the operator set being identified.
// This field MUST be present in this version of the IR.
int64 version = 2;
-}
\ No newline at end of file
+}
+
+
+// For using protobuf-lite
+option optimize_for = LITE_RUNTIME;
+
diff --git a/tools/update_doc.bat b/tools/update_doc.bat
new file mode 100644
index 00000000000..029731ade24
--- /dev/null
+++ b/tools/update_doc.bat
@@ -0,0 +1,18 @@
+:: Run this script from the ONNX root directory under Anaconda.
+set CMAKE_ARGS="-DONNX_USE_PROTOBUF_SHARED_LIBS=ON"
+set ONNX_ML=1
+
+:: Regenerate the .proto/.proto3 descriptors (core and ONNX-ML variants).
+python onnx\gen_proto.py -l
+
+python onnx\gen_proto.py -l --ml
+
+:: Rebuild and install ONNX in development mode.
+python setup.py develop
+
+:: Regenerate backend test data and the test coverage document.
+python onnx\backend\test\cmd_tools.py generate-data
+
+python onnx\backend\test\stat_coverage.py
+
+:: Regenerate operator docs for ONNX-ML and core ONNX, then restore ONNX_ML.
+python onnx\defs\gen_doc.py
+set ONNX_ML=0
+python onnx\defs\gen_doc.py
+set ONNX_ML=1
\ No newline at end of file