From e9a978e706620037c5ab2855ee1b2db5e73d7769 Mon Sep 17 00:00:00 2001 From: niuyazhe Date: Wed, 1 Nov 2023 14:13:58 +0800 Subject: [PATCH] polish(nyz): polish api doc comments problems --- ding/envs/env/ding_env_wrapper.py | 14 +- ding/envs/env_manager/base_env_manager.py | 6 +- ding/envs/env_wrappers/env_wrappers.py | 2 + ding/model/common/head.py | 1 + ding/model/common/utils.py | 1 + ding/model/template/__init__.py | 1 + ding/model/template/acer.py | 33 +--- ding/model/template/maqac.py | 199 ++++------------------ ding/model/template/mavac.py | 9 +- ding/model/template/vae.py | 17 +- ding/policy/mbpolicy/mbsac.py | 1 - ding/reward_model/pwil_irl_model.py | 5 +- ding/rl_utils/td.py | 2 - 13 files changed, 68 insertions(+), 223 deletions(-) diff --git a/ding/envs/env/ding_env_wrapper.py b/ding/envs/env/ding_env_wrapper.py index 83fab06048..dc67e826bd 100644 --- a/ding/envs/env/ding_env_wrapper.py +++ b/ding/envs/env/ding_env_wrapper.py @@ -26,16 +26,16 @@ class DingEnvWrapper(BaseEnv): def __init__(self, env: gym.Env = None, cfg: dict = None, seed_api: bool = True, caller: str = 'collector') -> None: """ Overview: - Initialize the DingEnvWrapper. Either an environment instance or a config to create the environment - instance should be passed in: - - An environment instance: The `env` parameter must not be `None`, but should be the instance. - It does not support subprocess environment manager. Thus, it is usually used in simple environments. - - A config to create an environment instance: The `cfg` parameter must contain `env_id`. + Initialize the DingEnvWrapper. Either an environment instance or a config to create the environment \ + instance should be passed in. For the former, i.e., an environment instance: The `env` parameter must not \ + be `None`, but should be the instance. It does not support subprocess environment manager. Thus, it is \ + usually used in simple environments. For the latter, i.e., a config to create an environment instance: \ + The `cfg` parameter must contain `env_id`. Arguments: - env (:obj:`gym.Env`): An environment instance to be wrapped. - cfg (:obj:`dict`): The configuration dictionary to create an environment instance. - seed_api (:obj:`bool`): Whether to use seed API. Defaults to True. - - caller (:obj:`str`): A string representing the caller of this method, including ``collector`` or + - caller (:obj:`str`): A string representing the caller of this method, including ``collector`` or \ ``evaluator``. Different caller may need different wrappers. Default is 'collector'. """ self._env = None @@ -44,7 +44,7 @@ def __init__(self, env: gym.Env = None, cfg: dict = None, seed_api: bool = True, self._seed_api = seed_api # some env may disable `env.seed` api self._caller = caller if self._cfg is None: - self._cfg = dict() + self._cfg = {} self._cfg = EasyDict(self._cfg) if 'act_scale' not in self._cfg: self._cfg.act_scale = False diff --git a/ding/envs/env_manager/base_env_manager.py b/ding/envs/env_manager/base_env_manager.py index 96aa43f817..529d5b0225 100644 --- a/ding/envs/env_manager/base_env_manager.py +++ b/ding/envs/env_manager/base_env_manager.py @@ -562,6 +562,9 @@ def closed(self) -> bool: """ return self._closed + def random_action(self) -> Dict: + return {env_id: self._env_ref.action_space.sample() for env_id in self.ready_obs_id} + @ENV_MANAGER_REGISTRY.register('base_v2') class BaseEnvManagerV2(BaseEnvManager): @@ -577,7 +580,8 @@ class BaseEnvManagerV2(BaseEnvManager): .. 
note:: For more details about new task pipeline, please refer to the system document of DI-engine \ - (`en link <../03_system/index.html>`_). + (`system en link <../03_system/index.html>`_). + Interfaces: reset, step, seed, close, enable_save_replay, launch, default_config, reward_shaping, enable_save_figure Properties: diff --git a/ding/envs/env_wrappers/env_wrappers.py b/ding/envs/env_wrappers/env_wrappers.py index 76c0880d59..f62de83352 100644 --- a/ding/envs/env_wrappers/env_wrappers.py +++ b/ding/envs/env_wrappers/env_wrappers.py @@ -39,6 +39,8 @@ - GymToGymnasiumWrapper: Adapts environments from the Gym library to be compatible with the Gymnasium library. - AllinObsWrapper: Consolidates all information into the observation, useful for environments where the agent's observation should include additional information such as the current score or time remaining. +- ObsPlusPrevActRewWrapper: This wrapper is used in policy NGU. It sets a dict as the new wrapped observation, + which includes the current observation, previous action and previous reward. """ import copy diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 30f5b58d98..99e94a85b1 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -1293,6 +1293,7 @@ def forward(self, key: torch.Tensor, query: torch.Tensor) -> torch.Tensor: >>> query = torch.randn(4, 64) >>> logit = head(key, query) >>> assert logit.shape == torch.Size([4, 5]) + .. note:: In this head, we assume that the ``key`` and ``query`` tensor are both normalized. """ diff --git a/ding/model/common/utils.py b/ding/model/common/utils.py index 0ca8df7fb5..f74a179962 100644 --- a/ding/model/common/utils.py +++ b/ding/model/common/utils.py @@ -21,6 +21,7 @@ def create_model(cfg: EasyDict) -> torch.nn.Module: >>> 'action_shape': 2, >>> }) >>> model = create_model(cfg) + .. tip:: This method will not modify the ``cfg`` , it will deepcopy the ``cfg`` and then modify it. """ diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index b2dd815287..4a63c3dcc6 100755 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -26,3 +26,4 @@ from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS from .bcq import BCQ from .edac import EDAC +from .ebm import EBM, AutoregressiveEBM diff --git a/ding/model/template/acer.py b/ding/model/template/acer.py index bb46b22bec..44bb386cba 100644 --- a/ding/model/template/acer.py +++ b/ding/model/template/acer.py @@ -85,40 +85,15 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: Use observation to predict output. Parameter updates with ACER's MLPs forward setup. Arguments: - Forward with ``'compute_actor'``: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - Whether ``actor_head_hidden_size`` or ``critic_head_hidden_size`` depend on ``mode``. - - Forward with ``'compute_critic'``, inputs:`torch.Tensor` Necessary Keys: - - ``obs`` encoded tensors. - - mode (:obj:`str`): Name of the forward mode. Returns: - outputs (:obj:`Dict`): Outputs of network forward. - - Forward with ``'compute_actor'``, Necessary Keys (either): - - logit (:obj:`torch.Tensor`): - - logit (:obj:`torch.Tensor`): Logit encoding tensor. - - Forward with ``'compute_critic'``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor. 
- Actor Shapes: + Shapes (Actor): - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - logit (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - Critic Shapes: + Shapes (Critic): - inputs (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``obs_shape`` - q_value (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - Actor Examples: - >>> # Regression mode - >>> model = ACER(64, 64) - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['logit'].shape == torch.Size([4, 64]) - Critic Examples: - >>> inputs = torch.randn(4,N) - >>> model = ACER(obs_shape=(N, ),action_shape=5) - >>> model(inputs, mode='compute_critic')['q_value'] """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) @@ -127,7 +102,7 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to predict output. - Execute parameter updates with ``'compute_actor'`` mode + Execute parameter updates with ``compute_actor`` mode Use encoded embedding tensor to predict output. Arguments: - inputs (:obj:`torch.Tensor`): @@ -156,7 +131,7 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict: def compute_critic(self, inputs: torch.Tensor) -> Dict: """ Overview: - Execute parameter updates with ``'compute_critic'`` mode + Execute parameter updates with ``compute_critic`` mode Use encoded embedding tensor to predict output. Arguments: - ``obs``, ``action`` encoded tensors. diff --git a/ding/model/template/maqac.py b/ding/model/template/maqac.py index ba74b97573..2d72e43d53 100644 --- a/ding/model/template/maqac.py +++ b/ding/model/template/maqac.py @@ -1,6 +1,5 @@ from typing import Union, Dict, Optional from easydict import EasyDict -import numpy as np import torch import torch.nn as nn @@ -96,7 +95,7 @@ def __init__( def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: """ Overview: - Use observation tensor to predict output, with ``'compute_actor'`` or ``'compute_critic'`` mode. + Use observation tensor to predict output, with ``compute_actor`` or ``compute_critic`` mode. Arguments: - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: @@ -109,41 +108,11 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - Forward with ``'compute_actor'``, Necessary Keys (either): - - logit (:obj:`torch.Tensor`): Action's probabilities. - - action_mask (:obj:`torch.Tensor`): Action mask tensor with same size as ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor is the shape of :math:`(B, A, N2)`, where B is batch size \ - and A is agent num. N2 corresponds to ``action_shape``. 
- Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ - A is agent num. N2 corresponds to ``action_shape``. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - Forward with ``'compute_actor'``, Necessary Keys (either): - - logit (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - action_mask (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ - A is agent num. N2 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, \ + whose key-values vary in different forward modes. Examples: >>> B = 32 >>> agent_obs_shape = 216 @@ -181,25 +150,11 @@ def compute_actor(self, inputs: Dict) -> Dict: with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. Returns: - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - - logit (:obj:`torch.Tensor`): Action's probabilities. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, \ + whose key-values vary in different forward modes. + - logit (:obj:`torch.Tensor`): Action's output logit (real value range), whose shape is \ + :math:`(B, A, N2)`, where N2 corresponds to ``action_shape``. - action_mask (:obj:`torch.Tensor`): Action mask tensor with same size as ``action_shape``. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. 
- - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - - logit (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - action_mask (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. Examples: >>> B = 32 >>> agent_obs_shape = 216 @@ -237,31 +192,11 @@ def compute_critic(self, inputs: Dict) -> Dict: with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. Returns: - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor is the shape of :math:`(B, A, N2)`, where B is batch size \ - and A is agent num. N2 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ - A is agent num. N2 corresponds to ``action_shape``. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ - A is agent num. N2 corresponds to ``action_shape``. - if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, \ + whose key-values vary in different values of ``twin_critic``. + - q_value (:obj:`list`): If ``twin_critic=True``, q_value should be 2 elements, each is the shape of \ + :math:`(B, A, N2)`, where B is batch size and A is agent num. N2 corresponds to ``action_shape``. \ + Otherwise, q_value should be ``torch.Tensor``. Examples: >>> B = 32 >>> agent_obs_shape = 216 @@ -397,7 +332,7 @@ def __init__( def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: """ Overview: - Use observation and action tensor to predict output in ``'compute_actor'`` or ``'compute_critic'`` mode. + Use observation and action tensor to predict output in ``compute_actor`` or ``compute_critic`` mode. 
Arguments: - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: @@ -410,54 +345,21 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. + - ``action`` (:obj:`torch.Tensor`): The action tensor data, \ with shape :math:`(B, A, N3)`, where B is batch size and A is agent num. \ N3 corresponds to ``action_shape``. - mode (:obj:`str`): Name of the forward mode. Returns: - - outputs (:obj:`Dict`): Outputs of network forward. - Forward with ``'compute_actor'``, if action_space == 'regression', Necessary Keys: - - action (:obj:`torch.Tensor`): Action tensor with same size as ``action_shape``. - Forward with ``'compute_actor'``, if action_space == 'reparameterization', Necessary Keys: - - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ - A is agent num. N3 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ - A is agent num. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - ``action`` (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ - N3 corresponds to ``action_shape``. - - outputs (:obj:`Dict`): Outputs of network forward. - Forward with ``'compute_actor'``, if action_space == 'regression', Necessary Keys: - - action (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ - N3 corresponds to ``action_shape``. - Forward with ``'compute_actor'``, if action_space == 'reparameterization', Necessary Keys: - - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ - A is agent num. N3 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ - A is agent num. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. + - outputs (:obj:`Dict`): Outputs of network forward, whose key-values will be different for different \ + ``mode``, ``twin_critic``, ``action_space``. 
Examples: >>> B = 32 >>> agent_obs_shape = 216 >>> global_obs_shape = 264 >>> agent_num = 8 >>> action_shape = 14 - >>> action_space = 'regression' - >>> # or - >>> action_space = 'reparameterization' + >>> act_space = 'reparameterization' # regression >>> data = { >>> 'obs': { >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), @@ -466,7 +368,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: >>> }, >>> 'action': torch.randn(B, agent_num, squeeze(action_shape)) >>> } - >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, act_space, twin_critic=False) >>> if action_space == 'regression': >>> action = model(data['obs'], mode='compute_actor')['action'] >>> elif action_space == 'reparameterization': @@ -485,37 +387,25 @@ def compute_actor(self, inputs: Dict) -> Dict: - ``agent_state`` (:obj:`torch.Tensor`): The agent's observation tensor data, \ with shape :math:`(B, A, N0)`, where B is batch size and A is agent num. \ N0 corresponds to ``agent_obs_shape``. + Returns: - outputs (:obj:`Dict`): Outputs of network forward. - if action_space == 'regression', Necessary Keys: - - action (:obj:`torch.Tensor`): Action tensor with same size as ``action_shape``. - if action_space == 'reparameterization', Necessary Keys: - - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ - A is agent num. N3 corresponds to ``action_shape``. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - outputs (:obj:`Dict`): Outputs of network forward. - if action_space == 'regression', Necessary Keys: - - action (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ - N3 corresponds to ``action_shape``. - if action_space == 'reparameterization', Necessary Keys: - - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ - A is agent num. N3 corresponds to ``action_shape``. + ReturnKeys (``action_space == 'regression'``): + - action (:obj:`torch.Tensor`): Action tensor with same size as ``action_shape``. + ReturnKeys (``action_space == 'reparameterization'``): + - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ + A is agent num. N3 corresponds to ``action_shape``. Examples: >>> B = 32 >>> agent_obs_shape = 216 >>> global_obs_shape = 264 >>> agent_num = 8 >>> action_shape = 14 - >>> action_space = 'regression' - >>> # or - >>> action_space = 'reparameterization' + >>> act_space = 'reparameterization' # 'regression' >>> data = { >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), >>> } - >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, act_space, twin_critic=False) >>> if action_space == 'regression': >>> action = model.compute_actor(data)['action'] >>> elif action_space == 'reparameterization': @@ -545,42 +435,25 @@ def compute_critic(self, inputs: Dict) -> Dict: - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. 
\ N2 corresponds to ``action_shape``. + - ``action`` (:obj:`torch.Tensor`): The action tensor data, \ with shape :math:`(B, A, N3)`, where B is batch size and A is agent num. \ N3 corresponds to ``action_shape``. + Returns: - outputs (:obj:`Dict`): Outputs of network forward. - if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ - A is agent num. - if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - ``action`` (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ - N3 corresponds to ``action_shape``. - - outputs (:obj:`Dict`): Outputs of network forward. - if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ - A is agent num. - if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. + ReturnKeys (``twin_critic=True``): + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ + A is agent num. + ReturnKeys (``twin_critic=False``): + - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. Examples: >>> B = 32 >>> agent_obs_shape = 216 >>> global_obs_shape = 264 >>> agent_num = 8 >>> action_shape = 14 - >>> action_space = 'regression' - >>> # or - >>> action_space = 'reparameterization' + >>> act_space = 'reparameterization' # 'regression' >>> data = { >>> 'obs': { >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), @@ -589,7 +462,7 @@ def compute_critic(self, inputs: Dict) -> Dict: >>> }, >>> 'action': torch.randn(B, agent_num, squeeze(action_shape)) >>> } - >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, act_space, twin_critic=False) >>> value = model.compute_critic(data)['q_value'] """ diff --git a/ding/model/template/mavac.py b/ding/model/template/mavac.py index cdd521f2b1..78071e6783 100644 --- a/ding/model/template/mavac.py +++ b/ding/model/template/mavac.py @@ -52,8 +52,8 @@ def __init__( - actor_head_layer_num (:obj:`int`): The num of layers used in the ``actor_head`` network to compute action. - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of ``critic_head`` network, defaults \ to 512, it must match the last element of ``global_obs_shape``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output for \ + critic's nn. 
- action_space (:obj:`Union[int, SequenceType]`): The type of different action spaces, including \ ['discrete', 'continuous'], then will instantiate corresponding head, including ``DiscreteHead`` \ and ``ReparameterizationHead``. @@ -180,8 +180,7 @@ def compute_actor(self, x: Dict) -> Dict: - action_mask(optional): (:obj:`torch.Tensor`): When ``action_space`` is discrete, action_mask needs \ to be provided to mask illegal actions. Returns: - - outputs (:obj:`Dict`): - The output dict of MAVAC's forward computation graph for actor, including ``logit``. + - outputs (:obj:`Dict`): The output dict of the forward computation graph for actor, including ``logit``. ReturnsKeys: - logit (:obj:`torch.Tensor`): The predicted action logit tensor, for discrete action space, it will be \ the same dimension real-value ranged tensor of possible action choices, and for continuous action \ @@ -253,7 +252,7 @@ def compute_actor_critic(self, x: Dict) -> Dict: MAVAC forward computation graph for both actor and critic part, input observation to predict action \ logit and state value. Arguments: - - x (:obj:Dict): The input dict contains ``agent_state``, ``global_state`` and other related info. + - x (:obj:`Dict`): The input dict contains ``agent_state``, ``global_state`` and other related info. Returns: - outputs (:obj:`Dict`): The output dict of MAVAC's forward computation graph for both actor and critic, \ including ``logit`` and ``value``. diff --git a/ding/model/template/vae.py b/ding/model/template/vae.py index 9839f0e905..f3181361c7 100644 --- a/ding/model/template/vae.py +++ b/ding/model/template/vae.py @@ -184,22 +184,17 @@ def forward(self, input: Dict[str, Tensor], **kwargs) -> dict: 'z': z } - def loss_function(self, args: Dict[str, Tensor], **kwargs) -> dict: + def loss_function(self, args: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]: """ Overview: Computes the VAE loss function. - KL(N(\mu, \sigma), N(0, 1)) = \log \frac{1}{\sigma} + \frac{\sigma^2 + \mu^2}{2} - \frac{1}{2} Arguments: - - args (:obj:`Dict`): Dict containing keywords `recons_action` (:obj:`torch.Tensor`) \ and `prediction_residual` (:obj:`torch.Tensor`), `original_action` (:obj:`torch.Tensor`), \ `mu` (:obj:`torch.Tensor`), `log_var` (:obj:`torch.Tensor`) and \ `true_residual` (:obj:`torch.Tensor`). - - kwargs (:obj:`Dict`): Dict containing keywords `kld_weight` (:obj:`torch.Tensor`) \ and `predict_weight` (:obj:`torch.Tensor`). + - args (:obj:`Dict[str, Tensor]`): Dict containing keywords ``recons_action``, ``prediction_residual``, \ ``original_action``, ``mu``, ``log_var`` and ``true_residual``. + - kwargs (:obj:`Dict`): Dict containing keywords ``kld_weight`` and ``predict_weight``. Returns: - - outputs (:obj: `Dict`): Dict containing keywords `loss` \ (`obj`:`torch.Tensor`), `reconstruction_loss` (:obj: `torch.Tensor`), \ `kld_loss` (:obj: `torch.Tensor`) and `predict_loss` (:obj: `torch.Tensor`). + - outputs (:obj:`Dict[str, Tensor]`): Dict containing different ``loss`` results, including ``loss``, \ ``reconstruction_loss``, ``kld_loss``, ``predict_loss``. Shapes: - recons_action (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size \ and A is ``action dim``. diff --git a/ding/policy/mbpolicy/mbsac.py b/ding/policy/mbpolicy/mbsac.py index 7af83021e2..1918e161db 100644 --- a/ding/policy/mbpolicy/mbsac.py +++ b/ding/policy/mbpolicy/mbsac.py @@ -36,7 +36,6 @@ class MBSACPolicy(SACPolicy): == ==================== ======== ============= ================================== ..
note:: - For other configs, please refer to ding.policy.sac.SACPolicy. """ diff --git a/ding/reward_model/pwil_irl_model.py b/ding/reward_model/pwil_irl_model.py index 5fec46b821..8738ee2d81 100644 --- a/ding/reward_model/pwil_irl_model.py +++ b/ding/reward_model/pwil_irl_model.py @@ -45,10 +45,7 @@ class PwilRewardModel(BaseRewardModel): | ``path`` .pkl | | file 3 | ``sample_size`` int 1000 | sample data from expert dataset | | with fixed size | - 4 | ``alpha`` int 5 | factor alpha | r = alpha * exp( - | (-beta*T/sqrt( - | |s_size|+ |a_size|) - | )*c_i) + 4 | ``alpha`` int 5 | factor alpha | 5 | ``beta`` int 5 | factor beta | 6 | ``s_size`` int 4 | state size | 7 | ``a_size`` int 2 | action size | diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 7c5b995eaa..4dd2df6c4b 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -722,8 +722,6 @@ def bdq_nstep_td_error( Deep Reinforcement Learning", link: https://arxiv.org/pdf/1711.08946. In fact, the original paper only provides the 1-step TD-error calculation method, and here we extend the \ calculation method of n-step, i.e., TD-error: - :math:`y_d = \sigma_{t=0}^{nstep} \gamma^t * r_t + \gamma^{nstep} * Q_d'(s', argmax Q_d(s', a_d))` - :math:`TD-error = \frac{1}{D} * (y_d - Q_d(s, a_d))^2` Arguments: - data (:obj:`q_nstep_td_data`): The input data, q_nstep_td_data to calculate loss - gamma (:obj:`float`): Discount factor
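For a quick sanity check of the polished maqac.py docstrings, the Examples blocks above can be assembled into one self-contained script. The following is a minimal sketch based only on those docstring examples, assuming DI-engine is installed and that ``ContinuousMAQAC`` and ``squeeze`` keep the signatures shown in the hunks above::

    # Minimal usage sketch assembled from the docstring Examples in the maqac.py hunks above.
    # Assumes DI-engine is installed; class and helper signatures follow the patched docstrings.
    import torch
    from ding.model.template.maqac import ContinuousMAQAC
    from ding.utils import squeeze

    B, agent_num = 32, 8
    agent_obs_shape, global_obs_shape, action_shape = 216, 264, 14
    act_space = 'reparameterization'  # or 'regression'

    data = {
        'obs': {
            'agent_state': torch.randn(B, agent_num, agent_obs_shape),
            'global_state': torch.randn(B, agent_num, global_obs_shape),
            'action_mask': torch.randint(0, 2, size=(B, agent_num, action_shape)),
        },
        'action': torch.randn(B, agent_num, squeeze(action_shape)),
    }
    model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, act_space, twin_critic=False)

    # compute_actor: the returned keys depend on the action space, as documented above.
    if act_space == 'regression':
        action = model(data['obs'], mode='compute_actor')['action']    # (B, A, N3)
    else:
        mu, sigma = model(data['obs'], mode='compute_actor')['logit']  # each (B, A, N3)

    # compute_critic: q_value is a single tensor here, or a list of two tensors when twin_critic=True.
    q_value = model(data, mode='compute_critic')['q_value']            # (B, A)

Running the script under both settings of ``act_space`` is an easy way to confirm that the ReturnKeys documented for ``compute_actor`` and ``compute_critic`` match the actual outputs.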