remove minibatch_size and minibatches_per_step #623

Open · wants to merge 1 commit into main
@@ -13,7 +13,6 @@ model:
 maxq_learning: true
 temperature: 1.0
 double_q_learning: true
-minibatches_per_step: 1
 num_atoms: 21
 qmin: 0
 qmax: 40
@@ -13,7 +13,6 @@ model:
 maxq_learning: true
 temperature: 1.0
 double_q_learning: true
-minibatches_per_step: 1
 optimizer:
   Adam:
     lr: 0.01
@@ -13,7 +13,6 @@ model:
 maxq_learning: true
 temperature: 1.0
 double_q_learning: true
-minibatches_per_step: 1
 num_atoms: 11
 optimizer:
   AdamW:
@@ -10,7 +10,6 @@ model:
 maxq_learning: true
 temperature: 1.0
 double_q_learning: true
-minibatches_per_step: 1
 optimizer:
   AdamW:
     lr: 0.001
@@ -14,7 +14,6 @@ model:
 maxq_learning: false
 temperature: 0.35
 double_q_learning: true
-minibatches_per_step: 1
 optimizer:
   Adam:
     lr: 0.05
@@ -15,7 +15,6 @@ model:
 maxq_learning: true
 temperature: 1.0
 double_q_learning: true
-minibatches_per_step: 1
 optimizer:
   Adam:
     lr: 0.05
@@ -20,7 +20,6 @@ model:
 temperature: 0.01
 q_network_loss: mse
 double_q_learning: true
-minibatches_per_step: 1
 optimizer:
   Adam:
     lr: 0.01
@@ -17,7 +17,6 @@ model:
 maxq_learning: true
 temperature: 10.0
 double_q_learning: true
-minibatches_per_step: 1
 optimizer:
   AdamW:
     lr: 0.005
@@ -31,8 +31,6 @@ train_model:
 maxq_learning: true
 q_network_loss: mse
 double_q_learning: true
-minibatch_size: 1024
-minibatches_per_step: 1
 optimizer:
   Adam:
     lr: 0.001
9 changes: 1 addition & 8 deletions reagent/gym/tests/test_gym.py
@@ -191,7 +191,7 @@ def run_test_replay_buffer(
     passing_score_bar: float,
     num_eval_episodes: int,
     use_gpu: bool,
-    minibatch_size: Optional[int] = None,
+    minibatch_size: int,
 ):
     """
     Run an online learning test with a replay buffer. The replay buffer is pre-filled, then the training starts.
@@ -212,13 +212,6 @@ def run_test_replay_buffer(
     )
     training_policy = manager.create_policy(trainer, serving=False)

-    if not isinstance(trainer, pl.LightningModule):
-        if minibatch_size is None:
-            minibatch_size = trainer.minibatch_size
-        assert minibatch_size == trainer.minibatch_size
-
-    assert minibatch_size is not None
-
     replay_buffer = ReplayBuffer(
         replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
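With the trainer-side fallback removed, callers of run_test_replay_buffer must carry the batch size themselves. The following is a minimal sketch of the resulting calling pattern, not part of this diff: the import path and the example values are assumptions; only the ReplayBuffer keyword arguments are taken from the hunk above.

# Hedged sketch, not part of this PR: the batch size is now passed in
# explicitly instead of being read back from trainer.minibatch_size.
from reagent.replay_memory.circular_replay_buffer import ReplayBuffer  # assumed path

minibatch_size = 512          # e.g. taken from the test's YAML config (assumed)
replay_memory_size = 100_000  # illustrative capacity

replay_buffer = ReplayBuffer(
    replay_capacity=replay_memory_size, batch_size=minibatch_size
)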
7 changes: 0 additions & 7 deletions reagent/training/c51_trainer.py
@@ -28,8 +28,6 @@ def __init__(
         actions: List[str] = field(default_factory=list), # noqa: B008
         rl: RLParameters = field(default_factory=RLParameters), # noqa: B008
         double_q_learning: bool = True,
-        minibatch_size: int = 1024,
-        minibatches_per_step: int = 1,
         num_atoms: int = 51,
         qmin: float = -100,
         qmax: float = 200,
@@ -45,9 +43,6 @@ def __init__(
             rl (optional): an instance of the RLParameter class, which
                 defines relevant hyperparameters
             double_q_learning (optional): whether or not double Q learning, enabled by default,
-            minibatch_size (optional): the size of the minibatch
-            minibatches_per_step (optional): the number of minibatch updates
-                per training step
             num_atoms (optional): number of "canonical returns"in the discretized value distributions
             qmin (optional): minimum q-value
             qmax (optional): maximum q-value
@@ -56,8 +51,6 @@ def __init__(
         """
         super().__init__()
         self.double_q_learning = double_q_learning
-        self.minibatch_size = minibatch_size
-        self.minibatches_per_step = minibatches_per_step
         self._actions = actions
         self.q_network = q_network
         self.q_network_target = q_network_target
6 changes: 0 additions & 6 deletions reagent/training/dqn_trainer.py
@@ -43,8 +43,6 @@ def __init__(
         rl: RLParameters = field(default_factory=RLParameters), # noqa: B008
         double_q_learning: bool = True,
         bcq: Optional[BCQConfig] = None,
-        minibatch_size: int = 1024,
-        minibatches_per_step: int = 1,
         optimizer: Optimizer__Union = field( # noqa: B008
             default_factory=Optimizer__Union.default
         ),
@@ -62,8 +60,6 @@ def __init__(
             rl: RLParameters
             double_q_learning: boolean flag to use double-q learning
             bcq: a config file for batch-constrained q-learning, defaults to normal
-            minibatch_size: samples per minibatch
-            minibatches_per_step: minibatch updates per step
             optimizer: q-network optimizer
             evaluation: evaluation params, primarily whether to use CPE in eval or not
         """
@@ -75,8 +71,6 @@ def __init__(
         )
         assert self._actions is not None, "Discrete-action DQN needs action names"
         self.double_q_learning = double_q_learning
-        self.minibatch_size = minibatch_size
-        self.minibatches_per_step = minibatches_per_step or 1

         self.q_network = q_network
         self.q_network_target = q_network_target
2 changes: 0 additions & 2 deletions reagent/training/parametric_dqn_trainer.py
@@ -28,7 +28,6 @@ def __init__(
         # Start ParametricDQNTrainerParameters
         rl: rlp.RLParameters = field(default_factory=rlp.RLParameters), # noqa: B008
         double_q_learning: bool = True,
-        minibatches_per_step: int = 1,
         optimizer: Optimizer__Union = field( # noqa: B008
             default_factory=Optimizer__Union.default
         ),
@@ -38,7 +37,6 @@ def __init__(
         self.rl_parameters = rl

         self.double_q_learning = double_q_learning
-        self.minibatches_per_step = minibatches_per_step or 1

         self.q_network = q_network
         self.q_network_target = q_network_target
4 changes: 0 additions & 4 deletions reagent/training/qrdqn_trainer.py
@@ -37,8 +37,6 @@ def __init__(
         rl: RLParameters = field(default_factory=RLParameters), # noqa: B008
         double_q_learning: bool = True,
         num_atoms: int = 51,
-        minibatch_size: int = 1024,
-        minibatches_per_step: int = 1,
         optimizer: Optimizer__Union = field( # noqa: B008
             default_factory=Optimizer__Union.default
         ),
@@ -57,8 +55,6 @@ def __init__(
         )
         # TODO: check to ensure no rl parameter value is set that isn't actively used by class
         self.double_q_learning = double_q_learning
-        self.minibatch_size = minibatch_size
-        self.minibatches_per_step = minibatches_per_step
         self._actions = actions

         self.q_network = q_network
7 changes: 0 additions & 7 deletions reagent/training/td3_trainer.py
@@ -37,11 +37,9 @@ def __init__(
         actor_network_optimizer: Optimizer__Union = field( # noqa: B008
             default_factory=Optimizer__Union.default
         ),
-        minibatch_size: int = 64,
         noise_variance: float = 0.2,
         noise_clip: float = 0.5,
         delayed_policy_update: int = 2,
-        minibatches_per_step: int = 1,
     ) -> None:
         """
         Args:
@@ -54,20 +52,15 @@ def __init__(
             q_network_optimizer (optional): the optimizer class and
                 optimizer hyperparameters for the q network(s) optimizer
             actor_network_optimizer (optional): see q_network_optimizer
-            minibatch_size (optional): the size of the minibatch
             noise_variance (optional): the variance of action noise added to smooth
                 q-value estimates
             noise_clip (optional): the maximum absolute value of action noise added
                 to smooth q-value estimates
             delayed_policy_update (optional): the ratio of q network updates
                 to target and policy network updates
-            minibatches_per_step (optional, TODO: currently unused): the number of minibatch updates
-                per training step
         """
         super().__init__()
         self.rl_parameters = rl
-        self.minibatch_size = minibatch_size
-        self.minibatches_per_step = minibatches_per_step or 1

         self.q1_network = q1_network
         self.q1_network_target = copy.deepcopy(self.q1_network)
@@ -19,8 +19,6 @@ model:
 softmax_policy: false
 q_network_loss: mse
 double_q_learning: true
-minibatch_size: 512
-minibatches_per_step: 1
 optimizer:
   Adam:
     lr: 0.01
2 changes: 0 additions & 2 deletions serving/examples/ecommerce/training/contextual_bandit.yaml
@@ -17,8 +17,6 @@ model:
 softmax_policy: false
 q_network_loss: mse
 double_q_learning: true
-minibatch_size: 128
-minibatches_per_step: 1
 optimizer:
   Adam:
     lr: 0.01