Fix Determinism #492

Merged
22 commits merged on Oct 11, 2019
Changes from 19 commits
3 changes: 3 additions & 0 deletions .travis.yml
@@ -34,6 +34,9 @@ jobs:
- name: "Unit Tests sb-z"
env: TEST_GLOB="{s[b-z]*,[t-z]*}"

- name: "Unit Tests determinism"
env: TEST_GLOB="0deterministic.py"

- name: "Sphinx Documentation"
script:
- 'docker run -it --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pip install .[docs] && pushd docs/ && make clean && make html"'
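The new "Unit Tests determinism" CI job above globs only `0deterministic.py`, so the determinism check runs in its own Travis worker. The test file itself is not part of this diff; below is a minimal sketch of what such a check could look like, with the helper names, environment, and timestep budget chosen purely for illustration:

```python
import gym
import numpy as np

from stable_baselines import A2C
from stable_baselines.common.vec_env import DummyVecEnv


def train_and_predict(seed):
    # Build, train and roll out a model with a fixed seed and a single TF thread
    env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    model = A2C("MlpPolicy", env, seed=seed, n_cpu_tf_sess=1)
    model.learn(total_timesteps=1000)

    obs = env.reset()
    actions = []
    for _ in range(20):
        action, _ = model.predict(obs, deterministic=True)
        actions.append(action)
        obs, _, _, _ = env.step(action)
    return np.array(actions)


def test_a2c_determinism():
    # Two independent runs with the same seed should produce identical actions
    assert np.allclose(train_and_predict(seed=0), train_and_predict(seed=0))
```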
11 changes: 9 additions & 2 deletions docs/misc/changelog.rst
@@ -6,20 +6,27 @@ Changelog
For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.


Pre-Release 2.8.1a0 (WIP)
Pre-Release 2.9.0a0 (WIP)
--------------------------

Breaking Changes:
^^^^^^^^^^^^^^^^^
- The `seed` argument has been moved from `learn()` method to model constructor
in order to have reproducible results

New Features:
^^^^^^^^^^^^^
- Add `n_cpu_tf_sess` to model constructor to choose the number of threads used by Tensorflow

Bug Fixes:
^^^^^^^^^^
- Fix seeding, so it is now possible to have deterministic results on cpu
- Fix a bug in DDPG where `predict` method with `deterministic=False` would fail

Deprecations:
^^^^^^^^^^^^^
- `nprocs` (ACKTR) and `num_procs` (ACER) are deprecated in favor of `n_cpu_tf_sess` which is now common
to all algorithms

Others:
^^^^^^^
@@ -508,4 +515,4 @@ In random order...
Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck
@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol
@XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs
@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150
@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp
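The breaking change listed in the changelog above moves seeding from `learn()` to the model constructor. A minimal before/after sketch of the migration (environment and timestep count are placeholders):

```python
from stable_baselines import A2C

# Before (<= 2.8.0): the seed was passed to learn()
# model = A2C('MlpPolicy', 'CartPole-v1')
# model.learn(total_timesteps=10000, seed=0)

# After (2.9.0a0): the seed is a constructor argument; set n_cpu_tf_sess=1
# if you need fully deterministic results on CPU
model = A2C('MlpPolicy', 'CartPole-v1', seed=0, n_cpu_tf_sess=1)
model.learn(total_timesteps=10000)
```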
2 changes: 1 addition & 1 deletion setup.py
@@ -140,7 +140,7 @@
license="MIT",
long_description=long_description,
long_description_content_type='text/markdown',
version="2.8.0",
version="2.9.0a0",
)

# python setup.py sdist
2 changes: 1 addition & 1 deletion stable_baselines/__init__.py
@@ -20,4 +20,4 @@
from stable_baselines.trpo_mpi import TRPO
del mpi4py

__version__ = "2.8.0"
__version__ = "2.9.0a0"
23 changes: 16 additions & 7 deletions stable_baselines/a2c/a2c.py
@@ -38,14 +38,21 @@ class A2C(ActorCriticRLModel):
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param full_tensorboard_log: (bool) enable additional logging when using tensorboard
WARNING: this logging can take a lot of space quickly
:param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
Note that if you want completely deterministic results, you must set
`n_cpu_tf_sess` to 1
:param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
If None, the number of cpu of the current machine will be used.
"""

def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, max_grad_norm=0.5,
learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='constant', verbose=0, tensorboard_log=None,
_init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False):
learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='constant', verbose=0,
tensorboard_log=None, _init_setup_model=True, policy_kwargs=None,
full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None):

super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.gamma = gamma
@@ -99,7 +106,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(graph=self.graph)
self.set_random_seed(self.seed)
self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

self.n_batch = self.n_envs * self.n_steps

@@ -216,15 +224,14 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ

return policy_loss, value_loss, policy_entropy

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)

self._setup_learn()
self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
schedule=self.lr_schedule)

@@ -288,6 +295,8 @@ def save(self, save_path, cloudpickle=False):
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
"n_cpu_tf_sess": self.n_cpu_tf_sess,
"seed": self.seed,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
}
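In `setup_model` above, the session is now created with `num_cpu=self.n_cpu_tf_sess` and the seed is applied first. The reason the docstring insists on `n_cpu_tf_sess=1` for full determinism is that the thread count is mapped to TensorFlow's parallelism options, and with more than one thread the order of floating-point reductions can change between runs. A rough sketch of the relevant configuration (the actual helper lives in `stable_baselines.common.tf_util` and may differ in detail):

```python
import tensorflow as tf


def make_single_threaded_session(graph=None):
    # One intra-op and one inter-op thread: TF executes ops in a fixed order,
    # so floating-point reductions are reproducible from run to run on CPU.
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    return tf.Session(config=config, graph=graph)
```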
4 changes: 2 additions & 2 deletions stable_baselines/a2c/run_atari.py
@@ -30,8 +30,8 @@ def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):

env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

model = A2C(policy_fn, env, lr_schedule=lr_schedule)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()


33 changes: 25 additions & 8 deletions stable_baselines/acer/acer_simple.py
@@ -69,6 +69,10 @@ class ACER(ActorCriticRLModel):
:param n_steps: (int) The number of steps to run for each environment per update
(i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
:param num_procs: (int) The number of threads for TensorFlow operations

.. deprecated:: 2.9.0
Use `n_cpu_tf_sess` instead.

:param q_coef: (float) The weight for the loss on the Q value
:param ent_coef: (float) The weight for the entropic loss
:param max_grad_norm: (float) The clipping value for the maximum gradient
@@ -93,16 +97,23 @@
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param full_tensorboard_log: (bool) enable additional logging when using tensorboard
WARNING: this logging can take a lot of space quickly
:param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
Note that if you want completely deterministic results, you must set
`n_cpu_tf_sess` to 1
:param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
If None, the number of cpu of the current machine will be used.
"""

def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, ent_coef=0.01, max_grad_norm=10,
def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=None, q_coef=0.5, ent_coef=0.01, max_grad_norm=10,
learning_rate=7e-4, lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-5, buffer_size=5000,
replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True,
alpha=0.99, delta=1, verbose=0, tensorboard_log=None,
_init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False):
_init_setup_model=True, policy_kwargs=None,
full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.replay_ratio = replay_ratio
@@ -120,10 +131,14 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5,
self.rprop_epsilon = rprop_epsilon
self.learning_rate = learning_rate
self.lr_schedule = lr_schedule
self.num_procs = num_procs
self.tensorboard_log = tensorboard_log
self.full_tensorboard_log = full_tensorboard_log

if num_procs is not None:
warnings.warn("num_procs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
self.n_cpu_tf_sess = num_procs

self.graph = None
self.sess = None
self.action_ph = None
@@ -184,8 +199,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(num_cpu=self.num_procs, graph=self.graph)

self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)
self.set_random_seed(self.seed)
n_batch_step = None
if issubclass(self.policy, RecurrentActorCriticPolicy):
n_batch_step = self.n_envs
@@ -457,14 +472,14 @@ def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, w

return self.names_ops, step_return[1:] # strip off _train

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACER",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)
self._setup_learn()

self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
schedule=self.lr_schedule)
@@ -562,6 +577,8 @@ def save(self, save_path, cloudpickle=False):
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
'n_cpu_tf_sess': self.n_cpu_tf_sess,
'seed': self.seed,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
}
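ACER's constructor above keeps accepting the old `num_procs` keyword but emits a `DeprecationWarning` and forwards the value to `n_cpu_tf_sess`. A short sketch of the expected behaviour (the environment and checks are illustrative, not part of the PR's test suite):

```python
import warnings

from stable_baselines import ACER

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Old keyword: still accepted, but forwarded to n_cpu_tf_sess
    model = ACER('MlpPolicy', 'CartPole-v1', num_procs=1)

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
assert model.n_cpu_tf_sess == 1

# Preferred, forward-compatible form
model = ACER('MlpPolicy', 'CartPole-v1', n_cpu_tf_sess=1, seed=0)
```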
4 changes: 2 additions & 2 deletions stable_baselines/acer/run_atari.py
@@ -28,8 +28,8 @@ def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
warnings.warn("Policy {} not implemented".format(policy))
return

model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()
# Free memory
del model
29 changes: 21 additions & 8 deletions stable_baselines/acktr/acktr.py
@@ -1,4 +1,5 @@
import time
import warnings
from collections import deque

import numpy as np
@@ -24,6 +25,10 @@ class ACKTR(ActorCriticRLModel):
:param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
:param gamma: (float) Discount factor
:param nprocs: (int) The number of threads for TensorFlow operations

.. deprecated:: 2.9.0
Use `n_cpu_tf_sess` instead.

:param n_steps: (int) The number of steps to run for each environment
:param ent_coef: (float) The weight for the entropic loss
:param vf_coef: (float) The weight for the loss on the value function
@@ -45,13 +50,14 @@
WARNING: this logging can take a lot of space quickly
"""

def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False, kfac_update=1,
gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False):
gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.gamma = gamma
@@ -62,7 +68,12 @@ def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
self.max_grad_norm = max_grad_norm
self.learning_rate = learning_rate
self.lr_schedule = lr_schedule
self.nprocs = nprocs

if nprocs is not None:
warnings.warn("nprocs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
self.n_cpu_tf_sess = nprocs

self.tensorboard_log = tensorboard_log
self.async_eigen_decomp = async_eigen_decomp
self.full_tensorboard_log = full_tensorboard_log
@@ -119,7 +130,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)
self.set_random_seed(self.seed)
self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

n_batch_step = None
n_batch_train = None
@@ -264,14 +276,14 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ

return policy_loss, value_loss, policy_entropy

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)
self._setup_learn()
self.n_batch = self.n_envs * self.n_steps

self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
@@ -371,7 +383,6 @@ def save(self, save_path, cloudpickle=False):
data = {
"gamma": self.gamma,
"gae_lambda": self.gae_lambda,
"nprocs": self.nprocs,
"n_steps": self.n_steps,
"vf_coef": self.vf_coef,
"ent_coef": self.ent_coef,
@@ -385,6 +396,8 @@
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
"n_cpu_tf_sess": self.n_cpu_tf_sess,
"seed": self.seed,
"kfac_update": self.kfac_update,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
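ACKTR's save dict above drops the old `nprocs` entry and stores `n_cpu_tf_sess` and `seed` instead, so a reloaded model keeps its seeding and threading configuration. A small usage sketch, assuming `load` restores these entries like the other saved constructor arguments (file name and timestep count are placeholders):

```python
from stable_baselines import ACKTR

model = ACKTR('MlpPolicy', 'CartPole-v1', seed=42, n_cpu_tf_sess=1)
model.learn(total_timesteps=5000)
model.save("acktr_cartpole")

# The reloaded model carries the same seed and TF session thread count
loaded = ACKTR.load("acktr_cartpole")
assert loaded.seed == 42
assert loaded.n_cpu_tf_sess == 1
```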
4 changes: 2 additions & 2 deletions stable_baselines/acktr/run_atari.py
@@ -14,8 +14,8 @@ def train(env_id, num_timesteps, seed, num_cpu):
:param num_cpu: (int) The number of cpu to train on
"""
env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = ACKTR(CnnPolicy, env, nprocs=num_cpu, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()

