Fix Determinism #492

Merged
22 commits merged on Oct 11, 2019
Changes from 19 commits
3 changes: 3 additions & 0 deletions .travis.yml
@@ -34,6 +34,9 @@ jobs:
- name: "Unit Tests sb-z"
env: TEST_GLOB="{s[b-z]*,[t-z]*}"

- name: "Unit Tests determinism"
env: TEST_GLOB="0deterministic.py"

- name: "Sphinx Documentation"
script:
- 'docker run -it --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pip install .[docs] && pushd docs/ && make clean && make html"'
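The new "Unit Tests determinism" CI job above globs only `0deterministic.py`, so the determinism check runs in its own Travis worker. The test file itself is not part of this diff; below is a minimal sketch of what such a check could look like, with the helper names, environment, and timestep budget chosen purely for illustration:

```python
import gym
import numpy as np

from stable_baselines import A2C
from stable_baselines.common.vec_env import DummyVecEnv


def train_and_predict(seed):
    # Build, train and roll out a model with a fixed seed and a single TF thread
    env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    model = A2C("MlpPolicy", env, seed=seed, n_cpu_tf_sess=1)
    model.learn(total_timesteps=1000)

    obs = env.reset()
    actions = []
    for _ in range(20):
        action, _ = model.predict(obs, deterministic=True)
        actions.append(action)
        obs, _, _, _ = env.step(action)
    return np.array(actions)


def test_a2c_determinism():
    # Two independent runs with the same seed should produce identical actions
    assert np.allclose(train_and_predict(seed=0), train_and_predict(seed=0))
```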
11 changes: 9 additions & 2 deletions docs/misc/changelog.rst
@@ -6,20 +6,27 @@ Changelog
For download links, please look at `Github release page <https://github.com/hill-a/stable-baselines/releases>`_.


Pre-Release 2.8.1a0 (WIP)
Pre-Release 2.9.0a0 (WIP)
--------------------------

Breaking Changes:
^^^^^^^^^^^^^^^^^
- The `seed` argument has been moved from `learn()` method to model constructor
in order to have reproducible results

New Features:
^^^^^^^^^^^^^
- Add `n_cpu_tf_sess` to model constructor to choose the number of threads used by Tensorflow

Bug Fixes:
^^^^^^^^^^
- Fix seeding, so it is now possible to have deterministic results on cpu
- Fix a bug in DDPG where `predict` method with `deterministic=False` would fail

Deprecations:
^^^^^^^^^^^^^
- `nprocs` (ACKTR) and `num_procs` (ACER) are deprecated in favor of `n_cpu_tf_sess` which is now common
to all algorithms

Others:
^^^^^^^
@@ -508,4 +515,4 @@ In random order...
Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck
@EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol
@XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs
@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150
@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp
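The breaking change listed in the changelog above moves seeding from `learn()` to the model constructor. A minimal before/after sketch of the migration (environment and timestep count are placeholders):

```python
from stable_baselines import A2C

# Before (<= 2.8.0): the seed was passed to learn()
# model = A2C('MlpPolicy', 'CartPole-v1')
# model.learn(total_timesteps=10000, seed=0)

# After (2.9.0a0): the seed is a constructor argument; set n_cpu_tf_sess=1
# if you need fully deterministic results on CPU
model = A2C('MlpPolicy', 'CartPole-v1', seed=0, n_cpu_tf_sess=1)
model.learn(total_timesteps=10000)
```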
2 changes: 1 addition & 1 deletion setup.py
@@ -140,7 +140,7 @@
license="MIT",
long_description=long_description,
long_description_content_type='text/markdown',
version="2.8.0",
version="2.9.0a0",
)

# python setup.py sdist
2 changes: 1 addition & 1 deletion stable_baselines/__init__.py
@@ -20,4 +20,4 @@
from stable_baselines.trpo_mpi import TRPO
del mpi4py

__version__ = "2.8.0"
__version__ = "2.9.0a0"
23 changes: 16 additions & 7 deletions stable_baselines/a2c/a2c.py
@@ -38,14 +38,21 @@ class A2C(ActorCriticRLModel):
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param full_tensorboard_log: (bool) enable additional logging when using tensorboard
WARNING: this logging can take a lot of space quickly
:param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
Note that if you want completely deterministic results, you must set
`n_cpu_tf_sess` to 1
:param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
If None, the number of cpu of the current machine will be used.
"""

def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, max_grad_norm=0.5,
learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='constant', verbose=0, tensorboard_log=None,
_init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False):
learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='constant', verbose=0,
tensorboard_log=None, _init_setup_model=True, policy_kwargs=None,
full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None):

super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.gamma = gamma
@@ -99,7 +106,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(graph=self.graph)
self.set_random_seed(self.seed)
self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

self.n_batch = self.n_envs * self.n_steps

@@ -216,15 +224,14 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ

return policy_loss, value_loss, policy_entropy

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)

self._setup_learn()
self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
schedule=self.lr_schedule)

@@ -288,6 +295,8 @@ def save(self, save_path, cloudpickle=False):
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
"n_cpu_tf_sess": self.n_cpu_tf_sess,
"seed": self.seed,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
}
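In `setup_model` above, the session is now created with `num_cpu=self.n_cpu_tf_sess` and the seed is applied first. The reason the docstring insists on `n_cpu_tf_sess=1` for full determinism is that the thread count is mapped to TensorFlow's parallelism options, and with more than one thread the order of floating-point reductions can change between runs. A rough sketch of the relevant configuration (the actual helper lives in `stable_baselines.common.tf_util` and may differ in detail):

```python
import tensorflow as tf


def make_single_threaded_session(graph=None):
    # One intra-op and one inter-op thread: TF executes ops in a fixed order,
    # so floating-point reductions are reproducible from run to run on CPU.
    config = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    return tf.Session(config=config, graph=graph)
```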
4 changes: 2 additions & 2 deletions stable_baselines/a2c/run_atari.py
@@ -30,8 +30,8 @@ def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):

env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

model = A2C(policy_fn, env, lr_schedule=lr_schedule)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()


33 changes: 25 additions & 8 deletions stable_baselines/acer/acer_simple.py
@@ -69,6 +69,10 @@ class ACER(ActorCriticRLModel):
:param n_steps: (int) The number of steps to run for each environment per update
(i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel)
:param num_procs: (int) The number of threads for TensorFlow operations

.. deprecated:: 2.9.0
Use `n_cpu_tf_sess` instead.

:param q_coef: (float) The weight for the loss on the Q value
:param ent_coef: (float) The weight for the entropic loss
:param max_grad_norm: (float) The clipping value for the maximum gradient
@@ -93,16 +97,23 @@
:param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
:param full_tensorboard_log: (bool) enable additional logging when using tensorboard
WARNING: this logging can take a lot of space quickly
:param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
Note that if you want completely deterministic results, you must set
`n_cpu_tf_sess` to 1
:param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
If None, the number of cpu of the current machine will be used.
"""

def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, ent_coef=0.01, max_grad_norm=10,
def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=None, q_coef=0.5, ent_coef=0.01, max_grad_norm=10,
learning_rate=7e-4, lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-5, buffer_size=5000,
replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True,
alpha=0.99, delta=1, verbose=0, tensorboard_log=None,
_init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False):
_init_setup_model=True, policy_kwargs=None,
full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.replay_ratio = replay_ratio
@@ -120,10 +131,14 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5,
self.rprop_epsilon = rprop_epsilon
self.learning_rate = learning_rate
self.lr_schedule = lr_schedule
self.num_procs = num_procs
self.tensorboard_log = tensorboard_log
self.full_tensorboard_log = full_tensorboard_log

if num_procs is not None:
warnings.warn("num_procs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
self.n_cpu_tf_sess = num_procs

self.graph = None
self.sess = None
self.action_ph = None
@@ -184,8 +199,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(num_cpu=self.num_procs, graph=self.graph)

self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)
self.set_random_seed(self.seed)
n_batch_step = None
if issubclass(self.policy, RecurrentActorCriticPolicy):
n_batch_step = self.n_envs
@@ -457,14 +472,14 @@ def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, w

return self.names_ops, step_return[1:] # strip off _train

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACER",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)
self._setup_learn()

self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
schedule=self.lr_schedule)
@@ -562,6 +577,8 @@ def save(self, save_path, cloudpickle=False):
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
'n_cpu_tf_sess': self.n_cpu_tf_sess,
'seed': self.seed,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
}
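ACER's constructor above keeps accepting the old `num_procs` keyword but emits a `DeprecationWarning` and forwards the value to `n_cpu_tf_sess`. A short sketch of the expected behaviour (the environment and checks are illustrative, not part of the PR's test suite):

```python
import warnings

from stable_baselines import ACER

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Old keyword: still accepted, but forwarded to n_cpu_tf_sess
    model = ACER('MlpPolicy', 'CartPole-v1', num_procs=1)

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
assert model.n_cpu_tf_sess == 1

# Preferred, forward-compatible form
model = ACER('MlpPolicy', 'CartPole-v1', n_cpu_tf_sess=1, seed=0)
```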
4 changes: 2 additions & 2 deletions stable_baselines/acer/run_atari.py
@@ -28,8 +28,8 @@ def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
warnings.warn("Policy {} not implemented".format(policy))
return

model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()
# Free memory
del model
29 changes: 21 additions & 8 deletions stable_baselines/acktr/acktr.py
@@ -1,4 +1,5 @@
import time
import warnings
from collections import deque

import numpy as np
@@ -24,6 +25,10 @@ class ACKTR(ActorCriticRLModel):
:param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
:param gamma: (float) Discount factor
:param nprocs: (int) The number of threads for TensorFlow operations

.. deprecated:: 2.9.0
Use `n_cpu_tf_sess` instead.

:param n_steps: (int) The number of steps to run for each environment
:param ent_coef: (float) The weight for the entropic loss
:param vf_coef: (float) The weight for the loss on the value function
@@ -45,13 +50,14 @@
WARNING: this logging can take a lot of space quickly
"""

def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False, kfac_update=1,
gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False):
gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1):

super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True,
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs)
_init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs,
seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

self.n_steps = n_steps
self.gamma = gamma
@@ -62,7 +68,12 @@ def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01,
self.max_grad_norm = max_grad_norm
self.learning_rate = learning_rate
self.lr_schedule = lr_schedule
self.nprocs = nprocs

if nprocs is not None:
warnings.warn("nprocs will be removed in a future version (v3.x.x) "
"use n_cpu_tf_sess instead", DeprecationWarning)
self.n_cpu_tf_sess = nprocs

self.tensorboard_log = tensorboard_log
self.async_eigen_decomp = async_eigen_decomp
self.full_tensorboard_log = full_tensorboard_log
@@ -119,7 +130,8 @@ def setup_model(self):

self.graph = tf.Graph()
with self.graph.as_default():
self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph)
self.set_random_seed(self.seed)
self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

n_batch_step = None
n_batch_train = None
@@ -264,14 +276,14 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ

return policy_loss, value_loss, policy_entropy

def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR",
def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR",
reset_num_timesteps=True):

new_tb_log = self._init_num_timesteps(reset_num_timesteps)

with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
as writer:
self._setup_learn(seed)
self._setup_learn()
self.n_batch = self.n_envs * self.n_steps

self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
@@ -371,7 +383,6 @@ def save(self, save_path, cloudpickle=False):
data = {
"gamma": self.gamma,
"gae_lambda": self.gae_lambda,
"nprocs": self.nprocs,
"n_steps": self.n_steps,
"vf_coef": self.vf_coef,
"ent_coef": self.ent_coef,
@@ -385,6 +396,8 @@
"observation_space": self.observation_space,
"action_space": self.action_space,
"n_envs": self.n_envs,
"n_cpu_tf_sess": self.n_cpu_tf_sess,
"seed": self.seed,
"kfac_update": self.kfac_update,
"_vectorize_action": self._vectorize_action,
"policy_kwargs": self.policy_kwargs
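ACKTR's save dict above drops the old `nprocs` entry and stores `n_cpu_tf_sess` and `seed` instead, so a reloaded model keeps its seeding and threading configuration. A small usage sketch, assuming `load` restores these entries like the other saved constructor arguments (file name and timestep count are placeholders):

```python
from stable_baselines import ACKTR

model = ACKTR('MlpPolicy', 'CartPole-v1', seed=42, n_cpu_tf_sess=1)
model.learn(total_timesteps=5000)
model.save("acktr_cartpole")

# The reloaded model carries the same seed and TF session thread count
loaded = ACKTR.load("acktr_cartpole")
assert loaded.seed == 42
assert loaded.n_cpu_tf_sess == 1
```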
4 changes: 2 additions & 2 deletions stable_baselines/acktr/run_atari.py
@@ -14,8 +14,8 @@ def train(env_id, num_timesteps, seed, num_cpu):
:param num_cpu: (int) The number of cpu to train on
"""
env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
model = ACKTR(CnnPolicy, env, nprocs=num_cpu, seed=seed)
model.learn(total_timesteps=int(num_timesteps * 1.1))
env.close()

