diff --git a/.travis.yml b/.travis.yml index fc7de8fd41..2f22d84259 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,6 +34,9 @@ jobs: - name: "Unit Tests sb-z" env: TEST_GLOB="{s[b-z]*,[t-z]*}" + - name: "Unit Tests determinism" + env: TEST_GLOB="0deterministic.py" + - name: "Sphinx Documentation" script: - 'docker run -it --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pip install .[docs] && pushd docs/ && make clean && make html"' diff --git a/docs/guide/algos.rst b/docs/guide/algos.rst index 0f3df4da34..9581a4ac1b 100644 --- a/docs/guide/algos.rst +++ b/docs/guide/algos.rst @@ -53,3 +53,26 @@ Actions ``gym.spaces``: Some logging values (like `ep_rewmean`, `eplenmean`) are only available when using a Monitor wrapper See `Issue #339 `_ for more info. + + +Reproducibility +--------------- + +Completely reproducible results are not guaranteed across TensorFlow releases or different platforms. +Furthermore, results need not be reproducible between CPU and GPU executions, even when using identical seeds. + +In order to make computations deterministic on CPU, for your specific problem on one specific platform, +you need to pass a `seed` argument at the creation of a model and set `n_cpu_tf_sess=1` (the number of CPUs for the TensorFlow session). +If you pass an environment to the model using `set_env()`, then you also need to seed the environment first. + +.. note:: + + Because of the current limitations of TensorFlow 1.x, we cannot ensure reproducible results on the GPU yet. We hope to solve that issue with TensorFlow 2.x support (cf `Issue #366 `_). + + +.. note:: + + TD3 sometimes fails to produce reproducible results for obscure reasons, even when following the previous steps (cf `PR #492 `_). If you find the reason, please open an issue ;) + + +Credit: part of the *Reproducibility* section comes from `PyTorch Documentation `_ diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 4937c2a8c9..e7ad12a0a8 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -6,20 +6,27 @@ Changelog For download links, please look at `Github release page `_. -Pre-Release 2.8.1a0 (WIP) +Pre-Release 2.9.0a0 (WIP) -------------------------- Breaking Changes: ^^^^^^^^^^^^^^^^^ +- The `seed` argument has been moved from the `learn()` method to the model constructor + in order to have reproducible results New Features: ^^^^^^^^^^^^^ +- Add `n_cpu_tf_sess` to the model constructor to choose the number of threads used by TensorFlow Bug Fixes: ^^^^^^^^^^ +- Fix seeding, so it is now possible to have deterministic results on CPU +- Fix a bug in DDPG where the `predict` method with `deterministic=False` would fail Deprecations: ^^^^^^^^^^^^^ +- `nprocs` (ACKTR) and `num_procs` (ACER) are deprecated in favor of `n_cpu_tf_sess`, which is now common + to all algorithms Others: ^^^^^^^ @@ -508,4 +515,4 @@ In random order...
Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @JohannesAck @EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol @XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs -@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 +@Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp diff --git a/setup.py b/setup.py index 7b8c29e7fb..d26afd3ce0 100644 --- a/setup.py +++ b/setup.py @@ -140,7 +140,7 @@ license="MIT", long_description=long_description, long_description_content_type='text/markdown', - version="2.8.0", + version="2.9.0a0", ) # python setup.py sdist diff --git a/stable_baselines/__init__.py b/stable_baselines/__init__.py index e8b53ff51f..580e89ab32 100644 --- a/stable_baselines/__init__.py +++ b/stable_baselines/__init__.py @@ -20,4 +20,4 @@ from stable_baselines.trpo_mpi import TRPO del mpi4py -__version__ = "2.8.0" +__version__ = "2.9.0a0" diff --git a/stable_baselines/a2c/a2c.py b/stable_baselines/a2c/a2c.py index 1806ce03e6..12147dbc55 100644 --- a/stable_baselines/a2c/a2c.py +++ b/stable_baselines/a2c/a2c.py @@ -38,14 +38,21 @@ class A2C(ActorCriticRLModel): :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.01, max_grad_norm=0.5, - learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='constant', verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): + learning_rate=7e-4, alpha=0.99, epsilon=1e-5, lr_schedule='constant', verbose=0, + tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, + full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(A2C, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) + _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.n_steps = n_steps self.gamma = gamma @@ -99,7 +106,8 @@ def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): - self.sess = tf_util.make_session(graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.n_batch = self.n_envs * self.n_steps @@ -216,15 +224,14 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ return policy_loss, value_loss, policy_entropy - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="A2C", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="A2C", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) - + self._setup_learn() self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) @@ -288,6 +295,8 @@ def save(self, save_path, cloudpickle=False): "observation_space": self.observation_space, "action_space": self.action_space, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } diff --git a/stable_baselines/a2c/run_atari.py b/stable_baselines/a2c/run_atari.py index 7b05e34d07..f8f7817ea1 100644 --- a/stable_baselines/a2c/run_atari.py +++ b/stable_baselines/a2c/run_atari.py @@ -30,8 +30,8 @@ def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env): env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) - model = A2C(policy_fn, env, lr_schedule=lr_schedule) - model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) + model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed) + model.learn(total_timesteps=int(num_timesteps * 1.1)) env.close() diff --git a/stable_baselines/acer/acer_simple.py b/stable_baselines/acer/acer_simple.py index 3f4af89432..951c0efb97 100644 --- a/stable_baselines/acer/acer_simple.py +++ b/stable_baselines/acer/acer_simple.py @@ -69,6 +69,10 @@ class ACER(ActorCriticRLModel): :param n_steps: (int) The number of steps to run for each environment per update (i.e. batch size is n_steps * n_env where n_env is number of environment copies running in parallel) :param num_procs: (int) The number of threads for TensorFlow operations + + .. deprecated:: 2.9.0 + Use `n_cpu_tf_sess` instead. 
+ :param q_coef: (float) The weight for the loss on the Q value :param ent_coef: (float) The weight for the entropic loss :param max_grad_norm: (float) The clipping value for the maximum gradient @@ -93,16 +97,23 @@ class ACER(ActorCriticRLModel): :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ - def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, ent_coef=0.01, max_grad_norm=10, + def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=None, q_coef=0.5, ent_coef=0.01, max_grad_norm=10, learning_rate=7e-4, lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-5, buffer_size=5000, replay_ratio=4, replay_start=1000, correction_term=10.0, trust_region=True, alpha=0.99, delta=1, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): + _init_setup_model=True, policy_kwargs=None, + full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1): super(ACER, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) + _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.n_steps = n_steps self.replay_ratio = replay_ratio @@ -120,10 +131,14 @@ def __init__(self, policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, self.rprop_epsilon = rprop_epsilon self.learning_rate = learning_rate self.lr_schedule = lr_schedule - self.num_procs = num_procs self.tensorboard_log = tensorboard_log self.full_tensorboard_log = full_tensorboard_log + if num_procs is not None: + warnings.warn("num_procs will be removed in a future version (v3.x.x) " + "use n_cpu_tf_sess instead", DeprecationWarning) + self.n_cpu_tf_sess = num_procs + self.graph = None self.sess = None self.action_ph = None @@ -184,8 +199,8 @@ def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): - self.sess = tf_util.make_session(num_cpu=self.num_procs, graph=self.graph) - + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) + self.set_random_seed(self.seed) n_batch_step = None if issubclass(self.policy, RecurrentActorCriticPolicy): n_batch_step = self.n_envs @@ -457,14 +472,14 @@ def _train_step(self, obs, actions, rewards, dones, mus, states, masks, steps, w return self.names_ops, step_return[1:] # strip off _train - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACER", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACER", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, schedule=self.lr_schedule) @@ -562,6 +577,8 @@ def save(self, save_path, 
cloudpickle=False): "observation_space": self.observation_space, "action_space": self.action_space, "n_envs": self.n_envs, + 'n_cpu_tf_sess': self.n_cpu_tf_sess, + 'seed': self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } diff --git a/stable_baselines/acer/run_atari.py b/stable_baselines/acer/run_atari.py index ec0e8b6870..c09788fe54 100644 --- a/stable_baselines/acer/run_atari.py +++ b/stable_baselines/acer/run_atari.py @@ -28,8 +28,8 @@ def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu): warnings.warn("Policy {} not implemented".format(policy)) return - model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000) - model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) + model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000, seed=seed) + model.learn(total_timesteps=int(num_timesteps * 1.1)) env.close() # Free memory del model diff --git a/stable_baselines/acktr/acktr.py b/stable_baselines/acktr/acktr.py index 702e98c67f..cefe73c8a8 100644 --- a/stable_baselines/acktr/acktr.py +++ b/stable_baselines/acktr/acktr.py @@ -1,4 +1,5 @@ import time +import warnings from collections import deque import numpy as np @@ -24,6 +25,10 @@ class ACKTR(ActorCriticRLModel): :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) :param gamma: (float) Discount factor :param nprocs: (int) The number of threads for TensorFlow operations + + .. deprecated:: 2.9.0 + Use `n_cpu_tf_sess` instead. + :param n_steps: (int) The number of steps to run for each environment :param ent_coef: (float) The weight for the entropic loss :param vf_coef: (float) The weight for the loss on the value function @@ -43,15 +48,21 @@ class ACKTR(ActorCriticRLModel): If None (default), then the classic advantage will be used instead of GAE :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. 
""" - def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, + def __init__(self, policy, env, gamma=0.99, nprocs=None, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True, async_eigen_decomp=False, kfac_update=1, - gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False): + gae_lambda=None, policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1): super(ACKTR, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) + _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.n_steps = n_steps self.gamma = gamma @@ -62,7 +73,12 @@ def __init__(self, policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, self.max_grad_norm = max_grad_norm self.learning_rate = learning_rate self.lr_schedule = lr_schedule - self.nprocs = nprocs + + if nprocs is not None: + warnings.warn("nprocs will be removed in a future version (v3.x.x) " + "use n_cpu_tf_sess instead", DeprecationWarning) + self.n_cpu_tf_sess = nprocs + self.tensorboard_log = tensorboard_log self.async_eigen_decomp = async_eigen_decomp self.full_tensorboard_log = full_tensorboard_log @@ -119,7 +135,8 @@ def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): - self.sess = tf_util.make_session(num_cpu=self.nprocs, graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) n_batch_step = None n_batch_train = None @@ -264,14 +281,14 @@ def _train_step(self, obs, states, rewards, masks, actions, values, update, writ return policy_loss, value_loss, policy_entropy - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="ACKTR", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="ACKTR", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() self.n_batch = self.n_envs * self.n_steps self.learning_rate_schedule = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps, @@ -371,7 +388,6 @@ def save(self, save_path, cloudpickle=False): data = { "gamma": self.gamma, "gae_lambda": self.gae_lambda, - "nprocs": self.nprocs, "n_steps": self.n_steps, "vf_coef": self.vf_coef, "ent_coef": self.ent_coef, @@ -385,6 +401,8 @@ def save(self, save_path, cloudpickle=False): "observation_space": self.observation_space, "action_space": self.action_space, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "kfac_update": self.kfac_update, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs diff --git a/stable_baselines/acktr/run_atari.py b/stable_baselines/acktr/run_atari.py index eac45056b4..694eb811dd 100644 --- a/stable_baselines/acktr/run_atari.py +++ b/stable_baselines/acktr/run_atari.py @@ -14,8 +14,8 @@ def train(env_id, num_timesteps, seed, num_cpu): :param num_cpu: (int) The number of cpu to train on """ env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) - model = ACKTR(CnnPolicy, env, nprocs=num_cpu) - 
model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) + model = ACKTR(CnnPolicy, env, nprocs=num_cpu, seed=seed) + model.learn(total_timesteps=int(num_timesteps * 1.1)) env.close() diff --git a/stable_baselines/common/base_class.py b/stable_baselines/common/base_class.py index 2f58ce776b..beec44d75b 100644 --- a/stable_baselines/common/base_class.py +++ b/stable_baselines/common/base_class.py @@ -30,9 +30,16 @@ class BaseRLModel(ABC): :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param requires_vec_env: (bool) Does this model require a vectorized environment :param policy_base: (BasePolicy) the base policy used by this method + :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ - def __init__(self, policy, env, verbose=0, *, requires_vec_env, policy_base, policy_kwargs=None): + def __init__(self, policy, env, verbose=0, *, requires_vec_env, policy_base, + policy_kwargs=None, seed=None, n_cpu_tf_sess=None): if isinstance(policy, str) and policy_base is not None: self.policy = get_policy_from_name(policy_base, policy) else: @@ -49,7 +56,9 @@ def __init__(self, policy, env, verbose=0, *, requires_vec_env, policy_base, pol self.graph = None self.sess = None self.params = None + self.seed = seed self._param_load_ops = None + self.n_cpu_tf_sess = n_cpu_tf_sess if env is not None: if isinstance(env, str): @@ -148,17 +157,35 @@ def setup_model(self): """ pass - def _setup_learn(self, seed): + def set_random_seed(self, seed): """ - check the environment, set the seed, and set the logger + :param seed: (int) Seed for the pseudo-random generators. If None, + do not change the seeds. + """ + # Ignore if the seed is None + if seed is None: + return + # Seed python, numpy and tf random generator + set_global_seeds(seed) + if self.env is not None: + if isinstance(self.env, VecEnv): + # Use a different seed for each env + for idx in range(self.env.num_envs): + self.env.env_method("seed", seed + idx) + else: + self.env.seed(seed) + # Seed the action space + # useful when selecting random actions + self.env.action_space.seed(seed) + self.action_space.seed(seed) - :param seed: (int) the seed value + def _setup_learn(self): + """ + Check the environment. """ if self.env is None: raise ValueError("Error: cannot train the model without a valid environment, please set an environment with" "set_env(self, env) method.") - if seed is not None: - set_global_seeds(seed) @abstractmethod def get_parameter_list(self): @@ -306,13 +333,12 @@ def pretrain(self, dataset, n_epochs=10, learning_rate=1e-4, return self @abstractmethod - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="run", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="run", reset_num_timesteps=True): """ Return a trained model. :param total_timesteps: (int) The total number of samples to train on - :param seed: (int) The initial seed for training, if None: keep current seed :param callback: (function (dict, dict)) -> boolean function called at every steps with state of the algorithm. It takes the local and global variables. 
If it returns False, training is aborted. :param log_interval: (int) The number of timesteps before logging. @@ -405,7 +431,7 @@ def load_parameters(self, load_path_or_dict, exact_match=True): else: # Assume a filepath or file-like. # Use existing deserializer to load the parameters. - # We only need the parameters part of the file, so + # We only need the parameters part of the file, so # only load that part. _, params = BaseRLModel._load_from_file(load_path_or_dict, load_data=False) @@ -523,7 +549,7 @@ def _save_to_file(save_path, data=None, params=None, cloudpickle=False): :param save_path: (str or file-like) Where to store the model :param data: (OrderedDict) Class parameters being stored :param params: (OrderedDict) Model parameters being stored - :param cloudpickle: (bool) Use old cloudpickle format + :param cloudpickle: (bool) Use old cloudpickle format (stable-baselines<=2.7.0) instead of a zip archive. """ if cloudpickle: @@ -559,8 +585,8 @@ def _load_from_file(load_path, load_data=True, custom_objects=None): :param load_path: (str or file-like) Where to load model from :param load_data: (bool) Whether we should load and return data - (class parameters). Mainly used by `load_parameters` to - only load model parameters (weights). + (class parameters). Mainly used by `load_parameters` to + only load model parameters (weights). :param custom_objects: (dict) Dictionary of objects to replace upon loading. If a variable is present in this dictionary as a key, it will not be deserialized and the corresponding item @@ -688,12 +714,19 @@ class ActorCriticRLModel(BaseRLModel): :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param policy_base: (BasePolicy) the base policy used by this method (default=ActorCriticPolicy) :param requires_vec_env: (bool) Does this model require a vectorized environment + :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. 
""" def __init__(self, policy, env, _init_setup_model, verbose=0, policy_base=ActorCriticPolicy, - requires_vec_env=False, policy_kwargs=None): + requires_vec_env=False, policy_kwargs=None, seed=None, n_cpu_tf_sess=None): super(ActorCriticRLModel, self).__init__(policy, env, verbose=verbose, requires_vec_env=requires_vec_env, - policy_base=policy_base, policy_kwargs=policy_kwargs) + policy_base=policy_base, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.sess = None self.initial_state = None @@ -706,7 +739,7 @@ def setup_model(self): pass @abstractmethod - def learn(self, total_timesteps, callback=None, seed=None, + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="run", reset_num_timesteps=True): pass @@ -865,12 +898,20 @@ class OffPolicyRLModel(BaseRLModel): :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug :param requires_vec_env: (bool) Does this model require a vectorized environment :param policy_base: (BasePolicy) the base policy used by this method + :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ def __init__(self, policy, env, replay_buffer=None, _init_setup_model=False, verbose=0, *, - requires_vec_env=False, policy_base=None, policy_kwargs=None): + requires_vec_env=False, policy_base=None, + policy_kwargs=None, seed=None, n_cpu_tf_sess=None): super(OffPolicyRLModel, self).__init__(policy, env, verbose=verbose, requires_vec_env=requires_vec_env, - policy_base=policy_base, policy_kwargs=policy_kwargs) + policy_base=policy_base, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.replay_buffer = replay_buffer @@ -879,7 +920,7 @@ def setup_model(self): pass @abstractmethod - def learn(self, total_timesteps, callback=None, seed=None, + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="run", reset_num_timesteps=True, replay_wrapper=None): pass @@ -938,6 +979,9 @@ def __init__(self, venv): super().__init__(venv) assert venv.num_envs == 1, "Error: cannot unwrap a environment wrapper that has more than one environment." + def seed(self, seed=None): + return self.venv.env_method('seed', seed) + def __getattr__(self, attr): if attr in self.__dict__: return getattr(self, attr) diff --git a/stable_baselines/common/distributions.py b/stable_baselines/common/distributions.py index 62186d6707..2245181e52 100644 --- a/stable_baselines/common/distributions.py +++ b/stable_baselines/common/distributions.py @@ -8,8 +8,10 @@ class ProbabilityDistribution(object): """ - A particular probability distribution + Base class for describing a probability distribution. 
""" + def __init__(self): + super(ProbabilityDistribution, self).__init__() def flatparam(self): """ @@ -41,7 +43,7 @@ def kl(self, other): """ Calculates the Kullback-Leibler divergence from the given probabilty distribution - :param other: ([float]) the distibution to compare with + :param other: ([float]) the distribution to compare with :return: (float) the KL divergence of the two distributions """ raise NotImplementedError @@ -285,6 +287,7 @@ def __init__(self, logits): :param logits: ([float]) the categorical logits input """ self.logits = logits + super(CategoricalProbabilityDistribution, self).__init__() def flatparam(self): return self.logits @@ -344,6 +347,7 @@ def __init__(self, nvec, flat): """ self.flat = flat self.categoricals = list(map(CategoricalProbabilityDistribution, tf.split(flat, nvec, axis=-1))) + super(MultiCategoricalProbabilityDistribution, self).__init__() def flatparam(self): return self.flat @@ -386,6 +390,7 @@ def __init__(self, flat): self.mean = mean self.logstd = logstd self.std = tf.exp(logstd) + super(DiagGaussianProbabilityDistribution, self).__init__() def flatparam(self): return self.flat @@ -410,7 +415,8 @@ def entropy(self): def sample(self): # Bounds are taken into acount outside this class (during training only) # Otherwise, it changes the distribution and breaks PPO2 for instance - return self.mean + self.std * tf.random_normal(tf.shape(self.mean), dtype=self.mean.dtype) + return self.mean + self.std * tf.random_normal(tf.shape(self.mean), + dtype=self.mean.dtype) @classmethod def fromflat(cls, flat): @@ -432,6 +438,7 @@ def __init__(self, logits): """ self.logits = logits self.probabilities = tf.sigmoid(logits) + super(BernoulliProbabilityDistribution, self).__init__() def flatparam(self): return self.logits diff --git a/stable_baselines/common/policies.py b/stable_baselines/common/policies.py index b5bf4ed04b..d9e16cd092 100644 --- a/stable_baselines/common/policies.py +++ b/stable_baselines/common/policies.py @@ -227,9 +227,7 @@ def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=Fals self._deterministic_action = None def _setup_init(self): - """ - sets up the distibutions, actions, and value - """ + """Sets up the distributions, actions, and value.""" with tf.variable_scope("output", reuse=True): assert self.policy is not None and self.proba_distribution is not None and self.value_fn is not None self._action = self.proba_distribution.sample() @@ -365,7 +363,7 @@ def dones_ph(self): def states_ph(self): """tf.Tensor: placeholder for states, shape (self.n_env, ) + state_shape.""" return self._states_ph - + @abstractmethod def value(self, obs, state=None, mask=None): """ diff --git a/stable_baselines/ddpg/ddpg.py b/stable_baselines/ddpg/ddpg.py index 7036271960..3314044ca2 100644 --- a/stable_baselines/ddpg/ddpg.py +++ b/stable_baselines/ddpg/ddpg.py @@ -190,8 +190,12 @@ class DDPG(OffPolicyRLModel): :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. 
""" - def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, nb_train_steps=50, nb_rollout_steps=100, nb_eval_steps=100, param_noise=None, action_noise=None, normalize_observations=False, tau=0.001, batch_size=128, param_noise_adaption_interval=50, @@ -199,11 +203,12 @@ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, n return_range=(-np.inf, np.inf), actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1., render=False, render_eval=False, memory_limit=None, buffer_size=50000, random_exploration=0.0, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, - full_tensorboard_log=False): + full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1): super(DDPG, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DDPGPolicy, - requires_vec_env=False, policy_kwargs=policy_kwargs) + requires_vec_env=False, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) # Parameters. self.gamma = gamma @@ -320,7 +325,8 @@ def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): - self.sess = tf_util.single_threaded_session(graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) @@ -616,7 +622,6 @@ def _policy(self, obs, apply_noise=True, compute_q=True): action = action.flatten() if self.action_noise is not None and apply_noise: noise = self.action_noise() - assert noise.shape == action.shape action += noise action = np.clip(action, -1, 1) return action, q_value @@ -797,7 +802,7 @@ def _reset(self): self.param_noise_stddev: self.param_noise.current_stddev, }) - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DDPG", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) @@ -807,7 +812,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_ with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() # a list for tensorboard logging, to prevent logging with the same step number, if it already occured self.tb_seen_steps = [] @@ -1088,6 +1093,8 @@ def save(self, save_path, cloudpickle=False): "random_exploration": self.random_exploration, "policy": self.policy, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } diff --git a/stable_baselines/deepq/dqn.py b/stable_baselines/deepq/dqn.py index 637c9462c7..c577564d95 100644 --- a/stable_baselines/deepq/dqn.py +++ b/stable_baselines/deepq/dqn.py @@ -47,18 +47,23 @@ class DQN(OffPolicyRLModel): :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. 
+ :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ - def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, double_q=True, learning_starts=1000, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, - prioritized_replay_eps=1e-6, param_noise=False, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): + prioritized_replay_eps=1e-6, param_noise=False, + n_cpu_tf_sess=None, verbose=0, tensorboard_log=None, + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None): # TODO: replay_buffer refactoring super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy, - requires_vec_env=False, policy_kwargs=policy_kwargs) + requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.param_noise = param_noise self.learning_starts = learning_starts @@ -117,7 +122,8 @@ def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): - self.sess = tf_util.make_session(graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) @@ -142,14 +148,14 @@ def setup_model(self): self.summary = tf.summary.merge_all() - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() # Create the replay buffer if self.prioritized_replay: @@ -354,6 +360,8 @@ def save(self, save_path, cloudpickle=False): "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } diff --git a/stable_baselines/gail/model.py b/stable_baselines/gail/model.py index 8e6989aa80..f5d6202284 100644 --- a/stable_baselines/gail/model.py +++ b/stable_baselines/gail/model.py @@ -48,7 +48,7 @@ def __init__(self, policy, env, expert_dataset=None, if _init_setup_model: self.setup_model() - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="GAIL", reset_num_timesteps=True): assert self.expert_dataset is not None, "You must pass an expert dataset to GAIL for training" - return super().learn(total_timesteps, callback, seed, log_interval, tb_log_name, reset_num_timesteps) + return super().learn(total_timesteps, callback, log_interval, tb_log_name, reset_num_timesteps) diff --git a/stable_baselines/her/her.py b/stable_baselines/her/her.py index 4215f9a2f7..6a9e89f43d 100644 --- a/stable_baselines/her/her.py +++ b/stable_baselines/her/her.py @@ -106,9 +106,9 @@ def _get_pretrain_placeholders(self): def setup_model(self): pass - def learn(self, total_timesteps, 
callback=None, seed=None, log_interval=100, tb_log_name="HER", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="HER", reset_num_timesteps=True): - return self.model.learn(total_timesteps, callback=callback, seed=seed, log_interval=log_interval, + return self.model.learn(total_timesteps, callback=callback, log_interval=log_interval, tb_log_name=tb_log_name, reset_num_timesteps=reset_num_timesteps, replay_wrapper=self.replay_wrapper) diff --git a/stable_baselines/ppo1/pposgd_simple.py b/stable_baselines/ppo1/pposgd_simple.py index 9154e74db0..b501c06bf2 100644 --- a/stable_baselines/ppo1/pposgd_simple.py +++ b/stable_baselines/ppo1/pposgd_simple.py @@ -41,15 +41,20 @@ class PPO1(ActorCriticRLModel): :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ - def __init__(self, policy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5, - schedule='linear', verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): + schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True, + policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1): super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) + _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.gamma = gamma self.timesteps_per_actorbatch = timesteps_per_actorbatch @@ -94,7 +99,8 @@ def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): - self.sess = tf_util.single_threaded_session(graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) # Construct network for new policy self.policy_pi = self.policy(self.sess, self.observation_space, self.action_space, self.n_envs, 1, @@ -187,14 +193,14 @@ def setup_model(self): self.compute_losses = tf_util.function([obs_ph, old_pi.obs_ph, action_ph, atarg, ret, lrmult], losses) - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO1", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="PPO1", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \ "an instance of common.policies.ActorCriticPolicy." 
@@ -351,6 +357,8 @@ def save(self, save_path, cloudpickle=False): "observation_space": self.observation_space, "action_space": self.action_space, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } diff --git a/stable_baselines/ppo2/ppo2.py b/stable_baselines/ppo2/ppo2.py index 86b69d9cd8..6d998d2d18 100644 --- a/stable_baselines/ppo2/ppo2.py +++ b/stable_baselines/ppo2/ppo2.py @@ -1,6 +1,5 @@ import time import sys -import multiprocessing from collections import deque import gym @@ -45,15 +44,20 @@ class PPO2(ActorCriticRLModel): :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard WARNING: this logging can take a lot of space quickly + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ - def __init__(self, policy, env, gamma=0.99, n_steps=128, ent_coef=0.01, learning_rate=2.5e-4, vf_coef=0.5, max_grad_norm=0.5, lam=0.95, nminibatches=4, noptepochs=4, cliprange=0.2, cliprange_vf=None, verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, - full_tensorboard_log=False): + full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(PPO2, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=True, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) + _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.learning_rate = learning_rate self.cliprange = cliprange @@ -113,13 +117,10 @@ def setup_model(self): self.n_batch = self.n_envs * self.n_steps - n_cpu = multiprocessing.cpu_count() - if sys.platform == 'darwin': - n_cpu //= 2 - self.graph = tf.Graph() with self.graph.as_default(): - self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) n_batch_step = None n_batch_train = None @@ -302,7 +303,7 @@ def _train_step(self, learning_rate, cliprange, obs, returns, masks, actions, va return policy_loss, value_loss, policy_entropy, approxkl, clipfrac - def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_log_name="PPO2", + def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO2", reset_num_timesteps=True): # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) @@ -313,7 +314,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=1, tb_lo with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() runner = Runner(env=self.env, model=self, n_steps=self.n_steps, gamma=self.gamma, lam=self.lam) self.episode_reward = np.zeros((self.n_envs,)) @@ -419,6 +420,8 @@ def save(self, save_path, cloudpickle=False): "observation_space": self.observation_space, "action_space": self.action_space, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "_vectorize_action": 
self._vectorize_action, "policy_kwargs": self.policy_kwargs } diff --git a/stable_baselines/sac/sac.py b/stable_baselines/sac/sac.py index 88779d1ffb..8712806e5f 100644 --- a/stable_baselines/sac/sac.py +++ b/stable_baselines/sac/sac.py @@ -1,6 +1,5 @@ import sys import time -import multiprocessing from collections import deque import warnings @@ -64,6 +63,11 @@ class SAC(OffPolicyRLModel): :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard Note: this has no effect on SAC logging for now + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, @@ -71,10 +75,12 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000 tau=0.005, ent_coef='auto', target_update_interval=1, gradient_steps=1, target_entropy='auto', action_noise=None, random_exploration=0.0, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, + seed=None, n_cpu_tf_sess=None): super(SAC, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, - policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs) + policy_base=SACPolicy, requires_vec_env=False, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate @@ -140,10 +146,8 @@ def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): - n_cpu = multiprocessing.cpu_count() - if sys.platform == 'darwin': - n_cpu //= 2 - self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) @@ -361,7 +365,7 @@ def _train_step(self, step, writer, learning_rate): return policy_loss, qf1_loss, qf2_loss, value_loss, entropy - def learn(self, total_timesteps, callback=None, seed=None, + def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="SAC", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) @@ -372,7 +376,7 @@ def learn(self, total_timesteps, callback=None, seed=None, with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) @@ -546,6 +550,8 @@ def save(self, save_path, cloudpickle=False): "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, diff --git a/stable_baselines/td3/td3.py b/stable_baselines/td3/td3.py index 307c76bc24..eb0dd0fb3b 100644 --- a/stable_baselines/td3/td3.py +++ 
b/stable_baselines/td3/td3.py @@ -1,6 +1,5 @@ import sys import time -import multiprocessing from collections import deque import warnings @@ -53,17 +52,23 @@ class TD3(OffPolicyRLModel): :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard Note: this has no effect on TD3 logging for now + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ - def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=50000, learning_starts=100, train_freq=100, gradient_steps=100, batch_size=128, tau=0.005, policy_delay=2, action_noise=None, target_policy_noise=0.2, target_noise_clip=0.5, random_exploration=0.0, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): + _init_setup_model=True, policy_kwargs=None, + full_tensorboard_log=False, seed=None, n_cpu_tf_sess=None): super(TD3, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, - policy_base=TD3Policy, requires_vec_env=False, policy_kwargs=policy_kwargs) + policy_base=TD3Policy, requires_vec_env=False, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.buffer_size = buffer_size self.learning_rate = learning_rate @@ -122,10 +127,8 @@ def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): - n_cpu = multiprocessing.cpu_count() - if sys.platform == 'darwin': - n_cpu //= 2 - self.sess = tf_util.make_session(num_cpu=n_cpu, graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) @@ -273,7 +276,7 @@ def _train_step(self, step, writer, learning_rate, update_policy): return qf1_loss, qf2_loss - def learn(self, total_timesteps, callback=None, seed=None, + def learn(self, total_timesteps, callback=None, log_interval=4, tb_log_name="TD3", reset_num_timesteps=True, replay_wrapper=None): new_tb_log = self._init_num_timesteps(reset_num_timesteps) @@ -284,7 +287,7 @@ def learn(self, total_timesteps, callback=None, seed=None, with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() # Transform to callable if needed self.learning_rate = get_schedule_fn(self.learning_rate) @@ -464,6 +467,8 @@ def save(self, save_path, cloudpickle=False): "action_space": self.action_space, "policy": self.policy, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "action_noise": self.action_noise, "random_exploration": self.random_exploration, "_vectorize_action": self._vectorize_action, diff --git a/stable_baselines/trpo_mpi/trpo_mpi.py b/stable_baselines/trpo_mpi/trpo_mpi.py index 4da14c2240..53839ad66a 100644 --- a/stable_baselines/trpo_mpi/trpo_mpi.py +++ b/stable_baselines/trpo_mpi/trpo_mpi.py @@ -39,13 +39,19 @@ class TRPO(ActorCriticRLModel): :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation :param full_tensorboard_log: (bool) enable additional logging when using tensorboard 
WARNING: this logging can take a lot of space quickly + :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow). + If None (default), use random seed. Note that if you want completely deterministic + results, you must set `n_cpu_tf_sess` to 1. + :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations + If None, the number of cpu of the current machine will be used. """ - def __init__(self, policy, env, gamma=0.99, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, lam=0.98, entcoeff=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, verbose=0, tensorboard_log=None, - _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False): + _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, + seed=None, n_cpu_tf_sess=1): super(TRPO, self).__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, - _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs) + _init_setup_model=_init_setup_model, policy_kwargs=policy_kwargs, + seed=seed, n_cpu_tf_sess=n_cpu_tf_sess) self.using_gail = False self.timesteps_per_batch = timesteps_per_batch @@ -118,7 +124,8 @@ def setup_model(self): self.graph = tf.Graph() with self.graph.as_default(): - self.sess = tf_util.single_threaded_session(graph=self.graph) + self.set_random_seed(self.seed) + self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) if self.using_gail: self.reward_giver = TransitionClassifier(self.observation_space, self.action_space, @@ -261,14 +268,14 @@ def allmean(arr): tf_util.function([observation, old_policy.obs_ph, action, atarg, ret], [self.summary, tf_util.flatgrad(optimgain, var_list)] + losses) - def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="TRPO", + def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="TRPO", reset_num_timesteps=True): new_tb_log = self._init_num_timesteps(reset_num_timesteps) with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \ as writer: - self._setup_learn(seed) + self._setup_learn() with self.sess.as_default(): seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_batch, @@ -513,6 +520,8 @@ def save(self, save_path, cloudpickle=False): "observation_space": self.observation_space, "action_space": self.action_space, "n_envs": self.n_envs, + "n_cpu_tf_sess": self.n_cpu_tf_sess, + "seed": self.seed, "_vectorize_action": self._vectorize_action, "policy_kwargs": self.policy_kwargs } diff --git a/tests/test_0deterministic.py b/tests/test_0deterministic.py new file mode 100644 index 0000000000..1ac6e855fd --- /dev/null +++ b/tests/test_0deterministic.py @@ -0,0 +1,35 @@ +import pytest + +from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO, TD3 +from stable_baselines.common.noise import NormalActionNoise + +N_STEPS_TRAINING = 5000 +SEED = 0 + +# Weird stuff: TD3 would fail if another algorithm is tested before +# with n_cpu_tf_sess > 1 +@pytest.mark.parametrize("algo", [A2C, ACKTR, ACER, DDPG, DQN, PPO1, PPO2, SAC, TRPO, TD3]) +def test_deterministic_training_common(algo): + results = [[], []] + rewards = [[], []] + kwargs = {'n_cpu_tf_sess': 1} + if algo in [DDPG, TD3, SAC]: + env_id = 'Pendulum-v0' + kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)}) + else: + env_id = 'CartPole-v1' + if algo == DQN: + kwargs.update({'learning_starts': 100}) + + for i in range(2): + model = algo('MlpPolicy', env_id, seed=SEED, 
**kwargs) + model.learn(N_STEPS_TRAINING) + env = model.get_env() + obs = env.reset() + for _ in range(100): + action, _ = model.predict(obs, deterministic=False) + obs, reward, _, _ = env.step(action) + results[i].append(action) + rewards[i].append(reward) + assert sum(results[0]) == sum(results[1]), results + assert sum(rewards[0]) == sum(rewards[1]), rewards diff --git a/tests/test_action_space.py b/tests/test_action_space.py index d33f3f024a..783b4795cb 100644 --- a/tests/test_action_space.py +++ b/tests/test_action_space.py @@ -26,7 +26,7 @@ def test_identity_multidiscrete(model_class): env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)]) model = model_class("MlpPolicy", env) - model.learn(total_timesteps=1000, seed=0) + model.learn(total_timesteps=1000) n_trials = 1000 reward_sum = 0 @@ -55,7 +55,7 @@ def test_identity_multibinary(model_class): env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)]) model = model_class("MlpPolicy", env) - model.learn(total_timesteps=1000, seed=0) + model.learn(total_timesteps=1000) n_trials = 1000 reward_sum = 0 diff --git a/tests/test_continuous.py b/tests/test_continuous.py index 9563910b7b..29f03bfd3a 100644 --- a/tests/test_continuous.py +++ b/tests/test_continuous.py @@ -45,7 +45,7 @@ def test_model_manipulation(request, model_class): # create and train model = model_class(policy="MlpPolicy", env=env) - model.learn(total_timesteps=NUM_TIMESTEPS, seed=0) + model.learn(total_timesteps=NUM_TIMESTEPS) # predict and measure the acc reward acc_reward = 0 @@ -115,7 +115,7 @@ def test_model_manipulation(request, model_class): "Error: the prediction seems to have changed between loading and saving" # learn post loading - model.learn(total_timesteps=100, seed=0) + model.learn(total_timesteps=100) # validate no reset post learning # This test was failing from time to time for no good reason diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py index b68ae0e50c..725d88ffeb 100644 --- a/tests/test_custom_policy.py +++ b/tests/test_custom_policy.py @@ -77,7 +77,7 @@ def test_custom_policy(request, model_name): # create and train model = model_class(policy, env) - model.learn(total_timesteps=100, seed=0) + model.learn(total_timesteps=100) env = model.get_env() # predict and measure the acc reward @@ -119,7 +119,7 @@ def test_custom_policy_kwargs(request, model_name): # create and train model = model_class(policy, env, policy_kwargs=policy_kwargs) - model.learn(total_timesteps=100, seed=0) + model.learn(total_timesteps=100) model.save(model_fname) del model @@ -130,12 +130,12 @@ def test_custom_policy_kwargs(request, model_name): # Load with specifying policy_kwargs model = model_class.load(model_fname, policy=policy, env=env, policy_kwargs=policy_kwargs) - model.learn(total_timesteps=100, seed=0) + model.learn(total_timesteps=100) del model # Load without specifying policy_kwargs model = model_class.load(model_fname, policy=policy, env=env) - model.learn(total_timesteps=100, seed=0) + model.learn(total_timesteps=100) del model # Load with different wrong policy_kwargs diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py deleted file mode 100644 index 9990663540..0000000000 --- a/tests/test_deterministic.py +++ /dev/null @@ -1,71 +0,0 @@ -import pytest - -from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO -from stable_baselines.ddpg import AdaptiveParamNoiseSpec -from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox -from stable_baselines.common.vec_env import 
diff --git a/tests/test_action_space.py b/tests/test_action_space.py
index d33f3f024a..783b4795cb 100644
--- a/tests/test_action_space.py
+++ b/tests/test_action_space.py
@@ -26,7 +26,7 @@ def test_identity_multidiscrete(model_class):
     env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])

     model = model_class("MlpPolicy", env)
-    model.learn(total_timesteps=1000, seed=0)
+    model.learn(total_timesteps=1000)

     n_trials = 1000
     reward_sum = 0
@@ -55,7 +55,7 @@ def test_identity_multibinary(model_class):
     env = DummyVecEnv([lambda: IdentityEnvMultiBinary(10)])

     model = model_class("MlpPolicy", env)
-    model.learn(total_timesteps=1000, seed=0)
+    model.learn(total_timesteps=1000)

     n_trials = 1000
     reward_sum = 0
diff --git a/tests/test_continuous.py b/tests/test_continuous.py
index 9563910b7b..29f03bfd3a 100644
--- a/tests/test_continuous.py
+++ b/tests/test_continuous.py
@@ -45,7 +45,7 @@ def test_model_manipulation(request, model_class):

         # create and train
         model = model_class(policy="MlpPolicy", env=env)
-        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)
+        model.learn(total_timesteps=NUM_TIMESTEPS)

         # predict and measure the acc reward
         acc_reward = 0
@@ -115,7 +115,7 @@ def test_model_manipulation(request, model_class):
             "Error: the prediction seems to have changed between loading and saving"

         # learn post loading
-        model.learn(total_timesteps=100, seed=0)
+        model.learn(total_timesteps=100)

         # validate no reset post learning
         # This test was failing from time to time for no good reason
diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py
index b68ae0e50c..725d88ffeb 100644
--- a/tests/test_custom_policy.py
+++ b/tests/test_custom_policy.py
@@ -77,7 +77,7 @@ def test_custom_policy(request, model_name):

         # create and train
         model = model_class(policy, env)
-        model.learn(total_timesteps=100, seed=0)
+        model.learn(total_timesteps=100)

         env = model.get_env()
         # predict and measure the acc reward
@@ -119,7 +119,7 @@ def test_custom_policy_kwargs(request, model_name):

     # create and train
     model = model_class(policy, env, policy_kwargs=policy_kwargs)
-    model.learn(total_timesteps=100, seed=0)
+    model.learn(total_timesteps=100)

     model.save(model_fname)
     del model
@@ -130,12 +130,12 @@ def test_custom_policy_kwargs(request, model_name):

     # Load with specifying policy_kwargs
     model = model_class.load(model_fname, policy=policy, env=env, policy_kwargs=policy_kwargs)
-    model.learn(total_timesteps=100, seed=0)
+    model.learn(total_timesteps=100)
     del model

     # Load without specifying policy_kwargs
     model = model_class.load(model_fname, policy=policy, env=env)
-    model.learn(total_timesteps=100, seed=0)
+    model.learn(total_timesteps=100)
     del model

     # Load with different wrong policy_kwargs
diff --git a/tests/test_deterministic.py b/tests/test_deterministic.py
deleted file mode 100644
index 9990663540..0000000000
--- a/tests/test_deterministic.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import pytest
-
-from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO
-from stable_baselines.ddpg import AdaptiveParamNoiseSpec
-from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
-from stable_baselines.common.vec_env import DummyVecEnv
-
-PARAM_NOISE_DDPG = AdaptiveParamNoiseSpec(initial_stddev=float(0.2), desired_action_stddev=float(0.2))
-
-# Hyperparameters for learning identity for each RL model
-LEARN_FUNC_DICT = {
-    'a2c': lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
-    'acer': lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
-    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
-    'dqn': lambda e: DQN(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
-    'ddpg': lambda e: DDPG(policy="MlpPolicy", env=e, param_noise=PARAM_NOISE_DDPG).learn(total_timesteps=1000),
-    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
-    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
-    'sac': lambda e: SAC(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
-    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
-}
-
-
-@pytest.mark.slow
-@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo'])
-def test_identity(model_name):
-    """
-    Test if the algorithm (with a given policy)
-    can learn an identity transformation (i.e. return observation as an action)
-
-    :param model_name: (str) Name of the RL model
-    """
-    env = DummyVecEnv([lambda: IdentityEnv(10)])
-
-    model = LEARN_FUNC_DICT[model_name](env)
-
-    n_trials = 1000
-    obs = env.reset()
-    action_shape = model.predict(obs, deterministic=False)[0].shape
-    action, _ = model.predict(obs, deterministic=True)
-    assert action.shape == action_shape
-    for _ in range(n_trials):
-        new_action = model.predict(obs, deterministic=True)[0]
-        assert action == model.predict(obs, deterministic=True)[0]
-        assert new_action.shape == action_shape
-    # Free memory
-    del model, env
-
-
-@pytest.mark.slow
-@pytest.mark.parametrize("model_name", ['a2c', 'ddpg', 'ppo1', 'ppo2', 'sac', 'trpo'])
-def test_identity_continuous(model_name):
-    """
-    Test if the algorithm (with a given policy)
-    can learn an identity transformation (i.e. return observation as an action)
-
-    :param model_name: (str) Name of the RL model
-    """
-    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
-
-    model = LEARN_FUNC_DICT[model_name](env)
-
-    n_trials = 1000
-    obs = env.reset()
-    action_shape = model.predict(obs, deterministic=False)[0].shape
-    action, _ = model.predict(obs, deterministic=True)
-    assert action.shape == action_shape
-    for _ in range(n_trials):
-        new_action = model.predict(obs, deterministic=True)[0]
-        assert action == model.predict(obs, deterministic=True)[0]
-        assert new_action.shape == action_shape
diff --git a/tests/test_identity.py b/tests/test_identity.py
index 8a7cd51d29..5fd18ad36b 100644
--- a/tests/test_identity.py
+++ b/tests/test_identity.py
@@ -11,19 +11,19 @@

 # Hyperparameters for learning identity for each RL model
 LEARN_FUNC_DICT = {
     'a2c': lambda e: A2C(policy="MlpPolicy", learning_rate=1e-3, n_steps=1,
-                         gamma=0.7, env=e).learn(total_timesteps=10000, seed=0),
-    'acer': lambda e: ACER(policy="MlpPolicy", env=e,
-                           n_steps=1, replay_ratio=1).learn(total_timesteps=15000, seed=0),
-    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e,
-                             learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000, seed=0),
+                         gamma=0.7, env=e, seed=0).learn(total_timesteps=10000),
+    'acer': lambda e: ACER(policy="MlpPolicy", env=e, seed=0,
+                           n_steps=1, replay_ratio=1).learn(total_timesteps=15000),
+    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e, seed=0,
+                             learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000),
     'dqn': lambda e: DQN(policy="MlpPolicy", batch_size=16, gamma=0.1,
-                         exploration_fraction=0.001, env=e).learn(total_timesteps=40000, seed=0),
-    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, lam=0.5,
-                           optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000, seed=0),
-    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e,
-                           learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000, seed=0),
-    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e,
-                           max_kl=0.05, lam=0.7).learn(total_timesteps=10000, seed=0),
+                         exploration_fraction=0.001, env=e, seed=0).learn(total_timesteps=40000),
+    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, seed=0, lam=0.5,
+                           optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000),
+    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e, seed=0,
+                           learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000),
+    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e, seed=0,
+                           max_kl=0.05, lam=0.7).learn(total_timesteps=10000),
 }

@@ -76,8 +76,9 @@ def test_identity_continuous(model_class):
     else:
         action_noise = None

-    model = model_class("MlpPolicy", env, gamma=0.1, action_noise=action_noise, buffer_size=int(1e6))
-    model.learn(total_timesteps=20000, seed=0)
+    model = model_class("MlpPolicy", env, gamma=0.1, seed=0,
+                        action_noise=action_noise, buffer_size=int(1e6))
+    model.learn(total_timesteps=20000)

     n_trials = 1000
     reward_sum = 0
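The test updates above and below also show the migration path for user code: the seed is now given to the model constructor instead of learn(). An illustrative before/after sketch (PPO2 and the timestep count are arbitrary):

from stable_baselines import PPO2

# Before this change:
# model = PPO2('MlpPolicy', 'CartPole-v1')
# model.learn(total_timesteps=10000, seed=0)

# After this change:
model = PPO2('MlpPolicy', 'CartPole-v1', seed=0)
model.learn(total_timesteps=10000)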
diff --git a/tests/test_lstm_policy.py b/tests/test_lstm_policy.py
index c6bc249fda..ff4cbb2c11 100644
--- a/tests/test_lstm_policy.py
+++ b/tests/test_lstm_policy.py
@@ -82,7 +82,7 @@ def test_lstm_policy(request, model_class, policy):
         model = model_class(policy, 'CartPole-v1', nminibatches=1)
     else:
         model = model_class(policy, 'CartPole-v1')
-    model.learn(total_timesteps=100, seed=0)
+    model.learn(total_timesteps=100)

     env = model.get_env()
     # predict and measure the acc reward
@@ -123,7 +123,7 @@ def reward_callback(local, _):
         nonlocal eprewmeans
         eprewmeans.append(safe_mean([ep_info['r'] for ep_info in local['ep_info_buf']]))

-    model.learn(total_timesteps=100000, seed=0, callback=reward_callback)
+    model.learn(total_timesteps=100000, callback=reward_callback)

     # Maximum episode reward is 500.
     # In CartPole-v1, a non-recurrent policy can easily get >= 450.
diff --git a/tests/test_save.py b/tests/test_save.py
index 187d3742c1..28a5e21483 100644
--- a/tests/test_save.py
+++ b/tests/test_save.py
@@ -58,7 +58,7 @@ def test_model_manipulation(request, model_class, storage_method, store_format):

         # create and train
         model = model_class(policy="MlpPolicy", env=env)
-        model.learn(total_timesteps=50000, seed=0)
+        model.learn(total_timesteps=50000)

         # predict and measure the acc reward
         acc_reward = 0
@@ -119,7 +119,7 @@ def test_model_manipulation(request, model_class, storage_method, store_format):
             "loading and saving"

         # learn post loading
-        model.learn(total_timesteps=100, seed=0)
+        model.learn(total_timesteps=100)

         # validate no reset post learning
         loaded_acc_reward = 0