diff --git a/.github/ISSUE_TEMPLATE/issue-template.md b/.github/ISSUE_TEMPLATE/issue-template.md index d7045fe688..fb89edae50 100644 --- a/.github/ISSUE_TEMPLATE/issue-template.md +++ b/.github/ISSUE_TEMPLATE/issue-template.md @@ -10,6 +10,16 @@ If you have any questions, feel free to create an issue with the tag [question]. If you wish to suggest an enhancement or feature request, add the tag [feature request]. If you are submitting a bug report, please fill in the following details. +If your issue is related to a custom gym environment, please check it first using: + +```python +from stable_baselines.common.env_checker import check_env + +env = CustomEnv(arg1, ...) +# It will check your custom environment and output additional warnings if needed +check_env(env) +``` + **Describe the bug** A clear and concise description of what the bug is. diff --git a/.travis.yml b/.travis.yml index 3257911d74..eae10487a0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ python: env: global: - - DOCKER_IMAGE=stablebaselines/stable-baselines-cpu:v2.9.0 + - DOCKER_IMAGE=stablebaselines/stable-baselines-cpu:v3.0.0 notifications: email: false @@ -21,29 +21,36 @@ script: jobs: include: # Big test suite. Run in parallel to decrease wall-clock time, and to avoid OOM error from leaks - - stage: Test - name: "Unit Tests a-h" - env: TEST_GLOB="[a-h]*" - - - name: "Unit Tests i-l" - env: TEST_GLOB="[i-l]*" - - - name: "Unit Tests m-sa" - env: TEST_GLOB="{[m-r]*,sa*}" - - - name: "Unit Tests sb-z" - env: TEST_GLOB="{s[b-z]*,[t-z]*}" - - - name: "Unit Tests determinism" - env: TEST_GLOB="0deterministic.py" + # TODO: reactivate for tf2 + # - stage: Test + # name: "Unit Tests a-h" + # env: TEST_GLOB="[a-h]*" + # + # - name: "Unit Tests i-l" + # env: TEST_GLOB="[i-l]*" + # + # - name: "Unit Tests m-sa" + # env: TEST_GLOB="{[m-r]*,sa*}" + # + # - name: "Unit Tests sb-z" + # env: TEST_GLOB="{s[b-z]*,[t-z]*}" + # + # - name: "Unit Tests determinism" + # env: TEST_GLOB="0deterministic.py" - - name: "Sphinx Documentation" - script: - - 'docker run -it --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pip install .[docs] && pushd docs/ && make clean && make html"' - - - name: "Type Checking" - script: - - 'docker run --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pytype"' + - stage: Test + name: "Unit Tests" + env: TEST_GLOB="*" + + # TODO: reactivate for tf2 + # - name: "Sphinx Documentation" + # script: + # - 'docker run -it --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pip install .[docs] && pushd docs/ && make clean && make html"' + + # TODO: reactivate for tf2 + # - name: "Type Checking" + # script: + # - 'docker run --rm --mount src=$(pwd),target=/root/code/stable-baselines,type=bind ${DOCKER_IMAGE} bash -c "cd /root/code/stable-baselines/ && pytype"' - stage: Codacy Trigger if: type != pull_request diff --git a/Dockerfile b/Dockerfile index 254b2bba59..a17496345c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,9 +34,9 @@ RUN \ cd $CODE_DIR && \ pip install --upgrade pip && \ if [[ $USE_GPU == "True" ]]; then \ - TENSORFLOW_PACKAGE="tensorflow-gpu==1.8.0"; \ + TENSORFLOW_PACKAGE="tensorflow-gpu"; \ else \ - TENSORFLOW_PACKAGE="tensorflow==1.8.0"; \ + TENSORFLOW_PACKAGE="tensorflow"; \ fi; \ pip install ${TENSORFLOW_PACKAGE} && \ pip install -e .[mpi,tests] && \ diff 
--git a/README.md b/README.md index 7925a89b4d..137f1e894c 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Documentation: https://stable-baselines.readthedocs.io/en/master/guide/rl_zoo.ht ## Installation -**Note:** Stabe-Baselines supports Tensorflow versions from 1.8.0 to 1.14.0. Support for Tensorflow 2 API is planned. +**Note:** Stabe-Baselines requires Tensorflow >= 2.x.x For older version, please look at branch Stable-Baselines < 3.x.x. ### Prerequisites Baselines requires python3 (>=3.5) with the development headers. You'll also need system packages CMake, OpenMPI and zlib. Those can be installed as follows diff --git a/docs/_static/img/mistake.png b/docs/_static/img/mistake.png new file mode 100644 index 0000000000..8fae18b599 Binary files /dev/null and b/docs/_static/img/mistake.png differ diff --git a/docs/common/env_checker.rst b/docs/common/env_checker.rst new file mode 100644 index 0000000000..404f6d6ac0 --- /dev/null +++ b/docs/common/env_checker.rst @@ -0,0 +1,7 @@ +.. _env_checker: + +Gym Environment Checker +======================== + +.. automodule:: stable_baselines.common.env_checker + :members: diff --git a/docs/common/schedules.rst b/docs/common/schedules.rst index dc545ae0a9..968a067601 100644 --- a/docs/common/schedules.rst +++ b/docs/common/schedules.rst @@ -3,8 +3,8 @@ Schedules ========= -Schedules are used as hyperparameter for most of the algortihms, -in order to change value of a parameter over time (usuallly the learning rate). +Schedules are used as hyperparameter for most of the algorithms, +in order to change value of a parameter over time (usually the learning rate). .. automodule:: stable_baselines.common.schedules diff --git a/docs/conf.py b/docs/conf.py index dfadbcc85e..f4768de100 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -16,6 +16,14 @@ import sys from unittest.mock import MagicMock +# We CANNOT enable 'sphinxcontrib.spelling' because ReadTheDocs.org does not support +# PyEnchant. +try: + import sphinxcontrib.spelling + enable_spell_check = True +except ImportError: + enable_spell_check = False + # source code directory, relative to this file, for sphinx-autobuild sys.path.insert(0, os.path.abspath('..')) @@ -69,6 +77,9 @@ def __getattr__(cls, name): 'sphinx.ext.viewcode', ] +if enable_spell_check: + extensions.append('sphinxcontrib.spelling') + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/guide/custom_env.rst b/docs/guide/custom_env.rst index 02c8651c04..e3b91ab553 100644 --- a/docs/guide/custom_env.rst +++ b/docs/guide/custom_env.rst @@ -34,9 +34,13 @@ That is to say, your environment must implement the following methods (and inher def step(self, action): ... + return observation, reward, done, info def reset(self): ... - def render(self, mode='human', close=False): + return observation # reward, done, info can't be included + def render(self, mode='human'): + ... + def close (self): ... @@ -44,13 +48,28 @@ Then you can define and train a RL agent with: .. code-block:: python - # Instantiate and wrap the env - env = DummyVecEnv([lambda: CustomEnv(arg1, ...)]) + # Instantiate the env + env = CustomEnv(arg1, ...) # Define and Train the agent - model = A2C(CnnPolicy, env).learn(total_timesteps=1000) + model = A2C('CnnPolicy', env).learn(total_timesteps=1000) + + +To check that your environment follows the gym interface, please use: + +.. code-block:: python + + from stable_baselines.common.env_checker import check_env + + env = CustomEnv(arg1, ...) 
+ # It will check your custom environment and output additional warnings if needed + check_env(env) + + +We have created a `colab notebook `_ for +a concrete example of creating a custom environment. -You can find a `complete guide online `_ +You can also find a `complete guide online `_ on creating a custom Gym environment. diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst index 0bb65d9e6a..c5d745feb7 100644 --- a/docs/guide/examples.rst +++ b/docs/guide/examples.rst @@ -7,6 +7,8 @@ Try it online with Colab Notebooks! All the following examples can be executed online using Google colab |colab| notebooks: +- `Full Tutorial `_ +- `All Notebooks `_ - `Getting Started`_ - `Training, Saving, Loading`_ - `Multiprocessing`_ @@ -16,14 +18,14 @@ notebooks: - `Hindsight Experience Replay`_ - `RL Baselines zoo`_ -.. _Getting Started: https://colab.research.google.com/drive/1_1H5bjWKYBVKbbs-Kj83dsfuZieDNcFU -.. _Training, Saving, Loading: https://colab.research.google.com/drive/16QritJF5kgT3mtnODepld1fo5tFnFCoc -.. _Multiprocessing: https://colab.research.google.com/drive/1ZzNFMUUi923foaVsYb4YjPy4mjKtnOxb -.. _Monitor Training and Plotting: https://colab.research.google.com/drive/1L_IMo6v0a0ALK8nefZm6PqPSy0vZIWBT -.. _Atari Games: https://colab.research.google.com/drive/1iYK11yDzOOqnrXi1Sfjm1iekZr4cxLaN -.. _Breakout: https://colab.research.google.com/drive/14NwwEHwN4hdNgGzzySjxQhEVDff-zr7O -.. _Hindsight Experience Replay: https://colab.research.google.com/drive/1VDD0uLi8wjUXIqAdLKiK15XaEe0z2FOc -.. _RL Baselines zoo: https://colab.research.google.com/drive/1cPGK3XrCqEs3QLqiijsfib9OFht3kObX +.. _Getting Started: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_getting_started.ipynb +.. _Training, Saving, Loading: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/saving_loading_dqn.ipynb +.. _Multiprocessing: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/multiprocessing_rl.ipynb +.. _Monitor Training and Plotting: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/monitor_training.ipynb +.. _Atari Games: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/atari_games.ipynb +.. _Breakout: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/breakout.ipynb +.. _Hindsight Experience Replay: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/stable_baselines_her.ipynb +.. _RL Baselines zoo: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/rl-baselines-zoo.ipynb .. |colab| image:: ../_static/img/colab.svg @@ -34,7 +36,7 @@ In the following example, we will train, save and load a DQN model on the Lunar .. image:: ../_static/img/try_it.png :scale: 30 % - :target: https://colab.research.google.com/drive/16QritJF5kgT3mtnODepld1fo5tFnFCoc + :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/saving_loading_dqn.ipynb .. figure:: https://cdn-images-1.medium.com/max/960/1*f4VZPKOI0PYNWiwt0la0Rg.gif @@ -89,7 +91,7 @@ Multiprocessing: Unleashing the Power of Vectorized Environments .. 
image:: ../_static/img/try_it.png :scale: 30 % - :target: https://colab.research.google.com/drive/1ZzNFMUUi923foaVsYb4YjPy4mjKtnOxb + :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/multiprocessing_rl.ipynb .. figure:: https://cdn-images-1.medium.com/max/960/1*h4WTQNVIsvMXJTCpXm_TAw.gif @@ -153,7 +155,7 @@ If your callback returns False, training is aborted early. .. image:: ../_static/img/try_it.png :scale: 30 % - :target: https://colab.research.google.com/drive/1L_IMo6v0a0ALK8nefZm6PqPSy0vZIWBT + :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/monitor_training.ipynb .. figure:: ../_static/img/learning_curve.png @@ -240,7 +242,7 @@ and multiprocessing for you. .. image:: ../_static/img/try_it.png :scale: 30 % - :target: https://colab.research.google.com/drive/1iYK11yDzOOqnrXi1Sfjm1iekZr4cxLaN + :target: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/atari_games.ipynb .. code-block:: python @@ -457,7 +459,7 @@ For this example, we are using `Highway-Env `_ for details. +Export to C++ +----------------- + +Tensorflow, which is the backbone of Stable Baselines, is fundamentally a C/C++ library despite being most commonly accessed +through the Python frontend layer. This design choice means that the models created at Python level should generally be +fully compliant with the respective C++ version of Tensorflow. + +.. warning:: + It is advisable not to mix-and-match different versions of Tensorflow libraries, particularly in terms of the state. + Moving computational graphs is generally more forgiving. As a matter of fact, mentioned below `PPO_CPP `_ project uses + graphs generated with Python Tensorflow 1.x in C++ Tensorflow 2 version. + +Stable Baselines comes very handily when hoping to migrate a computational graph and/or a state (weights) as +the existing algorithms define most of the necessary computations for you so you don't need to recreate the core of the algorithms again. +This is exactly the idea that has been used in the `PPO_CPP `_ project, which executes the training at the C++ level for the sake of +computational efficiency. The graphs are exported from Stable Baselines' PPO2 implementation through ``tf.train.export_meta_graph`` +function. Alternatively, and perhaps more commonly, you could use the C++ layer only for inference. That could be useful +as a deployment step of server backends or optimization for more limited devices. + +.. warning:: + As a word of caution, C++-level APIs are more imperative than their Python counterparts or more plainly speaking: cruder. + This is particularly apparent in Tensorflow 2.0 where the declarativeness of Autograph exists only at Python level. The + C++ counterpart still operates on Session objects' use, which are known from earlier versions of Tensorflow. In our use case, + availability of graphs utilized by Session depends on the use of ``tf.function`` decorators. However, as of November 2019, Stable Baselines still + uses Tensorflow 1.x in the main version which is slightly easier to use in the context of the C++ portability. + + Export to tensorflowjs / tfjs ----------------------------- diff --git a/docs/guide/install.rst b/docs/guide/install.rst index 39326daf27..134f4a7fe2 100644 --- a/docs/guide/install.rst +++ b/docs/guide/install.rst @@ -169,7 +169,7 @@ Explanation of the docker command: - ``--ipc=host`` Use the host system’s IPC namespace. 
IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores and message queues. -- ``--name test`` give explicitely the name ``test`` to the container, +- ``--name test`` give explicitly the name ``test`` to the container, otherwise it will be assigned a random name - ``--mount src=...`` give access of the local directory (``pwd`` command) to the container (it will be map to ``/root/code/stable-baselines``), so diff --git a/docs/guide/pretrain.rst b/docs/guide/pretrain.rst index b38e7d6fdc..788f91dbc6 100644 --- a/docs/guide/pretrain.rst +++ b/docs/guide/pretrain.rst @@ -80,7 +80,7 @@ The idea is that this callable can be a PID controller, asking a human player, . return env.action_space.sample() # Data will be saved in a numpy archive named `expert_cartpole.npz` # when using something different than an RL expert, - # you must pass the environment object explicitely + # you must pass the environment object explicitly generate_expert_traj(dummy_expert, 'dummy_expert_cartpole', env, n_episodes=10) diff --git a/docs/guide/rl.rst b/docs/guide/rl.rst index 3db01d41e1..ca9aeb1dac 100644 --- a/docs/guide/rl.rst +++ b/docs/guide/rl.rst @@ -12,4 +12,6 @@ However, if you want to learn about RL, there are several good resources to get - `OpenAI Spinning Up `_ - `David Silver's course `_ - `Lilian Weng's blog `_ +- `Berkeley's Deep RL Bootcamp `_ +- `Berkeley's Deep Reinforcement Learning course `_ - `More resources `_ diff --git a/docs/guide/rl_tips.rst b/docs/guide/rl_tips.rst new file mode 100644 index 0000000000..27bd3b9e8b --- /dev/null +++ b/docs/guide/rl_tips.rst @@ -0,0 +1,246 @@ +.. _rl_tips: + +====================================== +Reinforcement Learning Tips and Tricks +====================================== + +The aim of this section is to help you do reinforcement learning experiments. +It covers general advice about RL (where to start, which algorithm to choose, how to evaluate an algorithm, ...), +as well as tips and tricks when using a custom environment or implementing an RL algorithm. + + +General advice when using Reinforcement Learning +================================================ + +TL;DR +----- + +1. Read about RL and Stable Baselines +2. Do quantitative experiments and hyperparameter tuning if needed +3. Evaluate the performance using a separate test environment +4. For better performance, increase the training budget + + +Like any other subject, if you want to work with RL, you should first read about it (we have a dedicated `resource page `_ to get you started) +to understand what you are using. We also recommend you read the Stable Baselines (SB) documentation and do the `tutorial `_. +It covers basic usage and guides you towards more advanced concepts of the library (e.g. callbacks and wrappers). + +Reinforcement Learning differs from other machine learning methods in several ways. The data used to train the agent is collected +through interactions with the environment by the agent itself (compared to supervised learning, where you have a fixed dataset for instance). +This dependence can lead to a vicious circle: if the agent collects poor quality data (e.g., trajectories with no rewards), then it will not improve and will continue to amass +bad trajectories. + +This factor, among others, explains that results in RL may vary from one run to another (i.e., when only the seed of the pseudo-random generator changes). +For this reason, you should always do several runs to have quantitative results.
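As a concrete illustration of running several seeds and reporting the spread, here is a minimal sketch (an editorial addition, not part of the original guide). It assumes the 2.9.0-style API described in this changeset (the ``seed`` argument in the model constructor and the ``evaluate_policy`` helper returning the mean and standard deviation of the episode reward); ``CartPole-v1`` is only a placeholder for your own environment, and the algorithm imports are temporarily disabled in the work-in-progress TensorFlow 2 alpha.

.. code-block:: python

    import gym
    import numpy as np

    # NOTE: sketch assuming the Stable Baselines 2.9 API
    from stable_baselines import A2C
    from stable_baselines.common.evaluation import evaluate_policy

    mean_rewards = []
    for seed in range(3):
        # Train the same configuration several times, changing only the seed
        env = gym.make('CartPole-v1')
        model = A2C('MlpPolicy', env, seed=seed, verbose=0)
        model.learn(total_timesteps=25000)
        # Evaluate on a separate test environment
        eval_env = gym.make('CartPole-v1')
        mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=10)
        mean_rewards.append(mean_reward)

    # Report the spread across runs rather than a single number
    print("Reward over seeds: {:.1f} +/- {:.1f}".format(np.mean(mean_rewards), np.std(mean_rewards)))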
+ +Good results in RL are generally dependent on finding appropriate hyperparameters. Recent algorithms (PPO, SAC, TD3) normally require little hyperparameter tuning; +however, *don't expect the default ones to work* on any environment. + +Therefore, we *highly recommend you* take a look at the `RL zoo `_ (or the original papers) for tuned hyperparameters. +A best practice when you apply RL to a new problem is to do automatic hyperparameter optimization. Again, this is included in the `RL zoo `_. + +When applying RL to a custom problem, you should always normalize the input to the agent (e.g. using VecNormalize for PPO2/A2C) +and look at common preprocessing done on other environments (e.g. for `Atari `_, frame-stack, ...). +Please refer to the *Tips and Tricks when creating a custom environment* paragraph below for more advice related to custom environments. + + +Current Limitations of RL +------------------------- + +You have to be aware of the current `limitations `_ of reinforcement learning. + + +Model-free RL algorithms (i.e. all the algorithms implemented in SB) are usually *sample inefficient*. They require a lot of samples (sometimes millions of interactions) to learn something useful. +That's why most of the successes in RL were achieved on games or in simulation only. For instance, in this `work `_ by ETH Zurich, the ANYmal robot was trained in simulation only, and then tested in the real world. + +As general advice, to obtain better performance, you should increase the training budget of the agent (number of training timesteps). + + +In order to achieve a desired behavior, expert knowledge is often required to design an adequate reward function. +This *reward engineering* (or *RewArt* as coined by `Freek Stulp `_) necessitates several iterations. As a good example of reward shaping, +you can take a look at the `Deep Mimic paper `_, which combines imitation learning and reinforcement learning to do acrobatic moves. + +One last limitation of RL is the instability of training. That is to say, you can observe a huge drop in performance during training. +This behavior is particularly present in `DDPG`; that's why its extension `TD3` tries to tackle that issue. +Other methods, like `TRPO` or `PPO`, make use of a *trust region* to minimize that problem by avoiding too large an update. + + +How to evaluate an RL algorithm? +-------------------------------- + +Because most algorithms use exploration noise during training, you need a separate test environment to evaluate the performance +of your agent at a given time. It is recommended to periodically evaluate your agent for `n` test episodes (`n` is usually between 5 and 20) +and average the reward per episode to have a good estimate. + +As some policies are stochastic by default (e.g. A2C or PPO), you should also try to set `deterministic=True` when calling the `.predict()` method, +as this frequently leads to better performance. +Looking at the training curve (episode reward as a function of the timesteps) is a good proxy but underestimates the agent's true performance. + + +We suggest reading `Deep Reinforcement Learning that Matters `_ for a good discussion about RL evaluation. + +You can also take a look at this `blog post `_ +and this `issue `_ by Cédric Colas.
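To make the periodic evaluation described above concrete, here is a minimal sketch (an editorial addition, not part of the original guide) that plugs ``evaluate_policy`` into the callback mechanism used in the examples page. The ``callback(locals_, globals_)`` signature, the ``num_timesteps`` attribute and the evaluation frequency are assumptions based on the 2.x API, and ``CartPole-v1`` is only a placeholder environment.

.. code-block:: python

    import gym

    from stable_baselines import PPO2
    from stable_baselines.common.evaluation import evaluate_policy

    eval_env = gym.make('CartPole-v1')
    last_eval = [0]  # mutable container so the callback can keep state between calls

    def evaluation_callback(locals_, globals_):
        # NOTE: assumes the Stable Baselines 2.x callback API: called during training
        # with the local/global variables, training stops early if it returns False
        model = locals_['self']
        if model.num_timesteps - last_eval[0] >= 5000:
            last_eval[0] = model.num_timesteps
            mean_reward, std_reward = evaluate_policy(model, eval_env,
                                                      n_eval_episodes=10,
                                                      deterministic=True)
            print("Steps: {} - mean reward: {:.1f} +/- {:.1f}".format(
                model.num_timesteps, mean_reward, std_reward))
        return True

    model = PPO2('MlpPolicy', gym.make('CartPole-v1'), verbose=0)
    model.learn(total_timesteps=50000, callback=evaluation_callback)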
+ + +Which algorithm should I use? +============================= + +There is no silver bullet in RL: depending on your needs and problem, you may choose one algorithm or another. +The first distinction comes from your action space, i.e., do you have discrete (e.g. LEFT, RIGHT, ...) +or continuous actions (e.g. go to a certain speed)? + +Some algorithms are only tailored for one or the other domain: `DQN` only supports discrete actions, whereas `SAC` is restricted to continuous actions. + +The second difference that will help you choose is whether you can parallelize your training or not, and how you can do it (with or without MPI?). +If what matters is the wall clock training time, then you should lean towards `A2C` and its derivatives (PPO, ACER, ACKTR, ...). +Take a look at the `Vectorized Environments `_ to learn more about training with multiple workers. + +To sum it up: + +Discrete Actions +---------------- + +.. note:: + + This covers `Discrete`, `MultiDiscrete`, `Binary` and `MultiBinary` spaces + + +Discrete Actions - Single Process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +DQN with extensions (double DQN, prioritized replay, ...) and ACER are the recommended algorithms. +DQN is usually slower to train (regarding wall clock time) but is the most sample efficient (because of its replay buffer). + +Discrete Actions - Multiprocessed +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You should give PPO2, A2C and their successors (ACKTR, ACER) a try. + +If you can multiprocess the training using MPI, then you should check out PPO1 and TRPO. + + +Continuous Actions +------------------ + +Continuous Actions - Single Process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Current State Of The Art (SOTA) algorithms are `SAC` and `TD3`. +Please use the hyperparameters in the `RL zoo `_ for best results. + + +Continuous Actions - Multiprocessed +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Take a look at PPO2, TRPO or A2C. Again, don't forget to take the hyperparameters from the `RL zoo `_ +for continuous action problems (cf *Bullet* envs). + +.. note:: + + Normalization is critical for those algorithms + +If you can use MPI, then you can choose between PPO1, TRPO and DDPG. + + +Goal Environment +----------------- + +If your environment follows the `GoalEnv` interface (cf `HER <../modules/her.html>`_), then you should use +HER + (SAC/TD3/DDPG/DQN) depending on the action space. + + +.. note:: + + The number of workers is an important hyperparameter for experiments with HER. Currently, only HER+DDPG supports multiprocessing using MPI. + + + +Tips and Tricks when creating a custom environment +================================================== + +If you want to learn how to create a custom environment, we recommend you read this `page `_. +We also provide a `colab notebook `_ for +a concrete example of creating a custom gym environment. + +Some basic advice: + +- always normalize your observation space when you can, i.e., when you know the boundaries +- normalize your action space and make it symmetric when continuous (cf potential issue below). A good practice is to rescale your actions to lie in [-1, 1] (see the sketch further below). This does not limit you as you can easily rescale the action inside the environment +- start with a shaped reward (i.e. informative reward) and a simplified version of your problem +- debug with random actions to check that your environment works and follows the gym interface: + + +We provide a helper to check that your environment runs without error: + +.. code-block:: python + + from stable_baselines.common.env_checker import check_env + + env = CustomEnv(arg1, ...) + # It will check your custom environment and output additional warnings if needed + check_env(env)
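The action rescaling advised in the list above can live entirely inside the environment. The sketch below is an editorial addition with a hypothetical ``MyRobotEnv`` whose motors expect commands in ``[0, 10]``: the agent only ever sees a symmetric ``[-1, 1]`` action space, and ``step()`` maps the action back to the real control range.

.. code-block:: python

    import gym
    import numpy as np
    from gym import spaces


    class MyRobotEnv(gym.Env):
        """Hypothetical toy env whose motors expect commands in [0, 10]."""

        MOTOR_LOW, MOTOR_HIGH = 0.0, 10.0

        def __init__(self):
            super(MyRobotEnv, self).__init__()
            # Expose a normalized, symmetric action space to the agent
            self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)
            self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)

        def step(self, action):
            # Rescale from [-1, 1] to the true motor range before applying the command
            motor_command = self.MOTOR_LOW + (action + 1.0) * 0.5 * (self.MOTOR_HIGH - self.MOTOR_LOW)
            # ... send motor_command to the robot or simulator here ...
            obs = np.zeros(4, dtype=np.float32)  # placeholder dynamics
            reward, done, info = 0.0, False, {}
            return obs, reward, done, info

        def reset(self):
            return np.zeros(4, dtype=np.float32)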
+ + +If you want to quickly try a random agent on your environment, you can also do: + +.. code-block:: python + + env = YourEnv() + obs = env.reset() + n_steps = 10 + for _ in range(n_steps): + # Random action + action = env.action_space.sample() + obs, reward, done, info = env.step(action) + + +**Why should I normalize the action space?** + + +Most reinforcement learning algorithms rely on a Gaussian distribution (initially centered at 0 with std 1) for continuous actions. +So, if you forget to normalize the action space when using a custom environment, +this can harm learning and be difficult to debug (cf attached image and `issue #473 `_). + +.. figure:: ../_static/img/mistake.png + + +Another consequence of using a Gaussian is that the action range is not bounded. +That's why clipping is usually used as a bandage to stay in a valid interval. +A better solution would be to use a squashing function (cf `SAC`) or a Beta distribution (cf `issue #112 `_). + +.. note:: + + This statement is not true for `DDPG` or `TD3` because they don't rely on any probability distribution. + + + +Tips and Tricks when implementing an RL algorithm +================================================= + +When you try to reproduce an RL paper by implementing the algorithm, the `nuts and bolts of RL research `_ +by John Schulman are quite useful (`video `_). + +We *recommend following those steps to have a working RL algorithm*: + +1. Read the original paper several times +2. Read existing implementations (if available) +3. Try to have some "sign of life" on toy problems +4. Validate the implementation by making it run on harder and harder envs (you can compare results against the RL zoo). + You usually need to run hyperparameter optimization for that step. + +You need to be particularly careful about the shape of the different objects you are manipulating (a broadcast mistake will fail silently, cf `issue #75 `_) +and when to stop the gradient propagation. + +A personal pick (by @araffin) for environments with gradual difficulty in RL with continuous actions: + +1. Pendulum (easy to solve) +2. HalfCheetahBullet (medium difficulty with local minima and shaped reward) +3. BipedalWalkerHardcore (if it works on that one, then you can have a cookie) + +and in RL with discrete actions: + +1. CartPole-v1 (easy to be better than a random agent, harder to achieve maximal performance) +2. LunarLander +3. Pong (one of the easiest Atari games) +4. Other Atari games (e.g. Breakout) diff --git a/docs/guide/rl_zoo.rst b/docs/guide/rl_zoo.rst index 1abef343ec..61c4d15b2e 100644 --- a/docs/guide/rl_zoo.rst +++ b/docs/guide/rl_zoo.rst @@ -99,7 +99,7 @@ with a budget of 1000 trials and a maximum of 50000 steps: Colab Notebook: Try it Online! ------------------------------ -You can train agents online using Google `colab notebook `_. +You can train agents online using Google `colab notebook `_. .. note:: diff --git a/docs/index.rst b/docs/index.rst index 0a137c7976..4975e3573a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -39,6 +39,7 @@ This toolset is a fork of OpenAI Baselines, with a major structural refactoring, guide/install guide/quickstart + guide/rl_tips guide/rl guide/algos guide/examples @@ -81,6 +82,7 @@ This toolset is a fork of OpenAI Baselines, with a major structural refactoring, common/cmd_utils common/schedules common/evaluation + common/env_checker ..
toctree:: :maxdepth: 1 diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst index 1c7ef62c44..8597960b87 100644 --- a/docs/misc/changelog.rst +++ b/docs/misc/changelog.rst @@ -5,10 +5,37 @@ Changelog For download links, please look at `Github release page `_. +Pre-Release 3.0.0a0 (WIP) +-------------------------- + +**TensorFlow 2 Version** -Pre-Release 2.9.0a0 (WIP) +Breaking Changes: +^^^^^^^^^^^^^^^^^ +- Drop support for tensorflow 1.x, TensorFlow >=2.0.0 is required +- New dependency: tensorflow-probability>=0.8.0 is now required + +New Features: +^^^^^^^^^^^^^ + +Bug Fixes: +^^^^^^^^^^ + +Deprecations: +^^^^^^^^^^^^^ + +Others: +^^^^^^^ + +Documentation: +^^^^^^^^^^^^^^ + + +Release 2.9.0 (2019-12-20) -------------------------- +*Reproducible results, automatic `VecEnv` wrapping, env checker and more usability improvements* + Breaking Changes: ^^^^^^^^^^^^^^^^^ - The `seed` argument has been moved from `learn()` method to model constructor @@ -16,6 +43,7 @@ Breaking Changes: - `allow_early_resets` of the `Monitor` wrapper now default to `True` - `make_atari_env` now returns a `DummyVecEnv` by default (instead of a `SubprocVecEnv`) this usually improves performance. +- Fix inconsistency of sample type, so that mode/sample function returns tensor of tf.int64 in CategoricalProbabilityDistribution/MultiCategoricalProbabilityDistribution (@seheevic) New Features: ^^^^^^^^^^^^^ @@ -23,10 +51,19 @@ New Features: - Environments are automatically wrapped in a `DummyVecEnv` if needed when passing them to the model constructor - Added `stable_baselines.common.make_vec_env` helper to simplify VecEnv creation - Added `stable_baselines.common.evaluation.evaluate_policy` helper to simplify model evaluation -- `VecNormalize` now supports being pickled and unpickled. +- `VecNormalize` changes: + + - Now supports being pickled and unpickled (@AdamGleave). + - New methods `.normalize_obs(obs)` and `normalize_reward(rews)` apply normalization + to arbitrary observation or rewards without updating statistics (@shwang) + - `.get_original_reward()` returns the unnormalized rewards from the most recent timestep + - `.reset()` now collects observation statistics (used to only apply normalization) + - Add parameter `exploration_initial_eps` to DQN. (@jdossgollin) - Add type checking and PEP 561 compliance. Note: most functions are still not annotated, this will be a gradual process. +- DDPG, TD3 and SAC accept non-symmetric action spaces. (@Antymon) +- Add `check_env` util to check if a custom environment follows the gym interface (@araffin and @justinkterry) Bug Fixes: ^^^^^^^^^^ @@ -34,6 +71,7 @@ Bug Fixes: - Fix a bug in DDPG where `predict` method with `deterministic=False` would fail - Fix a bug in TRPO: mean_losses was not initialized causing the logger to crash when there was no gradients (@MarvineGothic) - Fix a bug in `cmd_util` from API change in recent Gym versions +- Fix a bug in DDPG, TD3 and SAC where warmup and random exploration actions would end up scaled in the replay buffer (@Antymon) Deprecations: ^^^^^^^^^^^^^ @@ -46,6 +84,11 @@ Others: - Add upper bound for Tensorflow version (<2.0.0). 
- Refactored test to remove duplicated code - Add pull request template +- Replaced redundant code in load_results (@jbulow) +- Minor PEP8 fixes in dqn.py (@justinkterry) +- Add a message to the assert in `PPO2` +- Update replay buffer doctring +- Fix `VecEnv` docstrings Documentation: ^^^^^^^^^^^^^^ @@ -59,7 +102,18 @@ Documentation: - Add Pwnagotchi project (@evilsocket) - Fix multiprocessing example (@rusu24edward) - Fix `result_plotter` example +- Add JNRR19 tutorial (by @edbeeching, @hill-a and @araffin) +- Updated notebooks link - Fix typo in algos.rst, "containes" to "contains" (@SyllogismRXS) +- Fix outdated source documentation for load_results +- Add PPO_CPP project (@Antymon) +- Add section on C++ portability of Tensorflow models (@Antymon) +- Update custom env documentation to reflect new gym API for the `close()` method (@justinkterry) +- Update custom env documentation to clarify what step and reset return (@justinkterry) +- Add RL tips and tricks for doing RL experiments +- Corrected lots of typos +- Add spell check to documentation if available + Release 2.8.0 (2019-09-29) -------------------------- @@ -374,7 +428,7 @@ Release 2.1.1 (2018-10-20) -------------------------- - fixed MpiAdam synchronization issue in PPO1 (thanks to @brendenpetersen) issue #50 -- fixed dependency issues (new mujoco-py requires a mujoco licence + gym broke MultiDiscrete space shape) +- fixed dependency issues (new mujoco-py requires a mujoco license + gym broke MultiDiscrete space shape) Release 2.1.0 (2018-10-2) @@ -540,4 +594,4 @@ Thanks to @bjmuld @iambenzo @iandanforth @r7vme @brendenpetersen @huvar @abhiskk @EliasHasle @mrakgr @Bleyddyn @antoine-galataud @junhyeokahn @AdamGleave @keshaviyengar @tperol @XMaster96 @kantneel @Pastafarianist @GerardMaggiolino @PatrickWalter214 @yutingsz @sc420 @Aaahh @billtubbs @Miffyli @dwiel @miguelrass @qxcv @jaberkow @eavelardev @ruifeng96150 @pedrohbtp @srivatsankrishnan @evilsocket -@MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward +@MarvineGothic @jdossgollin @SyllogismRXS @rusu24edward @jbulow @Antymon @seheevic @justinkterry @edbeeching diff --git a/docs/misc/projects.rst b/docs/misc/projects.rst index 60899952cb..44607dfb1b 100644 --- a/docs/misc/projects.rst +++ b/docs/misc/projects.rst @@ -168,3 +168,13 @@ this study are from stable-baselines. | Email: srivatsan@seas.harvard.edu | Github: https://github.com/harvard-edge/quarl | Paper: https://arxiv.org/pdf/1910.01055.pdf + + +PPO_CPP: C++ version of a Deep Reinforcement Learning algorithm PPO +------------------------------------------------------------------- +Executes PPO at C++ level yielding notable execution performance speedups. +Uses Stable Baselines to create a computational graph which is then used for training with custom environments by machine-code-compiled binary. + +| Authors: Szymon Brych +| Email: szymon.brych@gmail.com +| GitHub: https://github.com/Antymon/ppo_cpp diff --git a/docs/modules/her.rst b/docs/modules/her.rst index 8539dfaf9f..e64cd7eda6 100644 --- a/docs/modules/her.rst +++ b/docs/modules/her.rst @@ -93,7 +93,7 @@ Goal Selection Strategies :undoc-members: -Gaol Env Wrapper +Goal Env Wrapper ---------------- .. 
autoclass:: HERGoalEnvWrapper diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt new file mode 100644 index 0000000000..046d206451 --- /dev/null +++ b/docs/spelling_wordlist.txt @@ -0,0 +1,108 @@ +py +env +atari +argparse +Argparse +TensorFlow +feedforward +envs +VecEnv +pretrain +petrained +tf +np +mujoco +cpu +ndarray +ndarrays +timestep +timesteps +stepsize +dataset +adam +fn +normalisation +Kullback +Leibler +boolean +deserialized +pretrained +minibatch +subprocesses +ArgumentParser +Tensorflow +Gaussian +approximator +minibatches +hyperparameters +hyperparameter +vectorized +rl +colab +dataloader +npz +datasets +vf +logits +num +Utils +backpropagate +prepend +NaN +preprocessing +Cloudpickle +async +multiprocess +tensorflow +mlp +cnn +neglogp +tanh +coef +repo +Huber +params +ppo +arxiv +Arxiv +func +DQN +Uhlenbeck +Ornstein +multithread +cancelled +Tensorboard +parallelize +customising +serializable +Multiprocessed +cartpole +toolset +lstm +rescale +ffmpeg +avconv +unnormalized +Github +pre +preprocess +backend +attr +preprocess +Antonin +Raffin +araffin +Homebrew +Numpy +Theano +rollout +kfac +Piecewise +csv +nvidia +visdom +tensorboard +preprocessed +namespace +sklearn +GoalEnv diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh index 9a4bf73e1d..1f577a0cd3 100755 --- a/scripts/build_docker.sh +++ b/scripts/build_docker.sh @@ -4,7 +4,7 @@ CPU_PARENT=ubuntu:16.04 GPU_PARENT=nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 TAG=stablebaselines/stable-baselines -VERSION=v2.9.0 +VERSION=v3.0.0 if [[ ${USE_GPU} == "True" ]]; then PARENT=${GPU_PARENT} @@ -14,4 +14,3 @@ else fi docker build --build-arg PARENT_IMAGE=${PARENT} -t ${TAG}:${VERSION} . - diff --git a/setup.cfg b/setup.cfg index feff06ffcd..8dae34a7a3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,6 +13,7 @@ filterwarnings = ignore:builtin type EagerTensor has no __module__ attribute:DeprecationWarning ignore:The binary mode of fromstring is deprecated:DeprecationWarning ignore::FutureWarning:tensorflow + ignore:the imp module is deprecated # Gym warnings ignore:Parameters to load are deprecated.:DeprecationWarning ignore:the imp module is deprecated in favour of importlib:PendingDeprecationWarning diff --git a/setup.py b/setup.py index 4d14eca23e..9ff0eeddf2 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ install_tf, tf_gpu = False, False try: import tensorflow as tf - if tf.__version__ < LooseVersion('1.8.0'): + if tf.__version__ < LooseVersion('2.0.0'): install_tf = True # check if a gpu version is needed tf_gpu = tf.test.is_gpu_available() @@ -29,7 +29,7 @@ tf_dependency = [] if install_tf: - tf_dependency = ['tensorflow-gpu>=1.8.0,<2.0.0'] if tf_gpu else ['tensorflow>=1.8.0,<2.0.0'] + tf_dependency = ['tensorflow-gpu>=2.0.0'] if tf_gpu else ['tensorflow>=2.0.0'] if tf_gpu: print("A GPU was detected, tensorflow-gpu will be installed") @@ -118,7 +118,8 @@ 'opencv-python', 'numpy', 'pandas', - 'matplotlib' + 'matplotlib', + 'tensorflow-probability>=0.8.0' ] + tf_dependency, extras_require={ 'mpi': [ @@ -146,7 +147,7 @@ license="MIT", long_description=long_description, long_description_content_type='text/markdown', - version="2.9.0a0", + version="3.0.0a0", ) # python setup.py sdist diff --git a/stable_baselines/__init__.py b/stable_baselines/__init__.py index 580e89ab32..a0ca572812 100644 --- a/stable_baselines/__init__.py +++ b/stable_baselines/__init__.py @@ -1,23 +1,24 @@ -from stable_baselines.a2c import A2C -from stable_baselines.acer import ACER -from stable_baselines.acktr import ACKTR -from 
stable_baselines.deepq import DQN -from stable_baselines.her import HER -from stable_baselines.ppo2 import PPO2 -from stable_baselines.td3 import TD3 -from stable_baselines.sac import SAC +# from stable_baselines.a2c import A2C +# from stable_baselines.acer import ACER +# from stable_baselines.acktr import ACKTR +# from stable_baselines.deepq import DQN +# from stable_baselines.her import HER +# from stable_baselines.ppo2 import PPO2 +# from stable_baselines.td3 import TD3 +# from stable_baselines.sac import SAC # Load mpi4py-dependent algorithms only if mpi is installed. try: import mpi4py + import mpi4py.MPI except ImportError: mpi4py = None +# +# if mpi4py is not None: +# from stable_baselines.ddpg import DDPG +# from stable_baselines.gail import GAIL +# from stable_baselines.ppo1 import PPO1 +# from stable_baselines.trpo_mpi import TRPO +# del mpi4py -if mpi4py is not None: - from stable_baselines.ddpg import DDPG - from stable_baselines.gail import GAIL - from stable_baselines.ppo1 import PPO1 - from stable_baselines.trpo_mpi import TRPO -del mpi4py - -__version__ = "2.9.0a0" +__version__ = "3.0.0a0" diff --git a/stable_baselines/a2c/utils.py b/stable_baselines/a2c/utils.py index b4be1964db..8b52e433c7 100644 --- a/stable_baselines/a2c/utils.py +++ b/stable_baselines/a2c/utils.py @@ -493,7 +493,7 @@ def get_by_index(input_tensor, idx): """ assert len(input_tensor.get_shape()) == 2 assert len(idx.get_shape()) == 1 - idx_flattened = tf.range(0, input_tensor.shape[0]) * input_tensor.shape[1] + idx + idx_flattened = tf.range(0, input_tensor.shape[0], dtype=idx.dtype) * input_tensor.shape[1] + idx offset_tensor = tf.gather(tf.reshape(input_tensor, [-1]), # flatten input idx_flattened) # use flattened indices return offset_tensor diff --git a/stable_baselines/acer/acer_simple.py b/stable_baselines/acer/acer_simple.py index c9b3694540..043f353b84 100644 --- a/stable_baselines/acer/acer_simple.py +++ b/stable_baselines/acer/acer_simple.py @@ -75,7 +75,7 @@ class ACER(ActorCriticRLModel): Use `n_cpu_tf_sess` instead. 
:param q_coef: (float) The weight for the loss on the Q value - :param ent_coef: (float) The weight for the entropic loss + :param ent_coef: (float) The weight for the entropy loss :param max_grad_norm: (float) The clipping value for the maximum gradient :param learning_rate: (float) The initial learning rate for the RMS prop optimizer :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', @@ -390,13 +390,13 @@ def custom_getter(getter, name, *args, **kwargs): tf.summary.scalar('rewards', tf.reduce_mean(self.reward_ph)) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate)) tf.summary.scalar('advantage', tf.reduce_mean(adv)) - tf.summary.scalar('action_probabilty', tf.reduce_mean(self.mu_ph)) + tf.summary.scalar('action_probability', tf.reduce_mean(self.mu_ph)) if self.full_tensorboard_log: tf.summary.histogram('rewards', self.reward_ph) tf.summary.histogram('learning_rate', self.learning_rate) tf.summary.histogram('advantage', adv) - tf.summary.histogram('action_probabilty', self.mu_ph) + tf.summary.histogram('action_probability', self.mu_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) else: @@ -638,7 +638,7 @@ def run(self): """ Run a step leaning of the model - :return: ([float], [float], [float], [float], [float], [bool], [float]) + :return: ([float], [float], [int64], [float], [float], [bool], [float]) encoded observation, observations, actions, rewards, mus, dones, masks """ enc_obs = [self.obs] @@ -666,7 +666,7 @@ def run(self): enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0) mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0) - mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) + mb_actions = np.asarray(mb_actions, dtype=np.int64).swapaxes(1, 0) mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) diff --git a/stable_baselines/acktr/acktr.py b/stable_baselines/acktr/acktr.py index 8f447472e3..3499c3f47d 100644 --- a/stable_baselines/acktr/acktr.py +++ b/stable_baselines/acktr/acktr.py @@ -30,7 +30,7 @@ class ACKTR(ActorCriticRLModel): Use `n_cpu_tf_sess` instead. 
:param n_steps: (int) The number of steps to run for each environment - :param ent_coef: (float) The weight for the entropic loss + :param ent_coef: (float) The weight for the entropy loss :param vf_coef: (float) The weight for the loss on the value function :param vf_fisher_coef: (float) The weight for the fisher loss on the value function :param learning_rate: (float) The initial learning rate for the RMS prop optimizer diff --git a/stable_baselines/acktr/kfac.py b/stable_baselines/acktr/kfac.py index 4984b1dba1..4ab208056e 100644 --- a/stable_baselines/acktr/kfac.py +++ b/stable_baselines/acktr/kfac.py @@ -25,7 +25,7 @@ def __init__(self, learning_rate=0.01, momentum=0.9, clip_kl=0.01, kfac_update=2 :param clip_kl: (float) gradient clipping for Kullback-Leibler :param kfac_update: (int) update kfac after kfac_update steps :param stats_accum_iter: (int) how may steps to accumulate stats - :param full_stats_init: (bool) whether or not to fully initalize stats + :param full_stats_init: (bool) whether or not to fully initialize stats :param cold_iter: (int) Cold start learning rate for how many steps :param cold_lr: (float) Cold start learning rate :param async_eigen_decomp: (bool) Use async eigen decomposition diff --git a/stable_baselines/bench/monitor.py b/stable_baselines/bench/monitor.py index 84bcf87aac..fd9542b0ba 100644 --- a/stable_baselines/bench/monitor.py +++ b/stable_baselines/bench/monitor.py @@ -160,13 +160,13 @@ def get_monitor_files(path): def load_results(path): """ - Load results from a given file + Load all Monitor logs from a given directory path matching ``*monitor.csv`` and ``*monitor.json`` - :param path: (str) the path to the log file + :param path: (str) the directory path containing the log file(s) :return: (Pandas DataFrame) the logged data """ # get both csv and (old) json files - monitor_files = (glob(os.path.join(path, "*monitor.json")) + glob(os.path.join(path, "*monitor.csv"))) + monitor_files = (glob(os.path.join(path, "*monitor.json")) + get_monitor_files(path)) if not monitor_files: raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, path)) data_frames = [] diff --git a/stable_baselines/common/__init__.py b/stable_baselines/common/__init__.py index 7087980e7d..d5f61371e6 100644 --- a/stable_baselines/common/__init__.py +++ b/stable_baselines/common/__init__.py @@ -4,6 +4,6 @@ from stable_baselines.common.math_util import discount, discount_with_boundaries, explained_variance, \ explained_variance_2d, flatten_arrays, unflatten_vector from stable_baselines.common.misc_util import zipsame, set_global_seeds, boolean_flag -from stable_baselines.common.base_class import BaseRLModel, ActorCriticRLModel, OffPolicyRLModel, SetVerbosity, \ - TensorboardWriter -from stable_baselines.common.cmd_util import make_vec_env +# from stable_baselines.common.base_class import BaseRLModel, ActorCriticRLModel, OffPolicyRLModel, SetVerbosity, \ +# TensorboardWriter +# from stable_baselines.common.cmd_util import make_vec_env diff --git a/stable_baselines/common/atari_wrappers.py b/stable_baselines/common/atari_wrappers.py index 97f59bb4e0..ee8579b25e 100644 --- a/stable_baselines/common/atari_wrappers.py +++ b/stable_baselines/common/atari_wrappers.py @@ -276,7 +276,7 @@ def __getitem__(self, i): def make_atari(env_id): """ - Create a wrapped atari envrionment + Create a wrapped atari Environment :param env_id: (str) the environment ID :return: (Gym Environment) the wrapped atari environment diff --git 
a/stable_baselines/common/base_class.py b/stable_baselines/common/base_class.py index e90bf9a0b1..1f34c68d0b 100644 --- a/stable_baselines/common/base_class.py +++ b/stable_baselines/common/base_class.py @@ -238,9 +238,9 @@ def _get_pretrain_placeholders(self): """ Return the placeholders needed for the pretraining: - obs_ph: observation placeholder - - actions_ph will be population with an action from the environement + - actions_ph will be population with an action from the environment (from the expert dataset) - - deterministic_actions_ph: e.g., in the case of a gaussian policy, + - deterministic_actions_ph: e.g., in the case of a Gaussian policy, the mean. :return: ((tf.placeholder)) (obs_ph, actions_ph, deterministic_actions_ph) @@ -474,7 +474,7 @@ def load(cls, load_path, env=None, custom_objects=None, **kwargs): Load the model from file :param load_path: (str or file-like) the saved parameter location - :param env: (Gym Envrionment) the new environment to run the loaded model on + :param env: (Gym Environment) the new environment to run the loaded model on (can be None if you only need prediction from a trained model) :param custom_objects: (dict) Dictionary of objects to replace upon loading. If a variable is present in this dictionary as a @@ -862,7 +862,7 @@ def load(cls, load_path, env=None, custom_objects=None, **kwargs): Load the model from file :param load_path: (str or file-like) the saved parameter location - :param env: (Gym Envrionment) the new environment to run the loaded model on + :param env: (Gym Environment) the new environment to run the loaded model on (can be None if you only need prediction from a trained model) :param custom_objects: (dict) Dictionary of objects to replace upon loading. If a variable is present in this dictionary as a @@ -945,7 +945,7 @@ def load(cls, load_path, env=None, custom_objects=None, **kwargs): Load the model from file :param load_path: (str or file-like) the saved parameter location - :param env: (Gym Envrionment) the new environment to run the loaded model on + :param env: (Gym Environment) the new environment to run the loaded model on (can be None if you only need prediction from a trained model) :param custom_objects: (dict) Dictionary of objects to replace upon loading. If a variable is present in this dictionary as a diff --git a/stable_baselines/common/cmd_util.py b/stable_baselines/common/cmd_util.py index 2883821019..c5ff663391 100644 --- a/stable_baselines/common/cmd_util.py +++ b/stable_baselines/common/cmd_util.py @@ -25,7 +25,7 @@ def make_vec_env(env_id, n_envs=1, seed=None, start_index=0, :param env_id: (str or Type[gym.Env]) the environment ID or the environment class :param n_envs: (int) the number of environments you wish to have in parallel - :param seed: (int) the inital seed for the random number generator + :param seed: (int) the initial seed for the random number generator :param start_index: (int) start rank index :param monitor_dir: (str) Path to a folder where the monitor files will be saved. 
If None, no file will be written, however, the env will still be wrapped @@ -80,7 +80,7 @@ def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, :param env_id: (str) the environment ID :param num_env: (int) the number of environment you wish to have in subprocesses - :param seed: (int) the inital seed for RNG + :param seed: (int) the initial seed for RNG :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function :param start_index: (int) start rank index :param allow_early_resets: (bool) allows early reset of the environment @@ -116,7 +116,7 @@ def make_mujoco_env(env_id, seed, allow_early_resets=True): Create a wrapped, monitored gym.Env for MuJoCo. :param env_id: (str) the environment ID - :param seed: (int) the inital seed for RNG + :param seed: (int) the initial seed for RNG :param allow_early_resets: (bool) allows early reset of the environment :return: (Gym Environment) The mujoco environment """ @@ -132,7 +132,7 @@ def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True): Create a wrapped, monitored gym.Env for MuJoCo. :param env_id: (str) the environment ID - :param seed: (int) the inital seed for RNG + :param seed: (int) the initial seed for RNG :param rank: (int) the rank of the environment (for logging) :param allow_early_resets: (bool) allows early reset of the environment :return: (Gym Environment) The robotic environment diff --git a/stable_baselines/common/distributions.py b/stable_baselines/common/distributions.py index 2245181e52..b64e0a5b5c 100644 --- a/stable_baselines/common/distributions.py +++ b/stable_baselines/common/distributions.py @@ -17,7 +17,7 @@ def flatparam(self): """ Return the direct probabilities - :return: ([float]) the probabilites + :return: ([float]) the probabilities """ raise NotImplementedError @@ -41,7 +41,7 @@ def neglogp(self, x): def kl(self, other): """ - Calculates the Kullback-Leibler divergence from the given probabilty distribution + Calculates the Kullback-Leibler divergence from the given probability distribution :param other: ([float]) the distribution to compare with :return: (float) the KL divergence of the two distributions @@ -50,7 +50,7 @@ def kl(self, other): def entropy(self): """ - Returns shannon's entropy of the probability + Returns Shannon's entropy of the probability :return: (float) the entropy """ @@ -58,7 +58,7 @@ def entropy(self): def sample(self): """ - returns a sample from the probabilty distribution + returns a sample from the probability distribution :return: (Tensorflow Tensor) the stochastic action """ @@ -103,8 +103,8 @@ def proba_distribution_from_latent(self, pi_latent_vector, vf_latent_vector, ini :param pi_latent_vector: ([float]) the latent pi values :param vf_latent_vector: ([float]) the latent vf values - :param init_scale: (float) the inital scale of the distribution - :param init_bias: (float) the inital bias of the distribution + :param init_scale: (float) the initial scale of the distribution + :param init_bias: (float) the initial bias of the distribution :return: (ProbabilityDistribution) the instance of the ProbabilityDistribution associated """ raise NotImplementedError @@ -178,7 +178,7 @@ def sample_shape(self): return [] def sample_dtype(self): - return tf.int32 + return tf.int64 class MultiCategoricalProbabilityDistributionType(ProbabilityDistributionType): @@ -211,13 +211,13 @@ def sample_shape(self): return [len(self.n_vec)] def sample_dtype(self): - return tf.int32 + return tf.int64 class DiagGaussianProbabilityDistributionType(ProbabilityDistributionType): 
def __init__(self, size): """ - The probability distribution type for multivariate gaussian input + The probability distribution type for multivariate Gaussian input :param size: (int) the number of dimensions of the multivariate gaussian """ @@ -255,9 +255,9 @@ def sample_dtype(self): class BernoulliProbabilityDistributionType(ProbabilityDistributionType): def __init__(self, size): """ - The probability distribution type for bernoulli input + The probability distribution type for Bernoulli input - :param size: (int) the number of dimensions of the bernoulli distribution + :param size: (int) the number of dimensions of the Bernoulli distribution """ self.size = size @@ -353,7 +353,7 @@ def flatparam(self): return self.flat def mode(self): - return tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) + return tf.stack([p.mode() for p in self.categoricals], axis=-1) def neglogp(self, x): return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x, axis=-1))]) @@ -365,7 +365,7 @@ def entropy(self): return tf.add_n([p.entropy() for p in self.categoricals]) def sample(self): - return tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) + return tf.stack([p.sample() for p in self.categoricals], axis=-1) @classmethod def fromflat(cls, flat): @@ -381,9 +381,9 @@ def fromflat(cls, flat): class DiagGaussianProbabilityDistribution(ProbabilityDistribution): def __init__(self, flat): """ - Probability distributions from multivariate gaussian input + Probability distributions from multivariate Gaussian input - :param flat: ([float]) the multivariate gaussian input data + :param flat: ([float]) the multivariate Gaussian input data """ self.flat = flat mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat) @@ -421,10 +421,10 @@ def sample(self): @classmethod def fromflat(cls, flat): """ - Create an instance of this from new multivariate gaussian input + Create an instance of this from new multivariate Gaussian input - :param flat: ([float]) the multivariate gaussian input data - :return: (ProbabilityDistribution) the instance from the given multivariate gaussian input data + :param flat: ([float]) the multivariate Gaussian input data + :return: (ProbabilityDistribution) the instance from the given multivariate Gaussian input data """ return cls(flat) @@ -432,9 +432,9 @@ def fromflat(cls, flat): class BernoulliProbabilityDistribution(ProbabilityDistribution): def __init__(self, logits): """ - Probability distributions from bernoulli input + Probability distributions from Bernoulli input - :param logits: ([float]) the bernoulli input data + :param logits: ([float]) the Bernoulli input data """ self.logits = logits self.probabilities = tf.sigmoid(logits) @@ -468,10 +468,10 @@ def sample(self): @classmethod def fromflat(cls, flat): """ - Create an instance of this from new bernoulli input + Create an instance of this from new Bernoulli input - :param flat: ([float]) the bernoulli input data - :return: (ProbabilityDistribution) the instance from the given bernoulli input data + :param flat: ([float]) the Bernoulli input data + :return: (ProbabilityDistribution) the instance from the given Bernoulli input data """ return cls(flat) @@ -481,7 +481,7 @@ def make_proba_dist_type(ac_space): return an instance of ProbabilityDistributionType for the correct type of action space :param ac_space: (Gym Space) the input action space - :return: (ProbabilityDistributionType) the approriate instance of a 
ProbabilityDistributionType + :return: (ProbabilityDistributionType) the appropriate instance of a ProbabilityDistributionType """ if isinstance(ac_space, spaces.Box): assert len(ac_space.shape) == 1, "Error: the action space must be a vector" diff --git a/stable_baselines/common/env_checker.py b/stable_baselines/common/env_checker.py new file mode 100644 index 0000000000..6c6dd0fcbd --- /dev/null +++ b/stable_baselines/common/env_checker.py @@ -0,0 +1,222 @@ +import warnings +from typing import Union + +import gym +from gym import spaces +import numpy as np + +from stable_baselines.common.vec_env import DummyVecEnv, VecCheckNan + + +def _enforce_array_obs(observation_space: spaces.Space) -> bool: + """ + Whether to check that the returned observation is a numpy array + it is not mandatory for `Dict` and `Tuple` spaces. + """ + return not isinstance(observation_space, (spaces.Dict, spaces.Tuple)) + + +def _check_image_input(observation_space: spaces.Box) -> None: + """ + Check that the input will be compatible with Stable-Baselines + when the observation is apparently an image. + """ + if observation_space.dtype != np.uint8: + warnings.warn("It seems that your observation is an image but the `dtype` " + "of your observation_space is not `np.uint8`. " + "If your observation is not an image, we recommend you to flatten the observation " + "to have only a 1D vector") + + if np.any(observation_space.low != 0) or np.any(observation_space.high != 255): + warnings.warn("It seems that your observation space is an image but the " + "upper and lower bounds are not in [0, 255]. " + "Because the CNN policy normalize automatically the observation " + "you may encounter issue if the values are not in that range." + ) + + if observation_space.shape[0] < 36 or observation_space.shape[1] < 36: + warnings.warn("The minimal resolution for an image is 36x36 for the default CnnPolicy. " + "You might need to use a custom `cnn_extractor` " + "cf https://stable-baselines.readthedocs.io/en/master/guide/custom_policy.html") + + +def _check_unsupported_obs_spaces(env: gym.Env, observation_space: spaces.Space) -> None: + """Emit warnings when the observation space used is not supported by Stable-Baselines.""" + + if isinstance(observation_space, spaces.Dict) and not isinstance(env, gym.GoalEnv): + warnings.warn("The observation space is a Dict but the environment is not a gym.GoalEnv " + "(cf https://github.com/openai/gym/blob/master/gym/core.py), " + "this is currently not supported by Stable Baselines " + "(cf https://github.com/hill-a/stable-baselines/issues/133), " + "you will need to use a custom policy. " + ) + + if isinstance(observation_space, spaces.Tuple): + warnings.warn("The observation space is a Tuple," + "this is currently not supported by Stable Baselines " + "(cf https://github.com/hill-a/stable-baselines/issues/133), " + "you will need to flatten the observation and maybe use a custom policy. " + ) + + +def _check_nan(env: gym.Env) -> None: + """Check for Inf and NaN using the VecWrapper.""" + vec_env = VecCheckNan(DummyVecEnv([lambda: env])) + for _ in range(10): + action = [env.action_space.sample()] + _, _, _, _ = vec_env.step(action) + + +def _check_obs(obs: Union[tuple, dict, np.ndarray, int], + observation_space: spaces.Space, + method_name: str) -> None: + """ + Check that the observation returned by the environment + correspond to the declared one. 
+ """ + if not isinstance(observation_space, spaces.Tuple): + assert not isinstance(obs, tuple), ("The observation returned by the `{}()` " + "method should be a single value, not a tuple".format(method_name)) + + # The check for a GoalEnv is done by the base class + if isinstance(observation_space, spaces.Discrete): + assert isinstance(obs, int), "The observation returned by `{}()` method must be an int".format(method_name) + elif _enforce_array_obs(observation_space): + assert isinstance(obs, np.ndarray), ("The observation returned by `{}()` " + "method must be a numpy array".format(method_name)) + + assert observation_space.contains(obs), ("The observation returned by the `{}()` " + "method does not match the given observation space".format(method_name)) + + +def _check_returned_values(env: gym.Env, observation_space: spaces.Space, action_space: spaces.Space) -> None: + """ + Check the returned values by the env when calling `.reset()` or `.step()` methods. + """ + # because env inherits from gym.Env, we assume that `reset()` and `step()` methods exists + obs = env.reset() + + _check_obs(obs, observation_space, 'reset') + + # Sample a random action + action = action_space.sample() + data = env.step(action) + + assert len(data) == 4, "The `step()` method must return four values: obs, reward, done, info" + + # Unpack + obs, reward, done, info = data + + _check_obs(obs, observation_space, 'step') + + # We also allow int because the reward will be cast to float + assert isinstance(reward, (float, int)), "The reward returned by `step()` must be a float" + assert isinstance(done, bool), "The `done` signal must be a boolean" + assert isinstance(info, dict), "The `info` returned by `step()` must be a python dictionary" + + if isinstance(env, gym.GoalEnv): + # For a GoalEnv, the keys are checked at reset + assert reward == env.compute_reward(obs['achieved_goal'], obs['desired_goal'], info) + + +def _check_spaces(env: gym.Env) -> None: + """ + Check that the observation and action spaces are defined + and inherit from gym.spaces.Space. + """ + # Helper to link to the code, because gym has no proper documentation + gym_spaces = " cf https://github.com/openai/gym/blob/master/gym/spaces/" + + assert hasattr(env, 'observation_space'), "You must specify an observation space (cf gym.spaces)" + gym_spaces + assert hasattr(env, 'action_space'), "You must specify an action space (cf gym.spaces)" + gym_spaces + + assert isinstance(env.observation_space, + spaces.Space), "The observation space must inherit from gym.spaces" + gym_spaces + assert isinstance(env.action_space, spaces.Space), "The action space must inherit from gym.spaces" + gym_spaces + + +def _check_render(env: gym.Env, warn=True, headless=False) -> None: + """ + Check the declared render modes and the `render()`/`close()` + method of the environment. + + :param env: (gym.Env) The environment to check + :param warn: (bool) Whether to output additional warnings + :param headless: (bool) Whether to disable render modes + that require a graphical interface. False by default. 
+ """ + render_modes = env.metadata.get('render.modes') + if render_modes is None: + if warn: + warnings.warn("No render modes was declared in the environment " + " (env.metadata['render.modes'] is None or not defined), " + "you may have trouble when calling `.render()`") + + else: + # Don't check render mode that require a + # graphical interface (useful for CI) + if headless and 'human' in render_modes: + render_modes.remove('human') + # Check all declared render modes + for render_mode in render_modes: + env.render(mode=render_mode) + env.close() + + +def check_env(env: gym.Env, warn=True, skip_render_check=True) -> None: + """ + Check that an environment follows Gym API. + This is particularly useful when using a custom environment. + Please take a look at https://github.com/openai/gym/blob/master/gym/core.py + for more information about the API. + + It also optionally check that the environment is compatible with Stable-Baselines. + + :param env: (gym.Env) The Gym environment that will be checked + :param warn: (bool) Whether to output additional warnings + mainly related to the interaction with Stable Baselines + :param skip_render_check: (bool) Whether to skip the checks for the render method. + True by default (useful for the CI) + """ + assert isinstance(env, gym.Env), ("You environment must inherit from gym.Env class " + " cf https://github.com/openai/gym/blob/master/gym/core.py") + + # ============= Check the spaces (observation and action) ================ + _check_spaces(env) + + # Define aliases for convenience + observation_space = env.observation_space + action_space = env.action_space + + # Warn the user if needed. + # A warning means that the environment may run but not work properly with Stable Baselines algorithms + if warn: + _check_unsupported_obs_spaces(env, observation_space) + + # If image, check the low and high values, the type and the number of channels + # and the shape (minimal value) + if isinstance(observation_space, spaces.Box) and len(observation_space.shape) == 3: + _check_image_input(observation_space) + + if isinstance(observation_space, spaces.Box) and len(observation_space.shape) not in [1, 3]: + warnings.warn("Your observation has an unconventional shape (neither an image, nor a 1D vector). " + "We recommend you to flatten the observation " + "to have only a 1D vector") + + # Check for the action space, it may lead to hard-to-debug issues + if (isinstance(action_space, spaces.Box) and + (np.abs(action_space.low) != np.abs(action_space.high) + or np.abs(action_space.low) > 1 or np.abs(action_space.high) > 1)): + warnings.warn("We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) " + "cf https://stable-baselines.readthedocs.io/en/master/guide/rl_tips.html") + + # ============ Check the returned values =============== + _check_returned_values(env, observation_space, action_space) + + # ==== Check the render method and the declared render modes ==== + if not skip_render_check: + _check_render(env, warn=warn) + + # The check only works with numpy arrays + if _enforce_array_obs(observation_space): + _check_nan(env) diff --git a/stable_baselines/common/evaluation.py b/stable_baselines/common/evaluation.py index a8fb7887a3..67e10d06c5 100644 --- a/stable_baselines/common/evaluation.py +++ b/stable_baselines/common/evaluation.py @@ -15,7 +15,7 @@ def evaluate_policy(model, env, n_eval_episodes=10, deterministic=True, this must contain only one environment. 
:param n_eval_episodes: (int) Number of episode to evaluate the agent :param deterministic: (bool) Whether to use deterministic or stochastic actions - :param render: (bool) Whether to render the environement or not + :param render: (bool) Whether to render the environment or not :param callback: (callable) callback function to do additional checks, called after each step. :param reward_threshold: (float) Minimum expected reward per episode, diff --git a/stable_baselines/common/identity_env.py b/stable_baselines/common/identity_env.py index d8152207c7..e182d47cf3 100644 --- a/stable_baselines/common/identity_env.py +++ b/stable_baselines/common/identity_env.py @@ -5,7 +5,7 @@ class IdentityEnv(Env): - def __init__(self, dim, ep_length=100): + def __init__(self, dim=1, ep_length=100): """ Identity environment for testing purposes diff --git a/stable_baselines/common/math_util.py b/stable_baselines/common/math_util.py index 81bea2ab1a..05c4cd72a5 100644 --- a/stable_baselines/common/math_util.py +++ b/stable_baselines/common/math_util.py @@ -61,7 +61,7 @@ def flatten_arrays(arrs): flattens a list of arrays down to 1D :param arrs: ([np.ndarray]) arrays - :return: (np.ndarray) 1D flattend array + :return: (np.ndarray) 1D flattened array """ return np.concatenate([arr.flat for arr in arrs]) @@ -101,3 +101,29 @@ def discount_with_boundaries(rewards, episode_starts, gamma): for step in range(n_samples - 2, -1, -1): discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1]) return discounted_rewards + + +def scale_action(action_space, action): + """ + Rescale the action from [low, high] to [-1, 1] + (no need for symmetric action space) + + :param action_space: (gym.spaces.box.Box) + :param action: (np.ndarray) + :return: (np.ndarray) + """ + low, high = action_space.low, action_space.high + return 2.0 * ((action - low) / (high - low)) - 1.0 + + +def unscale_action(action_space, scaled_action): + """ + Rescale the action from [-1, 1] to [low, high] + (no need for symmetric action space) + + :param action_space: (gym.spaces.box.Box) + :param action: (np.ndarray) + :return: (np.ndarray) + """ + low, high = action_space.low, action_space.high + return low + (0.5 * (scaled_action + 1.0) * (high - low)) diff --git a/stable_baselines/common/noise.py b/stable_baselines/common/noise.py index 446aced590..caecc55afb 100644 --- a/stable_baselines/common/noise.py +++ b/stable_baselines/common/noise.py @@ -55,7 +55,7 @@ def reset(self): class NormalActionNoise(ActionNoise): """ - A gaussian action noise + A Gaussian action noise :param mean: (float) the mean value of the noise :param sigma: (float) the scale of the noise (std here) @@ -73,7 +73,7 @@ def __repr__(self): class OrnsteinUhlenbeckActionNoise(ActionNoise): """ - A Ornstein Uhlenbeck action noise, this is designed to aproximate brownian motion with friction. + A Ornstein Uhlenbeck action noise, this is designed to approximate brownian motion with friction. 
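The two helpers added to `math_util.py` above are inverses of each other. A quick numpy restatement of the same formulas (kept standalone so it can be run without the package; the bounds below are only illustrative) shows the round trip:

```python
import numpy as np
from gym import spaces

action_space = spaces.Box(low=np.array([-2.0, 0.0]),
                          high=np.array([2.0, 10.0]), dtype=np.float32)
low, high = action_space.low, action_space.high

action = np.array([1.0, 7.5], dtype=np.float32)

# scale_action: [low, high] -> [-1, 1]
scaled = 2.0 * ((action - low) / (high - low)) - 1.0
# unscale_action: [-1, 1] -> [low, high]
unscaled = low + 0.5 * (scaled + 1.0) * (high - low)

assert np.allclose(unscaled, action)
```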
Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab diff --git a/stable_baselines/common/policies.py b/stable_baselines/common/policies.py index d9e16cd092..eced062f64 100644 --- a/stable_baselines/common/policies.py +++ b/stable_baselines/common/policies.py @@ -101,7 +101,7 @@ class BasePolicy(ABC): :param reuse: (bool) If the policy is reusable or not :param scale: (bool) whether or not to scale the input :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectivly + and the processed observation placeholder respectively :param add_action_ph: (bool) whether or not to create an action placeholder """ @@ -171,9 +171,9 @@ def _kwargs_check(feature_extraction, kwargs): # When using policy_kwargs parameter on model creation, # all keywords arguments must be consumed by the policy constructor except # the ones for the cnn_extractor network (cf nature_cnn()), where the keywords arguments - # are not passed explicitely (using **kwargs to forward the arguments) + # are not passed explicitly (using **kwargs to forward the arguments) # that's why there should be not kwargs left when using the mlp_extractor - # (in that case the keywords arguments are passed explicitely) + # (in that case the keywords arguments are passed explicitly) if feature_extraction == 'mlp' and len(kwargs) > 0: raise ValueError("Unknown keywords for policy: {}".format(kwargs)) diff --git a/stable_baselines/common/schedules.py b/stable_baselines/common/schedules.py index 57f4013fb8..f20b7887a3 100644 --- a/stable_baselines/common/schedules.py +++ b/stable_baselines/common/schedules.py @@ -53,7 +53,7 @@ class PiecewiseSchedule(Schedule): Piecewise schedule. :param endpoints: ([(int, int)]) - list of pairs `(time, value)` meanining that schedule should output + list of pairs `(time, value)` meaning that schedule should output `value` when `t==time`. All the values for time must be sorted in an increasing order. When t is between two times, e.g. `(time_a, value_a)` and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs @@ -64,7 +64,7 @@ class PiecewiseSchedule(Schedule): to the `endpoints`. Alpha is the fraction of distance from left endpoint to right endpoint that t has covered. See linear_interpolation for example. :param outside_value: (float) - if the value is requested outside of all the intervals sepecified in + if the value is requested outside of all the intervals specified in `endpoints` this value is returned. If None then AssertionError is raised when outside value is requested. 
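A short usage sketch of the piecewise schedule documented above. It assumes the constructor takes the `endpoints` and `outside_value` arguments described in the docstring and that the class exposes the same `value(step)` accessor as the other schedules in this module; treat the exact call signature as an assumption rather than a reference.

```python
from stable_baselines.common.schedules import PiecewiseSchedule

# Learning rate: 1e-3 until step 1000, then annealed (linear interpolation by default)
# down to 1e-4 at step 10000, and `outside_value` afterwards.
lr_schedule = PiecewiseSchedule(endpoints=[(0, 1e-3), (1000, 1e-3), (10000, 1e-4)],
                                outside_value=1e-4)

for step in (0, 500, 5500, 20000):
    print(step, lr_schedule.value(step))
```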
""" diff --git a/stable_baselines/common/tf_util.py b/stable_baselines/common/tf_util.py index 6ec362b140..e737fa4a83 100644 --- a/stable_baselines/common/tf_util.py +++ b/stable_baselines/common/tf_util.py @@ -30,8 +30,8 @@ def huber_loss(tensor, delta=1.0): Reference: https://en.wikipedia.org/wiki/Huber_loss :param tensor: (TensorFlow Tensor) the input value - :param delta: (float) huber loss delta value - :return: (TensorFlow Tensor) huber loss output + :param delta: (float) Huber loss delta value + :return: (TensorFlow Tensor) Huber loss output """ return tf.where( tf.abs(tensor) < delta, @@ -80,7 +80,7 @@ def single_threaded_session(make_default=False, graph=None): def in_session(func): """ - wrappes a function so that it is in a TensorFlow Session + Wraps a function so that it is in a TensorFlow Session :param func: (function) the function to wrap :return: (function) @@ -241,7 +241,7 @@ def flatgrad(loss, var_list, clip_norm=None): :param loss: (float) the loss value :param var_list: ([TensorFlow Tensor]) the variables :param clip_norm: (float) clip the gradients (disabled if None) - :return: ([TensorFlow Tensor]) flattend gradient + :return: ([TensorFlow Tensor]) flattened gradient """ grads = tf.gradients(loss, var_list) if clip_norm is not None: diff --git a/stable_baselines/common/vec_env/dummy_vec_env.py b/stable_baselines/common/vec_env/dummy_vec_env.py index c5ee1d7def..2fb9d7b962 100644 --- a/stable_baselines/common/vec_env/dummy_vec_env.py +++ b/stable_baselines/common/vec_env/dummy_vec_env.py @@ -12,7 +12,8 @@ class DummyVecEnv(VecEnv): multiprocess or multithread outweighs the environment computation time. This can also be used for RL methods that require a vectorized environment, but that you want a single environments to train with. - :param env_fns: ([Gym Environment]) the list of environments to vectorize + :param env_fns: ([callable]) A list of functions that will create the environments + (each callable returns a `Gym.Env` instance when called). """ def __init__(self, env_fns): diff --git a/stable_baselines/common/vec_env/subproc_vec_env.py b/stable_baselines/common/vec_env/subproc_vec_env.py index 0fc3aae84b..2cc451a298 100644 --- a/stable_baselines/common/vec_env/subproc_vec_env.py +++ b/stable_baselines/common/vec_env/subproc_vec_env.py @@ -62,7 +62,8 @@ class SubprocVecEnv(VecEnv): ``if __name__ == "__main__":`` block. For more information, see the multiprocessing documentation. - :param env_fns: ([Gym Environment]) Environments to run in subprocesses + :param env_fns: ([callable]) A list of functions that will create the environments + (each callable returns a `Gym.Env` instance when called). :param start_method: (str) method used to start the subprocesses. Must be one of the methods returned by multiprocessing.get_all_start_methods(). Defaults to 'forkserver' on available platforms, and 'spawn' otherwise. diff --git a/stable_baselines/common/vec_env/vec_normalize.py b/stable_baselines/common/vec_env/vec_normalize.py index dc93c5ecbf..6ab308b13f 100644 --- a/stable_baselines/common/vec_env/vec_normalize.py +++ b/stable_baselines/common/vec_env/vec_normalize.py @@ -39,7 +39,8 @@ def __init__(self, venv, training=True, norm_obs=True, norm_reward=True, self.training = training self.norm_obs = norm_obs self.norm_reward = norm_reward - self.old_obs = np.array([]) + self.old_obs = None + self.old_rews = None def __getstate__(self): """ @@ -88,48 +89,69 @@ def step_wait(self): where 'news' is a boolean vector indicating whether each element is new. 
""" obs, rews, news, infos = self.venv.step_wait() - self.ret = self.ret * self.gamma + rews self.old_obs = obs - obs = self._normalize_observation(obs) - if self.norm_reward: - if self.training: - self.ret_rms.update(self.ret) - rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward) + self.old_rews = rews + + if self.training: + self.obs_rms.update(obs) + obs = self.normalize_obs(obs) + + if self.training: + self._update_reward(rews) + rews = self.normalize_reward(rews) + self.ret[news] = 0 return obs, rews, news, infos - def _normalize_observation(self, obs): + def _update_reward(self, reward: np.ndarray) -> None: + """Update reward normalization statistics.""" + self.ret = self.ret * self.gamma + reward + self.ret_rms.update(self.ret) + + def normalize_obs(self, obs: np.ndarray) -> np.ndarray: """ - :param obs: (numpy tensor) + Normalize observations using this VecNormalize's observations statistics. + Calling this method does not update statistics. """ if self.norm_obs: - if self.training: - self.obs_rms.update(obs) - obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), -self.clip_obs, + obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), + -self.clip_obs, self.clip_obs) - return obs - else: - return obs + return obs + + def normalize_reward(self, reward: np.ndarray) -> np.ndarray: + """ + Normalize rewards using this VecNormalize's rewards statistics. + Calling this method does not update statistics. + """ + if self.norm_reward: + reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon), + -self.clip_reward, self.clip_reward) + return reward - def get_original_obs(self): + def get_original_obs(self) -> np.ndarray: """ - returns the unnormalized observation + Returns an unnormalized version of the observations from the most recent + step or reset. + """ + return self.old_obs.copy() - :return: (numpy float) + def get_original_reward(self) -> np.ndarray: + """ + Returns an unnormalized version of the rewards from the most recent step. 
""" - return self.old_obs + return self.old_rews.copy() def reset(self): """ Reset all environments """ obs = self.venv.reset() - if len(np.array(obs).shape) == 1: # for when num_cpu is 1 - self.old_obs = [obs] - else: - self.old_obs = obs + self.old_obs = obs self.ret = np.zeros(self.num_envs) - return self._normalize_observation(obs) + if self.training: + self._update_reward(self.ret) + return self.normalize_obs(obs) @staticmethod def load(load_path, venv): diff --git a/stable_baselines/ddpg/ddpg.py b/stable_baselines/ddpg/ddpg.py index 3314044ca2..94896d2faf 100644 --- a/stable_baselines/ddpg/ddpg.py +++ b/stable_baselines/ddpg/ddpg.py @@ -15,6 +15,7 @@ from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter from stable_baselines.common.vec_env import VecEnv from stable_baselines.common.mpi_adam import MpiAdam +from stable_baselines.common.math_util import unscale_action, scale_action from stable_baselines.ddpg.policies import DDPGPolicy from stable_baselines.common.mpi_running_mean_std import RunningMeanStd from stable_baselines.a2c.utils import total_episode_reward_logger @@ -127,7 +128,7 @@ def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verb if var in get_perturbable_vars(actor): if verbose >= 2: logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) - # Add gaussian noise to the parameter + # Add Gaussian noise to the parameter updates.append(tf.assign(perturbed_var, var + tf.random_normal(tf.shape(var), mean=0., stddev=param_noise_stddev))) else: @@ -156,7 +157,7 @@ class DDPG(OffPolicyRLModel): :param eval_env: (Gym Environment) the evaluation environment (can be None) :param nb_train_steps: (int) the number of training steps :param nb_rollout_steps: (int) the number of rollout steps - :param nb_eval_steps: (int) the number of evalutation steps + :param nb_eval_steps: (int) the number of evaluation steps :param param_noise: (AdaptiveParamNoiseSpec) the parameter noise type (can be None) :param action_noise: (ActionNoise) the action noise type (can be None) :param param_noise_adaption_interval: (int) apply param noise every N steps @@ -174,7 +175,7 @@ class DDPG(OffPolicyRLModel): :param clip_norm: (float) clip the gradients (disabled if None) :param reward_scale: (float) the value the reward should be scaled by :param render: (bool) enable rendering of the environment - :param render_eval: (bool) enable rendering of the evalution environment + :param render_eval: (bool) enable rendering of the evaluation environment :param memory_limit: (int) the max number of transitions to store, size of the replay buffer .. deprecated:: 2.6.0 @@ -312,7 +313,7 @@ def __init__(self, policy, env, gamma=0.99, memory_policy=None, eval_env=None, n def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale - deterministic_action = self.actor_tf * np.abs(self.action_space.low) + deterministic_action = unscale_action(self.action_space, self.actor_tf) return policy.obs_ph, self.actions, deterministic_action def setup_model(self): @@ -818,8 +819,7 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D self.tb_seen_steps = [] rank = MPI.COMM_WORLD.Get_rank() - # we assume symmetric actions. 
- assert np.all(np.abs(self.env.action_space.low) == self.env.action_space.high) + if self.verbose >= 2: logger.log('Using agent with the following configuration:') logger.log(str(self.__dict__.items())) @@ -870,13 +870,17 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D self.env.render() # Randomly sample actions from a uniform distribution - # with a probabilty self.random_exploration (used in HER + DDPG) + # with a probability self.random_exploration (used in HER + DDPG) if np.random.rand() < self.random_exploration: - rescaled_action = action = self.action_space.sample() + # actions sampled from action space are from range specific to the environment + # but algorithm operates on tanh-squashed actions therefore simple scaling is used + unscaled_action = self.action_space.sample() + action = scale_action(self.action_space, unscaled_action) else: - rescaled_action = action * np.abs(self.action_space.low) + # inferred actions need to be transformed to environment action_space before stepping + unscaled_action = unscale_action(self.action_space, action) - new_obs, reward, done, info = self.env.step(rescaled_action) + new_obs, reward, done, info = self.env.step(unscaled_action) if writer is not None: ep_rew = np.array([reward]).reshape((1, -1)) @@ -955,8 +959,8 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D return self eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True) - eval_obs, eval_r, eval_done, _ = self.eval_env.step(eval_action * - np.abs(self.action_space.low)) + unscaled_action = unscale_action(self.action_space, eval_action) + eval_obs, eval_r, eval_done, _ = self.eval_env.step(unscaled_action) if self.render_eval: self.eval_env.render() eval_episode_reward += eval_r @@ -1041,7 +1045,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True): observation = observation.reshape((-1,) + self.observation_space.shape) actions, _, = self._policy(observation, apply_noise=not deterministic, compute_q=False) actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape - actions = actions * np.abs(self.action_space.low) # scale the output for the prediction + actions = unscale_action(self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] diff --git a/stable_baselines/ddpg/policies.py b/stable_baselines/ddpg/policies.py index 00a4c030bc..37e0e26e8b 100644 --- a/stable_baselines/ddpg/policies.py +++ b/stable_baselines/ddpg/policies.py @@ -23,7 +23,6 @@ def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=Fals super(DDPGPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale, add_action_ph=True) assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box" - assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric" self.qvalue_fn = None self.policy = None @@ -32,7 +31,7 @@ def make_actor(self, obs=None, reuse=False, scope="pi"): creates an actor object :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) - :param reuse: (bool) whether or not to resue parameters + :param reuse: (bool) whether or not to reuse parameters :param scope: (str) the scope name of the actor :return: (TensorFlow Tensor) the output tensor """ @@ -44,7 +43,7 @@ def make_critic(self, obs=None, action=None, reuse=False, 
scope="qf"): :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) :param action: (TensorFlow Tensor) The action placeholder (can be None for default placeholder) - :param reuse: (bool) whether or not to resue parameters + :param reuse: (bool) whether or not to reuse parameters :param scope: (str) the scope name of the critic :return: (TensorFlow Tensor) the output tensor """ diff --git a/stable_baselines/deepq/build_graph.py b/stable_baselines/deepq/build_graph.py index b6a9d39589..51453ec6e5 100644 --- a/stable_baselines/deepq/build_graph.py +++ b/stable_baselines/deepq/build_graph.py @@ -134,7 +134,7 @@ def build_act(q_func, ob_space, ac_space, stochastic_ph, update_eps_ph, sess): :param sess: (TensorFlow session) The current TensorFlow session :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) act function to select and action given observation (See the top of the file for details), - A tuple containing the observation placeholder and the processed observation placeholder respectivly. + A tuple containing the observation placeholder and the processed observation placeholder respectively. """ eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) @@ -177,7 +177,7 @@ def build_act_with_param_noise(q_func, ob_space, ac_space, stochastic_ph, update is used by default. :return: (function (TensorFlow Tensor, bool, float): TensorFlow Tensor, (TensorFlow Tensor, TensorFlow Tensor) act function to select and action given observation (See the top of the file for details), - A tuple containing the observation placeholder and the processed observation placeholder respectivly. + A tuple containing the observation placeholder and the processed observation placeholder respectively. """ if param_noise_filter_func is None: param_noise_filter_func = default_param_noise_filter diff --git a/stable_baselines/deepq/dqn.py b/stable_baselines/deepq/dqn.py index d4457f2984..d85366e698 100644 --- a/stable_baselines/deepq/dqn.py +++ b/stable_baselines/deepq/dqn.py @@ -178,7 +178,6 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER" self.replay_buffer = replay_wrapper(self.replay_buffer) - # Create the schedule for exploration starting from 1. self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps), initial_p=self.exploration_initial_eps, @@ -242,7 +241,7 @@ def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="D # or if there are not enough samples in the replay buffer can_sample = self.replay_buffer.can_sample(self.batch_size) if can_sample and self.num_timesteps > self.learning_starts \ - and self.num_timesteps % self.train_freq == 0: + and self.num_timesteps % self.train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
# pytype:disable=bad-unpacking if self.prioritized_replay: diff --git a/stable_baselines/deepq/policies.py b/stable_baselines/deepq/policies.py index 5128f5467e..3a2dfec16d 100644 --- a/stable_baselines/deepq/policies.py +++ b/stable_baselines/deepq/policies.py @@ -19,7 +19,7 @@ class DQNPolicy(BasePolicy): :param reuse: (bool) If the policy is reusable or not :param scale: (bool) whether or not to scale the input :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectivly + and the processed observation placeholder respectively :param dueling: (bool) if true double the output MLP to compute a baseline for action scores """ @@ -81,7 +81,7 @@ class FeedForwardPolicy(DQNPolicy): :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp") :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectivly + and the processed observation placeholder respectively :param layer_norm: (bool) enable layer normalisation :param dueling: (bool) if true double the output MLP to compute a baseline for action scores :param act_fun: (tf.func) the activation function to use in the neural network. @@ -164,7 +164,7 @@ class CnnPolicy(FeedForwardPolicy): :param n_batch: (int) The number of batch to run (n_envs * n_steps) :param reuse: (bool) If the policy is reusable or not :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectivly + and the processed observation placeholder respectively :param dueling: (bool) if true double the output MLP to compute a baseline for action scores :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction """ @@ -188,7 +188,7 @@ class LnCnnPolicy(FeedForwardPolicy): :param n_batch: (int) The number of batch to run (n_envs * n_steps) :param reuse: (bool) If the policy is reusable or not :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectivly + and the processed observation placeholder respectively :param dueling: (bool) if true double the output MLP to compute a baseline for action scores :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction """ @@ -212,7 +212,7 @@ class MlpPolicy(FeedForwardPolicy): :param n_batch: (int) The number of batch to run (n_envs * n_steps) :param reuse: (bool) If the policy is reusable or not :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectivly + and the processed observation placeholder respectively :param dueling: (bool) if true double the output MLP to compute a baseline for action scores :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction """ @@ -236,7 +236,7 @@ class LnMlpPolicy(FeedForwardPolicy): :param n_batch: (int) The number of batch to run (n_envs * n_steps) :param reuse: (bool) If the policy is reusable or not :param obs_phs: (TensorFlow Tensor, TensorFlow Tensor) a tuple containing an override for observation placeholder - and the processed observation placeholder respectivly 
+ and the processed observation placeholder respectively :param dueling: (bool) if true double the output MLP to compute a baseline for action scores :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction """ diff --git a/stable_baselines/deepq/replay_buffer.py b/stable_baselines/deepq/replay_buffer.py index 6c78328829..b274e51597 100644 --- a/stable_baselines/deepq/replay_buffer.py +++ b/stable_baselines/deepq/replay_buffer.py @@ -22,7 +22,7 @@ def __len__(self): @property def storage(self): - """[(np.ndarray, float, float, np.ndarray, bool)]: content of the replay buffer""" + """[(Union[np.ndarray, int], Union[np.ndarray, int], float, Union[np.ndarray, int], bool)]: content of the replay buffer""" return self._storage @property @@ -52,10 +52,10 @@ def add(self, obs_t, action, reward, obs_tp1, done): """ add a new transition to the buffer - :param obs_t: (Any) the last observation - :param action: ([float]) the action + :param obs_t: (Union[np.ndarray, int]) the last observation + :param action: (Union[np.ndarray, int]) the action :param reward: (float) the reward of the transition - :param obs_tp1: (Any) the current observation + :param obs_tp1: (Union[np.ndarray, int]) the current observation :param done: (bool) is the episode done """ data = (obs_t, action, reward, obs_tp1, done) diff --git a/stable_baselines/gail/adversary.py b/stable_baselines/gail/adversary.py index ade1d977c1..7c6cb63c68 100644 --- a/stable_baselines/gail/adversary.py +++ b/stable_baselines/gail/adversary.py @@ -26,7 +26,7 @@ def logit_bernoulli_entropy(logits): https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51 :param logits: (tf.Tensor) the logits - :return: (tf.Tensor) the bernoulli entropy + :return: (tf.Tensor) the Bernoulli entropy """ ent = (1. 
- tf.nn.sigmoid(logits)) * logits - logsigmoid(logits) return ent diff --git a/stable_baselines/gail/dataset/dataset.py b/stable_baselines/gail/dataset/dataset.py index b64236154e..6b8aa035da 100644 --- a/stable_baselines/gail/dataset/dataset.py +++ b/stable_baselines/gail/dataset/dataset.py @@ -181,7 +181,7 @@ class DataLoader(object): :param actions: (np.ndarray) actions :param batch_size: (int) Number of samples per minibatch :param n_workers: (int) number of preprocessing worker (for loading the images) - :param infinite_loop: (bool) whether to have an iterator that can be resetted + :param infinite_loop: (bool) whether to have an iterator that can be reset :param max_queue_len: (int) Max number of minibatches that can be preprocessed at the same time :param shuffle: (bool) Shuffle the minibatch after each epoch :param start_process: (bool) Start the preprocessing process (default: True) diff --git a/stable_baselines/ppo2/ppo2.py b/stable_baselines/ppo2/ppo2.py index 6d998d2d18..af55db4b10 100644 --- a/stable_baselines/ppo2/ppo2.py +++ b/stable_baselines/ppo2/ppo2.py @@ -220,7 +220,7 @@ def setup_model(self): if self.clip_range_vf_ph is not None: tf.summary.scalar('clip_range_vf', tf.reduce_mean(self.clip_range_vf_ph)) - tf.summary.scalar('old_neglog_action_probabilty', tf.reduce_mean(self.old_neglog_pac_ph)) + tf.summary.scalar('old_neglog_action_probability', tf.reduce_mean(self.old_neglog_pac_ph)) tf.summary.scalar('old_value_pred', tf.reduce_mean(self.old_vpred_ph)) if self.full_tensorboard_log: @@ -228,7 +228,7 @@ def setup_model(self): tf.summary.histogram('learning_rate', self.learning_rate_ph) tf.summary.histogram('advantage', self.advs_ph) tf.summary.histogram('clip_range', self.clip_range_ph) - tf.summary.histogram('old_neglog_action_probabilty', self.old_neglog_pac_ph) + tf.summary.histogram('old_neglog_action_probability', self.old_neglog_pac_ph) tf.summary.histogram('old_value_pred', self.old_vpred_ph) if tf_util.is_image(self.observation_space): tf.summary.image('observation', train_model.obs_ph) @@ -324,7 +324,11 @@ def learn(self, total_timesteps, callback=None, log_interval=1, tb_log_name="PPO n_updates = total_timesteps // self.n_batch for update in range(1, n_updates + 1): - assert self.n_batch % self.nminibatches == 0 + assert self.n_batch % self.nminibatches == 0, ("The number of minibatches (`nminibatches`) " + "is not a factor of the total number of samples " + "collected per rollout (`n_batch`), " + "some samples won't be used." + ) batch_size = self.n_batch // self.nminibatches t_start = time.time() frac = 1.0 - (update - 1.0) / n_updates diff --git a/stable_baselines/sac/policies.py b/stable_baselines/sac/policies.py index 2d2c5053cc..b9337e98a6 100644 --- a/stable_baselines/sac/policies.py +++ b/stable_baselines/sac/policies.py @@ -26,7 +26,7 @@ def gaussian_likelihood(input_, mu_, log_std): def gaussian_entropy(log_std): """ - Compute the entropy for a diagonal gaussian distribution. + Compute the entropy for a diagonal Gaussian distribution. :param log_std: (tf.Tensor) Log of the standard deviation :return: (tf.Tensor) @@ -61,7 +61,7 @@ def clip_but_pass_gradient(input_, lower=-1., upper=1.): def apply_squashing_func(mu_, pi_, logp_pi): """ - Squash the ouput of the gaussian distribution + Squash the output of the Gaussian distribution and account for that in the log probability The squashed mean is also returned for using deterministic actions. 
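For context on the squashing described above: when a Gaussian sample `u` is passed through `tanh`, the log probability must be corrected by the change-of-variables term. Below is a hedged numpy sketch of that standard correction; the actual `apply_squashing_func` implementation may differ in details such as numerical-stability clipping.

```python
import numpy as np


def squash_and_correct_logp(mu, log_std, rng=np.random):
    """Sample from a diagonal Gaussian, squash with tanh, correct the log-prob."""
    std = np.exp(log_std)
    u = mu + std * rng.standard_normal(mu.shape)  # pre-squash sample
    # Diagonal Gaussian log-likelihood of u
    logp_u = -0.5 * np.sum(((u - mu) / std) ** 2 + 2 * log_std + np.log(2 * np.pi), axis=-1)
    a = np.tanh(u)  # squashed action in (-1, 1)
    # Change of variables: log p(a) = log p(u) - sum_i log(1 - tanh(u_i)^2)
    logp_a = logp_u - np.sum(np.log(1.0 - a ** 2 + 1e-6), axis=-1)
    return a, logp_a


mu = np.zeros(2)
log_std = np.zeros(2)
action, logp = squash_and_correct_logp(mu, log_std)
```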
@@ -99,7 +99,6 @@ class SACPolicy(BasePolicy): def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False): super(SACPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale) assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box" - assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric" self.qf1 = None self.qf2 = None @@ -114,7 +113,7 @@ def make_actor(self, obs=None, reuse=False, scope="pi"): Creates an actor object :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) - :param reuse: (bool) whether or not to resue parameters + :param reuse: (bool) whether or not to reuse parameters :param scope: (str) the scope name of the actor :return: (TensorFlow Tensor) the output tensor """ @@ -127,7 +126,7 @@ def make_critics(self, obs=None, action=None, reuse=False, :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) :param action: (TensorFlow Tensor) The action placeholder - :param reuse: (bool) whether or not to resue parameters + :param reuse: (bool) whether or not to reuse parameters :param scope: (str) the scope name :param create_vf: (bool) Whether to create Value fn or not :param create_qf: (bool) Whether to create Q-Values fn or not @@ -236,7 +235,7 @@ def make_actor(self, obs=None, reuse=False, scope="pi"): logp_pi = gaussian_likelihood(pi_, mu_, log_std) self.entropy = gaussian_entropy(log_std) # MISSING: reg params for log and mu - # Apply squashing and account for it in the probabilty + # Apply squashing and account for it in the probability deterministic_policy, policy, logp_pi = apply_squashing_func(mu_, pi_, logp_pi) self.policy = policy self.deterministic_policy = deterministic_policy diff --git a/stable_baselines/sac/sac.py b/stable_baselines/sac/sac.py index 8712806e5f..33ef511249 100644 --- a/stable_baselines/sac/sac.py +++ b/stable_baselines/sac/sac.py @@ -9,6 +9,7 @@ from stable_baselines.a2c.utils import total_episode_reward_logger from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter from stable_baselines.common.vec_env import VecEnv +from stable_baselines.common.math_util import unscale_action, scale_action from stable_baselines.deepq.replay_buffer import ReplayBuffer from stable_baselines.ppo2.ppo2 import safe_mean, get_schedule_fn from stable_baselines.sac.policies import SACPolicy @@ -139,7 +140,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000 def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale - deterministic_action = self.deterministic_action * np.abs(self.action_space.low) + deterministic_action = unscale_action(self.action_space, self.deterministic_action) return policy.obs_ph, self.actions_ph, deterministic_action def setup_model(self): @@ -175,7 +176,7 @@ def setup_model(self): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training - # logp_pi is the log probabilty of actions taken by the policy + # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training @@ -249,7 +250,7 @@ def setup_model(self): policy_kl_loss = tf.reduce_mean(self.ent_coef * 
logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional - # regularization loss for the gaussian parameters + # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss @@ -405,22 +406,23 @@ def learn(self, total_timesteps, callback=None, # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) - if (self.num_timesteps < self.learning_starts - or np.random.rand() < self.random_exploration): - # No need to rescale when sampling random action - rescaled_action = action = self.env.action_space.sample() + if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration: + # actions sampled from action space are from range specific to the environment + # but algorithm operates on tanh-squashed actions therefore simple scaling is used + unscaled_action = self.env.action_space.sample() + action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None], deterministic=False).flatten() # Add noise to the action (improve exploration, # not needed in general) if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) - # Rescale from [-1, 1] to the correct bounds - rescaled_action = action * np.abs(self.action_space.low) + # inferred actions need to be transformed to environment action_space before stepping + unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape - new_obs, reward, done, info = self.env.step(rescaled_action) + new_obs, reward, done, info = self.env.step(unscaled_action) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) @@ -508,7 +510,7 @@ def action_probability(self, observation, state=None, mask=None, actions=None, l raise ValueError("Error: SAC does not have action probabilities.") warnings.warn("Even though SAC has a Gaussian policy, it cannot return a distribution as it " - "is squashed by a tanh before being scaled and ouputed.") + "is squashed by a tanh before being scaled and outputed.") return None @@ -519,7 +521,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True): observation = observation.reshape((-1,) + self.observation_space.shape) actions = self.policy_tf.step(observation, deterministic=deterministic) actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape - actions = actions * np.abs(self.action_space.low) # scale the output for the prediction + actions = unscale_action(self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] diff --git a/stable_baselines/td3/policies.py b/stable_baselines/td3/policies.py index 9e0c83fd6b..d1b42ba142 100644 --- a/stable_baselines/td3/policies.py +++ b/stable_baselines/td3/policies.py @@ -23,7 +23,6 @@ class TD3Policy(BasePolicy): def __init__(self, sess, ob_space, ac_space, n_env=1, n_steps=1, n_batch=None, reuse=False, scale=False): super(TD3Policy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=scale) assert isinstance(ac_space, Box), "Error: the action space must be of type gym.spaces.Box" - assert (np.abs(ac_space.low) == ac_space.high).all(), "Error: the action space low and high must be symmetric" self.qf1 = None self.qf2 = None @@ -34,7 +33,7 @@ def make_actor(self, obs=None, reuse=False, scope="pi"): Creates an actor object :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) - :param reuse: (bool) whether or not to resue parameters + :param reuse: (bool) whether or not to reuse parameters :param scope: (str) the scope name of the actor :return: (TensorFlow Tensor) the output tensor """ @@ -47,7 +46,7 @@ def make_critics(self, obs=None, action=None, reuse=False, :param obs: (TensorFlow Tensor) The observation placeholder (can be None for default placeholder) :param action: (TensorFlow Tensor) The action placeholder - :param reuse: (bool) whether or not to resue parameters + :param reuse: (bool) whether or not to reuse parameters :param scope: (str) the scope name :return: ([tf.Tensor]) Mean, action and log probability """ diff --git a/stable_baselines/td3/td3.py b/stable_baselines/td3/td3.py index eb0dd0fb3b..2f66a44082 100644 --- a/stable_baselines/td3/td3.py +++ b/stable_baselines/td3/td3.py @@ -9,6 +9,7 @@ from stable_baselines.a2c.utils import total_episode_reward_logger from stable_baselines.common import tf_util, OffPolicyRLModel, SetVerbosity, TensorboardWriter from stable_baselines.common.vec_env import VecEnv +from stable_baselines.common.math_util import unscale_action, scale_action from stable_baselines.deepq.replay_buffer import ReplayBuffer from stable_baselines.ppo2.ppo2 import safe_mean, get_schedule_fn from stable_baselines.sac.sac import get_vars @@ -37,7 +38,7 @@ class TD3(OffPolicyRLModel): :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay steps per training steps. The Q values will be updated policy_delay more often (update every training step). :param action_noise: (ActionNoise) the action noise type. 
Cf DDPG for the different action noise type. - :param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy + :param target_policy_noise: (float) Standard deviation of Gaussian noise added to target policy (smoothing noise) :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise. :param train_freq: (int) Update the model every `train_freq` steps. @@ -120,7 +121,7 @@ def __init__(self, policy, env, gamma=0.99, learning_rate=3e-4, buffer_size=5000 def _get_pretrain_placeholders(self): policy = self.policy_tf # Rescale - policy_out = self.policy_out * np.abs(self.action_space.low) + policy_out = unscale_action(self.action_space, self.policy_out) return policy.obs_ph, self.actions_ph, policy_out def setup_model(self): @@ -316,10 +317,11 @@ def learn(self, total_timesteps, callback=None, # from a uniform distribution for better exploration. # Afterwards, use the learned policy # if random_exploration is set to 0 (normal setting) - if (self.num_timesteps < self.learning_starts - or np.random.rand() < self.random_exploration): - # No need to rescale when sampling random action - rescaled_action = action = self.env.action_space.sample() + if self.num_timesteps < self.learning_starts or np.random.rand() < self.random_exploration: + # actions sampled from action space are from range specific to the environment + # but algorithm operates on tanh-squashed actions therefore simple scaling is used + unscaled_action = self.env.action_space.sample() + action = scale_action(self.action_space, unscaled_action) else: action = self.policy_tf.step(obs[None]).flatten() # Add noise to the action, as the policy @@ -327,11 +329,11 @@ def learn(self, total_timesteps, callback=None, if self.action_noise is not None: action = np.clip(action + self.action_noise(), -1, 1) # Rescale from [-1, 1] to the correct bounds - rescaled_action = action * np.abs(self.action_space.low) + unscaled_action = unscale_action(self.action_space, action) assert action.shape == self.env.action_space.shape - new_obs, reward, done, info = self.env.step(rescaled_action) + new_obs, reward, done, info = self.env.step(unscaled_action) # Store transition in the replay buffer. 
self.replay_buffer.add(obs, action, reward, new_obs, float(done)) @@ -435,7 +437,7 @@ def predict(self, observation, state=None, mask=None, deterministic=True): actions = np.clip(actions + self.action_noise(), -1, 1) actions = actions.reshape((-1,) + self.action_space.shape) # reshape to the correct action shape - actions = actions * np.abs(self.action_space.low) # scale the output for the prediction + actions = unscale_action(self.action_space, actions) # scale the output for the prediction if not vectorized_env: actions = actions[0] diff --git a/tests/test_0deterministic.py b/tests/test_0deterministic.py index 1ac6e855fd..506468d04b 100644 --- a/tests/test_0deterministic.py +++ b/tests/test_0deterministic.py @@ -1,6 +1,6 @@ import pytest -from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO, TD3 +# from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO, TD3 from stable_baselines.common.noise import NormalActionNoise N_STEPS_TRAINING = 5000 @@ -8,7 +8,8 @@ # Weird stuff: TD3 would fail if another algorithm is tested before # with n_cpu_tf_sess > 1 -@pytest.mark.parametrize("algo", [A2C, ACKTR, ACER, DDPG, DQN, PPO1, PPO2, SAC, TRPO, TD3]) +# @pytest.mark.parametrize("algo", [A2C, ACKTR, ACER, DDPG, DQN, PPO1, PPO2, SAC, TRPO, TD3]) +@pytest.mark.parametrize("algo", []) def test_deterministic_training_common(algo): results = [[], []] rewards = [[], []] diff --git a/tests/test_a2c_conv.py b/tests/test_a2c_conv.py index 99953b940f..f77e3d4780 100644 --- a/tests/test_a2c_conv.py +++ b/tests/test_a2c_conv.py @@ -1,15 +1,17 @@ +import pytest + import gym import numpy as np import tensorflow as tf -from stable_baselines.a2c.utils import conv -from stable_baselines.common.input import observation_input +# from stable_baselines.a2c.utils import conv +# from stable_baselines.common.input import observation_input ENV_ID = 'BreakoutNoFrameskip-v4' SEED = 3 - +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_conv_kernel(): """Test convolution kernel with various input formats.""" filter_size_1 = 4 # The size of squared filter for the first layer diff --git a/tests/test_action_scaling.py b/tests/test_action_scaling.py new file mode 100644 index 0000000000..5d6ca20f4d --- /dev/null +++ b/tests/test_action_scaling.py @@ -0,0 +1,45 @@ +import pytest +import numpy as np + +from stable_baselines import DDPG, TD3, SAC +from stable_baselines.common.identity_env import IdentityEnvBox + +ROLLOUT_STEPS = 100 + +MODEL_LIST = [ + (DDPG, dict(nb_train_steps=0, nb_rollout_steps=ROLLOUT_STEPS)), + (TD3, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=0)), + (SAC, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=0)), + (TD3, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=ROLLOUT_STEPS)), + (SAC, dict(train_freq=ROLLOUT_STEPS + 1, learning_starts=ROLLOUT_STEPS)) +] + + +@pytest.mark.parametrize("model_class, model_kwargs", MODEL_LIST) +def test_buffer_actions_scaling(model_class, model_kwargs): + """ + Test if actions are scaled to tanh co-domain before being put in a buffer + for algorithms that use tanh-squashing, i.e., DDPG, TD3, SAC + + :param model_class: (BaseRLModel) A RL Model + :param model_kwargs: (dict) Dictionary containing named arguments to the given algorithm + """ + + # check random and inferred actions as they possibly have different flows + for random_coeff in [0.0, 1.0]: + + env = IdentityEnvBox(-2000, 1000) + + model = model_class("MlpPolicy", env, seed=1, random_exploration=random_coeff, **model_kwargs) + 
model.learn(total_timesteps=ROLLOUT_STEPS) + + assert hasattr(model, 'replay_buffer') + + buffer = model.replay_buffer + + assert buffer.can_sample(ROLLOUT_STEPS) + + _, actions, _, _, _ = buffer.sample(ROLLOUT_STEPS) + + assert not np.any(actions > np.ones_like(actions)) + assert not np.any(actions < -np.ones_like(actions)) diff --git a/tests/test_action_space.py b/tests/test_action_space.py index eefe0dabff..4d5d91b2aa 100644 --- a/tests/test_action_space.py +++ b/tests/test_action_space.py @@ -1,17 +1,18 @@ import pytest import numpy as np -from stable_baselines import A2C, PPO1, PPO2, TRPO +# from stable_baselines import A2C, PPO1, PPO2, TRPO from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete from stable_baselines.common.vec_env import DummyVecEnv from stable_baselines.common.evaluation import evaluate_policy -MODEL_LIST = [ - A2C, - PPO1, - PPO2, - TRPO -] +# MODEL_LIST = [ +# A2C, +# PPO1, +# PPO2, +# TRPO +# ] +MODEL_LIST = [] @pytest.mark.slow diff --git a/tests/test_atari.py b/tests/test_atari.py index 2b94da238d..8850564b42 100644 --- a/tests/test_atari.py +++ b/tests/test_atari.py @@ -1,56 +1,23 @@ import pytest from stable_baselines import bench, logger -from stable_baselines.deepq import DQN, wrap_atari_dqn, CnnPolicy +# from stable_baselines.deepq import DQN, wrap_atari_dqn, CnnPolicy from stable_baselines.common import set_global_seeds from stable_baselines.common.atari_wrappers import make_atari -import stable_baselines.a2c.run_atari as a2c_atari -import stable_baselines.acer.run_atari as acer_atari -import stable_baselines.acktr.run_atari as acktr_atari -import stable_baselines.ppo1.run_atari as ppo1_atari -import stable_baselines.ppo2.run_atari as ppo2_atari -import stable_baselines.trpo_mpi.run_atari as trpo_atari +# import stable_baselines.a2c.run_atari as a2c_atari +# import stable_baselines.acer.run_atari as acer_atari +# import stable_baselines.acktr.run_atari as acktr_atari +# import stable_baselines.ppo1.run_atari as ppo1_atari +# import stable_baselines.ppo2.run_atari as ppo2_atari +# import stable_baselines.trpo_mpi.run_atari as trpo_atari ENV_ID = 'BreakoutNoFrameskip-v4' SEED = 3 NUM_TIMESTEPS = 500 -NUM_CPU = 2 - - -@pytest.mark.slow -@pytest.mark.parametrize("policy", ['cnn', 'lstm', 'lnlstm']) -def test_a2c(policy): - """ - test A2C on atari - - :param policy: (str) the policy to test for A2C - """ - a2c_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, - policy=policy, lr_schedule='constant', num_env=NUM_CPU) - - -@pytest.mark.slow -@pytest.mark.parametrize("policy", ['cnn', 'lstm']) -def test_acer(policy): - """ - test ACER on atari - - :param policy: (str) the policy to test for ACER - """ - acer_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, - policy=policy, lr_schedule='constant', num_cpu=NUM_CPU) - - -@pytest.mark.slow -def test_acktr(): - """ - test ACKTR on atari - """ - acktr_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED, num_cpu=NUM_CPU) - @pytest.mark.slow +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_deepq(): """ test DeepQ on atari @@ -68,32 +35,3 @@ def test_deepq(): env.close() del model, env - - -@pytest.mark.slow -def test_ppo1(): - """ - test PPO1 on atari - """ - ppo1_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED) - - -@pytest.mark.slow -@pytest.mark.parametrize("policy", ['cnn', 'lstm', 'lnlstm', 'mlp']) -def test_ppo2(policy): - """ - test PPO2 on atari - - :param policy: (str) 
the policy to test for PPO2 - """ - ppo2_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, - seed=SEED, policy=policy, n_envs=NUM_CPU, - nminibatches=NUM_CPU, n_steps=16) - - -@pytest.mark.slow -def test_trpo(): - """ - test TRPO on atari - """ - trpo_atari.train(env_id=ENV_ID, num_timesteps=NUM_TIMESTEPS, seed=SEED) diff --git a/tests/test_auto_vec_detection.py b/tests/test_auto_vec_detection.py index 1796657f94..fbadad059f 100644 --- a/tests/test_auto_vec_detection.py +++ b/tests/test_auto_vec_detection.py @@ -1,7 +1,7 @@ import pytest import numpy as np -from stable_baselines import A2C, ACER, ACKTR, DDPG, DQN, PPO1, PPO2, SAC, TRPO, TD3 +# from stable_baselines import A2C, ACER, ACKTR, DDPG, DQN, PPO1, PPO2, SAC, TRPO, TD3 from stable_baselines.common.vec_env import DummyVecEnv from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox, IdentityEnvMultiBinary, \ IdentityEnvMultiDiscrete @@ -21,7 +21,8 @@ def callback(locals_, _globals): @pytest.mark.slow -@pytest.mark.parametrize("model_class", [A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO]) +# @pytest.mark.parametrize("model_class", [A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO]) +@pytest.mark.parametrize("model_class", []) def test_identity(model_class): """ test the Disrete environment vectorisation detection @@ -32,7 +33,8 @@ def test_identity(model_class): @pytest.mark.slow -@pytest.mark.parametrize("model_class", [A2C, DDPG, PPO1, PPO2, SAC, TRPO, TD3]) +# @pytest.mark.parametrize("model_class", [A2C, DDPG, PPO1, PPO2, SAC, TRPO, TD3]) +@pytest.mark.parametrize("model_class", []) def test_identity_box(model_class): """ test the Box environment vectorisation detection @@ -43,7 +45,8 @@ def test_identity_box(model_class): @pytest.mark.slow -@pytest.mark.parametrize("model_class", [A2C, PPO1, PPO2, TRPO]) +# @pytest.mark.parametrize("model_class", [A2C, PPO1, PPO2, TRPO]) +@pytest.mark.parametrize("model_class", []) def test_identity_multi_binary(model_class): """ test the MultiBinary environment vectorisation detection @@ -54,7 +57,8 @@ def test_identity_multi_binary(model_class): @pytest.mark.slow -@pytest.mark.parametrize("model_class", [A2C, PPO1, PPO2, TRPO]) +# @pytest.mark.parametrize("model_class", [A2C, PPO1, PPO2, TRPO]) +@pytest.mark.parametrize("model_class", []) def test_identity_multi_discrete(model_class): """ test the MultiDiscrete environment vectorisation detection diff --git a/tests/test_continuous.py b/tests/test_continuous.py index f1943e0ab9..eb39724080 100644 --- a/tests/test_continuous.py +++ b/tests/test_continuous.py @@ -5,12 +5,12 @@ import pytest import numpy as np -from stable_baselines import A2C, ACKTR, SAC, DDPG, PPO1, PPO2, TRPO, TD3 +# from stable_baselines import A2C, ACKTR, SAC, DDPG, PPO1, PPO2, TRPO, TD3 # TODO: add support for continuous actions # from stable_baselines.acer import ACER from stable_baselines.common.vec_env import DummyVecEnv from stable_baselines.common.identity_env import IdentityEnvBox -from stable_baselines.ddpg import AdaptiveParamNoiseSpec, NormalActionNoise +from stable_baselines.common.noise import AdaptiveParamNoiseSpec, NormalActionNoise from stable_baselines.common.evaluation import evaluate_policy from tests.test_common import _assert_eq @@ -18,17 +18,18 @@ N_EVAL_EPISODES = 20 NUM_TIMESTEPS = 15000 -MODEL_LIST = [ - A2C, - # ACER, - ACKTR, - DDPG, - PPO1, - PPO2, - SAC, - TD3, - TRPO -] +# MODEL_LIST = [ +# A2C, +# # ACER, +# ACKTR, +# DDPG, +# PPO1, +# PPO2, +# SAC, +# TD3, +# TRPO +# ] +MODEL_LIST = [] @pytest.mark.slow @@ -129,7 +130,7 @@ def 
test_model_manipulation(request, model_class): if os.path.exists(model_fname): os.remove(model_fname) - +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_ddpg(): args = ['--env-id', 'Pendulum-v0', '--num-timesteps', 1000, '--noise-type', 'ou_0.01'] args = list(map(str, args)) @@ -137,6 +138,7 @@ def test_ddpg(): _assert_eq(return_code, 0) +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_ddpg_eval_env(): """ Additional test to check that everything is working when passing @@ -149,6 +151,7 @@ def test_ddpg_eval_env(): model.learn(1000) +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_ddpg_normalization(): """ Test that observations and returns normalizations are properly saved and loaded. @@ -176,6 +179,7 @@ def test_ddpg_normalization(): os.remove("./test_ddpg.zip") +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_ddpg_popart(): """ Test DDPG with pop-art normalization diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py index 725d88ffeb..6232f08ec7 100644 --- a/tests/test_custom_policy.py +++ b/tests/test_custom_policy.py @@ -4,64 +4,66 @@ import pytest import tensorflow as tf -from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO, SAC, DDPG -from stable_baselines.common.policies import FeedForwardPolicy -from stable_baselines.common.vec_env import DummyVecEnv -from stable_baselines.deepq.policies import FeedForwardPolicy as DQNPolicy -from stable_baselines.ddpg.policies import FeedForwardPolicy as DDPGPolicy -from stable_baselines.sac.policies import FeedForwardPolicy as SACPolicy - -N_TRIALS = 100 - - -class CustomCommonPolicy(FeedForwardPolicy): - def __init__(self, *args, **kwargs): - # Default value - if 'net_arch' not in kwargs: - kwargs['net_arch'] = [8, dict(vf=[8, 8], pi=[8, 8])] - super(CustomCommonPolicy, self).__init__(*args, **kwargs, - feature_extraction="mlp") - - -class CustomDQNPolicy(DQNPolicy): - def __init__(self, *args, **kwargs): - # Default value - if 'layers' not in kwargs: - kwargs['layers'] = [8, 8] - super(CustomDQNPolicy, self).__init__(*args, **kwargs, - feature_extraction="mlp") - - -class CustomDDPGPolicy(DDPGPolicy): - def __init__(self, *args, **kwargs): - # Default value - if 'layers' not in kwargs: - kwargs['layers'] = [8, 8] - super(CustomDDPGPolicy, self).__init__(*args, **kwargs, - feature_extraction="mlp") - - -class CustomSACPolicy(SACPolicy): - def __init__(self, *args, **kwargs): - # Default value - if 'layers' not in kwargs: - kwargs['layers'] = [8, 8] - super(CustomSACPolicy, self).__init__(*args, **kwargs, - feature_extraction="mlp") - +# from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO, SAC, DDPG +# from stable_baselines.common.policies import FeedForwardPolicy +# from stable_baselines.common.vec_env import DummyVecEnv +# from stable_baselines.deepq.policies import FeedForwardPolicy as DQNPolicy +# from stable_baselines.ddpg.policies import FeedForwardPolicy as DDPGPolicy +# from stable_baselines.sac.policies import FeedForwardPolicy as SACPolicy +# +# N_TRIALS = 100 +# +# +# class CustomCommonPolicy(FeedForwardPolicy): +# def __init__(self, *args, **kwargs): +# # Default value +# if 'net_arch' not in kwargs: +# kwargs['net_arch'] = [8, dict(vf=[8, 8], pi=[8, 8])] +# super(CustomCommonPolicy, self).__init__(*args, **kwargs, +# feature_extraction="mlp") +# +# +# class CustomDQNPolicy(DQNPolicy): +# def __init__(self, *args, **kwargs): +# # Default value +# if 
'layers' not in kwargs: +# kwargs['layers'] = [8, 8] +# super(CustomDQNPolicy, self).__init__(*args, **kwargs, +# feature_extraction="mlp") +# +# +# class CustomDDPGPolicy(DDPGPolicy): +# def __init__(self, *args, **kwargs): +# # Default value +# if 'layers' not in kwargs: +# kwargs['layers'] = [8, 8] +# super(CustomDDPGPolicy, self).__init__(*args, **kwargs, +# feature_extraction="mlp") +# +# +# class CustomSACPolicy(SACPolicy): +# def __init__(self, *args, **kwargs): +# # Default value +# if 'layers' not in kwargs: +# kwargs['layers'] = [8, 8] +# super(CustomSACPolicy, self).__init__(*args, **kwargs, +# feature_extraction="mlp") +# # MODEL_CLASS, POLICY_CLASS, POLICY_KWARGS -MODEL_DICT = { - 'a2c': (A2C, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[12, dict(vf=[16], pi=[8])])), - 'acer': (ACER, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), - 'acktr': (ACKTR, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), - 'dqn': (DQN, CustomDQNPolicy, dict(layers=[4, 4], dueling=False)), - 'ddpg': (DDPG, CustomDDPGPolicy, dict(layers=[16, 16], layer_norm=False)), - 'ppo1': (PPO1, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[8, 4])), - 'ppo2': (PPO2, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[4, 4])), - 'sac': (SAC, CustomSACPolicy, dict(layers=[16, 16])), - 'trpo': (TRPO, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), -} +# MODEL_DICT = { +# 'a2c': (A2C, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[12, dict(vf=[16], pi=[8])])), +# 'acer': (ACER, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), +# 'acktr': (ACKTR, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), +# 'dqn': (DQN, CustomDQNPolicy, dict(layers=[4, 4], dueling=False)), +# 'ddpg': (DDPG, CustomDDPGPolicy, dict(layers=[16, 16], layer_norm=False)), +# 'ppo1': (PPO1, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[8, 4])), +# 'ppo2': (PPO2, CustomCommonPolicy, dict(act_fun=tf.nn.relu, net_arch=[4, 4])), +# 'sac': (SAC, CustomSACPolicy, dict(layers=[16, 16])), +# 'trpo': (TRPO, CustomCommonPolicy, dict(act_fun=tf.nn.relu)), +# } + +MODEL_DICT = {} @pytest.mark.parametrize("model_name", MODEL_DICT.keys()) diff --git a/tests/test_deepq.py b/tests/test_deepq.py deleted file mode 100644 index c15eba6867..0000000000 --- a/tests/test_deepq.py +++ /dev/null @@ -1,31 +0,0 @@ -from stable_baselines.deepq.experiments.custom_cartpole import main as main_custom -from stable_baselines.deepq.experiments.train_cartpole import main as train_cartpole -from stable_baselines.deepq.experiments.enjoy_cartpole import main as enjoy_cartpole -from stable_baselines.deepq.experiments.train_mountaincar import main as train_mountaincar -from stable_baselines.deepq.experiments.enjoy_mountaincar import main as enjoy_mountaincar - - -class DummyObject(object): - """ - Dummy object to create fake Parsed Arguments object - """ - pass - - -args = DummyObject() -args.no_render = True -args.max_timesteps = 200 - - -def test_custom_cartpole(): - main_custom(args) - - -def test_cartpole(): - train_cartpole(args) - enjoy_cartpole(args) - - -def test_mountaincar(): - train_mountaincar(args) - enjoy_mountaincar(args) diff --git a/tests/test_distri.py b/tests/test_distri.py index d3be362617..b8cfa0c484 100644 --- a/tests/test_distri.py +++ b/tests/test_distri.py @@ -1,13 +1,14 @@ +import pytest import numpy as np import tensorflow as tf -import stable_baselines.common.tf_util as tf_util -from stable_baselines.common.distributions import DiagGaussianProbabilityDistributionType,\ - CategoricalProbabilityDistributionType, \ - 
MultiCategoricalProbabilityDistributionType, BernoulliProbabilityDistributionType +# import stable_baselines.common.tf_util as tf_util +# from stable_baselines.common.distributions import DiagGaussianProbabilityDistributionType,\ +# CategoricalProbabilityDistributionType, \ +# MultiCategoricalProbabilityDistributionType, BernoulliProbabilityDistributionType - -@tf_util.in_session +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") +# @tf_util.in_session def test_probtypes(): """ test probability distribution types @@ -32,6 +33,7 @@ def test_probtypes(): validate_probtype(bernoulli, pdparam_bernoulli) +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def validate_probtype(probtype, pdparam): """ validate probability distribution types diff --git a/tests/test_envs.py b/tests/test_envs.py new file mode 100644 index 0000000000..818f7914e6 --- /dev/null +++ b/tests/test_envs.py @@ -0,0 +1,130 @@ +import pytest +import gym +from gym import spaces +import numpy as np + +from stable_baselines.common.env_checker import check_env +from stable_baselines.common.bit_flipping_env import BitFlippingEnv +from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox + + +@pytest.mark.parametrize("env_id", ['CartPole-v0', 'Pendulum-v0', 'BreakoutNoFrameskip-v4']) +def test_env(env_id): + """ + Check that environments integrated in Gym pass the test. + + :param env_id: (str) + """ + env = gym.make(env_id) + with pytest.warns(None) as record: + check_env(env) + + # Pendulum-v0 will produce a warning because the action space is + # in [-2, 2] and not [-1, 1] + if env_id == 'Pendulum-v0': + assert len(record) == 1 + else: + # The other environments must pass without warning + assert len(record) == 0 + + +@pytest.mark.parametrize("env_class", [IdentityEnv, IdentityEnvBox, BitFlippingEnv]) +def test_custom_envs(env_class): + env = env_class() + check_env(env) + + +@pytest.mark.parametrize("new_obs_space", [ + # Small image + spaces.Box(low=0, high=255, shape=(32, 32, 3), dtype=np.uint8), + # Range not in [0, 255] + spaces.Box(low=0, high=1, shape=(64, 64, 3), dtype=np.uint8), + # Wrong dtype + spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.float32), + # Not an image, it should be a 1D vector + spaces.Box(low=-1, high=1, shape=(64, 3), dtype=np.float32), + # Tuple space is not supported by SB + spaces.Tuple([spaces.Discrete(5), spaces.Discrete(10)]), + # Dict space is not supported by SB when env is not a GoalEnv + spaces.Dict({"position": spaces.Discrete(5)}), +]) +def test_non_default_spaces(new_obs_space): + env = gym.make('BreakoutNoFrameskip-v4') + env.observation_space = new_obs_space + # Patch methods to avoid errors + env.reset = new_obs_space.sample + + def patched_step(_action): + return new_obs_space.sample(), 0.0, False, {} + + env.step = patched_step + with pytest.warns(UserWarning): + check_env(env) + + +def check_reset_assert_error(env, new_reset_return): + """ + Helper to check that the error is caught.
+ :param env: (gym.Env) + :param new_reset_return: (Any) + """ + + def wrong_reset(): + return new_reset_return + + # Patch the reset method with a wrong one + env.reset = wrong_reset + with pytest.raises(AssertionError): + check_env(env) + + +def test_common_failures_reset(): + """ + Test that common failure cases of the `reset` method are caught + """ + env = IdentityEnvBox() + # Return an observation that does not match the observation_space + check_reset_assert_error(env, np.ones((3,))) + # The observation is not a numpy array + check_reset_assert_error(env, 1) + + # Return not only the observation + check_reset_assert_error(env, (env.observation_space.sample(), False)) + + +def check_step_assert_error(env, new_step_return=()): + """ + Helper to check that the error is caught. + :param env: (gym.Env) + :param new_step_return: (tuple) + """ + + def wrong_step(_action): + return new_step_return + + # Patch the step method with a wrong one + env.step = wrong_step + with pytest.raises(AssertionError): + check_env(env) + + +def test_common_failures_step(): + """ + Test that common failure cases of the `step` method are caught + """ + env = IdentityEnvBox() + + # Wrong shape for the observation + check_step_assert_error(env, (np.ones((4,)), 1.0, False, {})) + # Obs is not a numpy array + check_step_assert_error(env, (1, 1.0, False, {})) + + # Return a wrong reward + check_step_assert_error(env, (env.observation_space.sample(), np.ones(1), False, {})) + + # Info dict is not returned + check_step_assert_error(env, (env.observation_space.sample(), 0.0, False)) + + # Done is not a boolean + check_step_assert_error(env, (env.observation_space.sample(), 0.0, 3.0, {})) + check_step_assert_error(env, (env.observation_space.sample(), 0.0, 1, {})) diff --git a/tests/test_gail.py b/tests/test_gail.py index 1bc98c90fc..bc9338c38d 100644 --- a/tests/test_gail.py +++ b/tests/test_gail.py @@ -4,20 +4,21 @@ import numpy as np import pytest -from stable_baselines import A2C, ACER, ACKTR, GAIL, DDPG, DQN, PPO1, PPO2,\ - TD3, TRPO, SAC +# from stable_baselines import A2C, ACER, ACKTR, GAIL, DDPG, DQN, PPO1, PPO2,\ +# TD3, TRPO, SAC from stable_baselines.common.cmd_util import make_atari_env from stable_baselines.common.vec_env import VecFrameStack from stable_baselines.common.evaluation import evaluate_policy -from stable_baselines.gail import ExpertDataset, generate_expert_traj +# from stable_baselines.gail import ExpertDataset, generate_expert_traj EXPERT_PATH_PENDULUM = "stable_baselines/gail/dataset/expert_pendulum.npz" EXPERT_PATH_DISCRETE = "stable_baselines/gail/dataset/expert_cartpole.npz" -@pytest.mark.parametrize("expert_env", [('Pendulum-v0', EXPERT_PATH_PENDULUM, True), - ('CartPole-v1', EXPERT_PATH_DISCRETE, False)]) +@pytest.mark.parametrize("expert_env", []) +# @pytest.mark.parametrize("expert_env", [('Pendulum-v0', EXPERT_PATH_PENDULUM, True), +# ('CartPole-v1', EXPERT_PATH_DISCRETE, False)]) def test_gail(expert_env): env_id, expert_path, load_from_memory = expert_env env = gym.make(env_id) @@ -41,13 +42,15 @@ def test_gail(expert_env): evaluate_policy(model, env, n_eval_episodes=5) del dataset, model -@pytest.mark.parametrize("generate_env", [ - (SAC, 'MlpPolicy', 'Pendulum-v0', 1, 10), - (DQN, 'MlpPolicy', 'CartPole-v1', 1, 10), - (A2C, 'MlpLstmPolicy', 'Pendulum-v0', 1, 10), - (A2C, 'MlpLstmPolicy', 'CartPole-v1', 1, 10), - (A2C, 'CnnPolicy', 'BreakoutNoFrameskip-v4', 8, 1), - ]) + +@pytest.mark.parametrize("generate_env", []) +# @pytest.mark.parametrize("generate_env", [ +# (SAC, 
'MlpPolicy', 'Pendulum-v0', 1, 10), +# (DQN, 'MlpPolicy', 'CartPole-v1', 1, 10), +# (A2C, 'MlpLstmPolicy', 'Pendulum-v0', 1, 10), +# (A2C, 'MlpLstmPolicy', 'CartPole-v1', 1, 10), +# (A2C, 'CnnPolicy', 'BreakoutNoFrameskip-v4', 8, 1), +# ]) def test_generate(generate_env): model, policy, env_name, n_env, n_episodes = generate_env @@ -74,6 +77,7 @@ def test_generate(generate_env): assert (dataset[key] == dataset_loaded[key]).all(), "different data at '{}'".format(key) +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_generate_callable(): """ Test generating expert trajectories with a callable. @@ -85,6 +89,7 @@ def dummy_expert(_obs): generate_expert_traj(dummy_expert, 'dummy_expert_cartpole', env, n_timesteps=0, n_episodes=10) +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_pretrain_images(): env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0) env = VecFrameStack(env, n_stack=4) @@ -102,7 +107,8 @@ def test_pretrain_images(): del dataset, model, env -@pytest.mark.parametrize("model_class", [A2C, ACKTR, GAIL, DDPG, PPO1, PPO2, SAC, TD3, TRPO]) +# @pytest.mark.parametrize("model_class", [A2C, ACKTR, GAIL, DDPG, PPO1, PPO2, SAC, TD3, TRPO]) +@pytest.mark.parametrize("model_class", []) def test_behavior_cloning_box(model_class): """ Behavior cloning with continuous actions. @@ -115,7 +121,8 @@ def test_behavior_cloning_box(model_class): del dataset, model -@pytest.mark.parametrize("model_class", [A2C, ACER, ACKTR, DQN, GAIL, PPO1, PPO2, TRPO]) +# @pytest.mark.parametrize("model_class", [A2C, ACER, ACKTR, DQN, GAIL, PPO1, PPO2, TRPO]) +@pytest.mark.parametrize("model_class", []) def test_behavior_cloning_discrete(model_class): dataset = ExpertDataset(expert_path=EXPERT_PATH_DISCRETE, traj_limitation=10, sequential_preprocessing=True, verbose=0) @@ -125,6 +132,7 @@ def test_behavior_cloning_discrete(model_class): del dataset, model +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_dataset_param_validation(): with pytest.raises(ValueError): ExpertDataset() diff --git a/tests/test_her.py b/tests/test_her.py index 40c090c634..f0f6c0ad95 100644 --- a/tests/test_her.py +++ b/tests/test_her.py @@ -2,9 +2,9 @@ import pytest -from stable_baselines import HER, DQN, SAC, DDPG, TD3 -from stable_baselines.her import GoalSelectionStrategy, HERGoalEnvWrapper -from stable_baselines.her.replay_buffer import KEY_TO_GOAL_STRATEGY +# from stable_baselines import HER, DQN, SAC, DDPG, TD3 +# from stable_baselines.her import GoalSelectionStrategy, HERGoalEnvWrapper +# from stable_baselines.her.replay_buffer import KEY_TO_GOAL_STRATEGY from stable_baselines.common.bit_flipping_env import BitFlippingEnv from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize @@ -31,8 +31,9 @@ def model_predict(model, env, n_steps, additional_check=None): obs = env.reset() -@pytest.mark.parametrize('goal_selection_strategy', list(GoalSelectionStrategy)) -@pytest.mark.parametrize('model_class', [DQN, SAC, DDPG, TD3]) +# @pytest.mark.parametrize('goal_selection_strategy', list(GoalSelectionStrategy)) +# @pytest.mark.parametrize('model_class', [DQN, SAC, DDPG, TD3]) +@pytest.mark.parametrize('model_class', []) @pytest.mark.parametrize('discrete_obs_space', [False, True]) def test_her(model_class, goal_selection_strategy, discrete_obs_space): env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], @@ -45,7 +46,8 @@ def test_her(model_class, goal_selection_strategy, discrete_obs_space): 
model.learn(1000) -@pytest.mark.parametrize('model_class', [DDPG, SAC, DQN, TD3]) +# @pytest.mark.parametrize('model_class', [DDPG, SAC, DQN, TD3]) +@pytest.mark.parametrize('model_class', []) def test_long_episode(model_class): """ Check that the model does not break when the replay buffer is still empty @@ -67,8 +69,9 @@ def test_long_episode(model_class): model.learn(200) -@pytest.mark.parametrize('goal_selection_strategy', [list(KEY_TO_GOAL_STRATEGY.keys())[0]]) -@pytest.mark.parametrize('model_class', [DQN, SAC, DDPG, TD3]) +# @pytest.mark.parametrize('goal_selection_strategy', [list(KEY_TO_GOAL_STRATEGY.keys())[0]]) +# @pytest.mark.parametrize('model_class', [DQN, SAC, DDPG, TD3]) +@pytest.mark.parametrize('model_class', []) def test_model_manipulation(model_class, goal_selection_strategy): env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS) env = DummyVecEnv([lambda: env]) diff --git a/tests/test_identity.py b/tests/test_identity.py index ded682e685..c0c93ad16a 100644 --- a/tests/test_identity.py +++ b/tests/test_identity.py @@ -1,34 +1,36 @@ import pytest import numpy as np -from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO -from stable_baselines.ddpg import NormalActionNoise +# from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO +from stable_baselines.common.noise import NormalActionNoise from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox from stable_baselines.common.vec_env import DummyVecEnv from stable_baselines.common.evaluation import evaluate_policy # Hyperparameters for learning identity for each RL model -LEARN_FUNC_DICT = { - 'a2c': lambda e: A2C(policy="MlpPolicy", learning_rate=1e-3, n_steps=1, - gamma=0.7, env=e, seed=0).learn(total_timesteps=10000), - 'acer': lambda e: ACER(policy="MlpPolicy", env=e, seed=0, - n_steps=1, replay_ratio=1).learn(total_timesteps=15000), - 'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e, seed=0, - learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000), - 'dqn': lambda e: DQN(policy="MlpPolicy", batch_size=16, gamma=0.1, - exploration_fraction=0.001, env=e, seed=0).learn(total_timesteps=40000), - 'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, seed=0, lam=0.5, - optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000), - 'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e, seed=0, - learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000), - 'trpo': lambda e: TRPO(policy="MlpPolicy", env=e, seed=0, - max_kl=0.05, lam=0.7).learn(total_timesteps=10000), -} +# LEARN_FUNC_DICT = { +# 'a2c': lambda e: A2C(policy="MlpPolicy", learning_rate=1e-3, n_steps=1, +# gamma=0.7, env=e, seed=0).learn(total_timesteps=10000), +# 'acer': lambda e: ACER(policy="MlpPolicy", env=e, seed=0, +# n_steps=1, replay_ratio=1).learn(total_timesteps=15000), +# 'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e, seed=0, +# learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000), +# 'dqn': lambda e: DQN(policy="MlpPolicy", batch_size=16, gamma=0.1, +# exploration_fraction=0.001, env=e, seed=0).learn(total_timesteps=40000), +# 'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, seed=0, lam=0.5, +# optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000), +# 'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e, seed=0, +# learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000), +# 'trpo': lambda e: TRPO(policy="MlpPolicy", env=e, seed=0, +# max_kl=0.05, lam=0.7).learn(total_timesteps=10000), +# } 
+LEARN_FUNC_DICT = {} @pytest.mark.slow -@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo']) +# @pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo']) +@pytest.mark.parametrize("model_name", []) def test_identity(model_name): """ Test if the algorithm (with a given policy) @@ -54,7 +56,8 @@ def test_identity(model_name): @pytest.mark.slow -@pytest.mark.parametrize("model_class", [DDPG, TD3, SAC]) +# @pytest.mark.parametrize("model_class", [DDPG, TD3, SAC]) +@pytest.mark.parametrize("model_class", []) def test_identity_continuous(model_class): """ Test if the algorithm (with a given policy) diff --git a/tests/test_load_parameters.py b/tests/test_load_parameters.py index bb294d7a16..cfac3d791e 100644 --- a/tests/test_load_parameters.py +++ b/tests/test_load_parameters.py @@ -4,19 +4,21 @@ import pytest import numpy as np -from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO +# from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO from stable_baselines.common.identity_env import IdentityEnv from stable_baselines.common.vec_env import DummyVecEnv -MODEL_LIST = [ - A2C, - ACER, - ACKTR, - DQN, - PPO1, - PPO2, - TRPO, -] +# MODEL_LIST = [ +# A2C, +# ACER, +# ACKTR, +# DQN, +# PPO1, +# PPO2, +# TRPO, +# ] + +MODEL_LIST = [] @pytest.mark.parametrize("model_class", MODEL_LIST) def test_load_parameters(request, model_class): diff --git a/tests/test_logger.py b/tests/test_logger.py index 3b9ab56904..530d683fe7 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -24,7 +24,9 @@ def test_main(): _demo() -@pytest.mark.parametrize('_format', ['tensorboard', 'stdout', 'log', 'json', 'csv']) +# @pytest.mark.parametrize('_format', ['tensorboard', 'stdout', 'log', 'json', 'csv']) +@pytest.mark.parametrize('_format', ['stdout', 'log', 'json', 'csv']) +# @pytest.mark.parametrize('mpi_disabled', [False, True]) @pytest.mark.parametrize('mpi_disabled', [False, True]) def test_make_output(_format, mpi_disabled): """ diff --git a/tests/test_lstm_policy.py b/tests/test_lstm_policy.py index 63e65e4df0..56b5a142cd 100644 --- a/tests/test_lstm_policy.py +++ b/tests/test_lstm_policy.py @@ -6,37 +6,37 @@ import numpy as np import pytest -from stable_baselines import A2C, ACER, ACKTR, PPO2, bench -from stable_baselines.common.policies import MlpLstmPolicy, LstmPolicy -from stable_baselines.common.vec_env import SubprocVecEnv -from stable_baselines.common.vec_env.vec_normalize import VecNormalize -from stable_baselines.ppo2.ppo2 import safe_mean -from stable_baselines.common.evaluation import evaluate_policy - - -class CustomLSTMPolicy1(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=128, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, net_arch=[8, 'lstm', 8], - layer_norm=False, feature_extraction="mlp", **_kwargs) - - -class CustomLSTMPolicy2(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - net_arch=['lstm', 8], layer_norm=True, feature_extraction="mlp", **_kwargs) - - -class CustomLSTMPolicy3(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - net_arch=[8, 'lstm'], layer_norm=False, 
feature_extraction="mlp", **_kwargs) - - -class CustomLSTMPolicy4(LstmPolicy): - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): - super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, - net_arch=[8, 'lstm', dict(vf=[5, 10], pi=[10])], - layer_norm=True, feature_extraction="mlp", **_kwargs) +# from stable_baselines import A2C, ACER, ACKTR, PPO2, bench +# from stable_baselines.common.policies import MlpLstmPolicy, LstmPolicy +# from stable_baselines.common.vec_env import SubprocVecEnv +# from stable_baselines.common.vec_env.vec_normalize import VecNormalize +# from stable_baselines.ppo2.ppo2 import safe_mean +# from stable_baselines.common.evaluation import evaluate_policy +# +# +# class CustomLSTMPolicy1(LstmPolicy): +# def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=128, reuse=False, **_kwargs): +# super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, net_arch=[8, 'lstm', 8], +# layer_norm=False, feature_extraction="mlp", **_kwargs) +# +# +# class CustomLSTMPolicy2(LstmPolicy): +# def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): +# super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, +# net_arch=['lstm', 8], layer_norm=True, feature_extraction="mlp", **_kwargs) +# +# +# class CustomLSTMPolicy3(LstmPolicy): +# def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): +# super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, +# net_arch=[8, 'lstm'], layer_norm=False, feature_extraction="mlp", **_kwargs) +# +# +# class CustomLSTMPolicy4(LstmPolicy): +# def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=64, reuse=False, **_kwargs): +# super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse, +# net_arch=[8, 'lstm', dict(vf=[5, 10], pi=[10])], +# layer_norm=True, feature_extraction="mlp", **_kwargs) class CartPoleNoVelEnv(CartPoleEnv): @@ -68,8 +68,10 @@ def step(self, action): NUM_ENVS = 16 NUM_EPISODES_FOR_SCORE = 10 -MODELS = [A2C, ACER, ACKTR, PPO2] -LSTM_POLICIES = [MlpLstmPolicy, CustomLSTMPolicy1, CustomLSTMPolicy2, CustomLSTMPolicy3, CustomLSTMPolicy4] +# MODELS = [A2C, ACER, ACKTR, PPO2] +# LSTM_POLICIES = [MlpLstmPolicy, CustomLSTMPolicy1, CustomLSTMPolicy2, CustomLSTMPolicy3, CustomLSTMPolicy4] +MODELS = [] +LSTM_POLICIES = [] @pytest.mark.parametrize("model_class", MODELS) diff --git a/tests/test_math_util.py b/tests/test_math_util.py index aac4107a93..584ba98d47 100644 --- a/tests/test_math_util.py +++ b/tests/test_math_util.py @@ -1,6 +1,8 @@ +import tensorflow as tf import numpy as np +from gym.spaces.box import Box -from stable_baselines.common.math_util import discount_with_boundaries +from stable_baselines.common.math_util import discount_with_boundaries, scale_action, unscale_action def test_discount_with_boundaries(): @@ -13,3 +15,67 @@ def test_discount_with_boundaries(): discounted_rewards = discount_with_boundaries(rewards, episode_starts, gamma) assert np.allclose(discounted_rewards, [1 + gamma * 2 + gamma ** 2 * 3, 2 + gamma * 3, 3, 4]) return + + +def test_scaling_action(): + """ + test scaling of scalar, 1d and 2d vectors of finite non-NaN real numbers to and from tanh co-domain (per component) + """ + test_ranges = [(-1, 1), (-10, 10), (-10, 5), (-10, 0), (-10, -5), (0, 10), (5, 10)] + + # scalars + for 
(range_low, range_high) in test_ranges: + check_scaled_actions_from_range(range_low, range_high, scalar=True) + + # 1d vectors: wrapped scalars + for test_range in test_ranges: + check_scaled_actions_from_range(*test_range) + + # 2d vectors: all combinations of ranges above + for (r1_low, r1_high) in test_ranges: + for (r2_low, r2_high) in test_ranges: + check_scaled_actions_from_range(np.array([r1_low, r2_low], dtype=np.float), + np.array([r1_high, r2_high], dtype=np.float)) + + +def check_scaled_actions_from_range(low, high, scalar=False): + """ + helper method which creates dummy action space spanning between respective components of low and high + and then checks scaling to and from tanh co-domain for low, middle and high value from that action space + :param low: (np.ndarray), (int) or (float) + :param high: (np.ndarray), (int) or (float) + :param scalar: (bool) Whether consider scalar range or wrap it into 1d vector + """ + + if scalar and (isinstance(low, float) or isinstance(low, int)): + ones = 1. + action_space = Box(low, high, shape=(1,)) + else: + low = np.atleast_1d(low) + high = np.atleast_1d(high) + ones = np.ones_like(low) + action_space = Box(low, high) + + mid = 0.5 * (low + high) + + expected_mapping = [(low, -ones), (mid, 0. * ones), (high, ones)] + + for (not_scaled, scaled) in expected_mapping: + assert np.allclose(scale_action(action_space, not_scaled), scaled) + assert np.allclose(unscale_action(action_space, scaled), not_scaled) + + +def test_batch_shape_invariant_to_scaling(): + """ + test that scaling deals well with batches as tensors and numpy matrices in terms of shape + """ + action_space = Box(np.array([-10., -5., -1.]), np.array([10., 3., 2.])) + + tensor = tf.constant(1., shape=[2, 3]) + matrix = np.ones((2, 3)) + + assert scale_action(action_space, tensor).shape == (2, 3) + assert scale_action(action_space, matrix).shape == (2, 3) + + assert unscale_action(action_space, tensor).shape == (2, 3) + assert unscale_action(action_space, matrix).shape == (2, 3) diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 42408a523a..90a915afc1 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -6,6 +6,7 @@ import gym from stable_baselines.bench import Monitor +from stable_baselines.bench.monitor import get_monitor_files, load_results def test_monitor(): @@ -34,3 +35,52 @@ def test_monitor(): assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" file_handler.close() os.remove(mon_file) + +def test_monitor_load_results(tmp_path): + """ + test load_results on log files produced by the monitor wrapper + """ + tmp_path = str(tmp_path) + env1 = gym.make("CartPole-v1") + env1.seed(0) + monitor_file1 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())) + monitor_env1 = Monitor(env1, monitor_file1) + + monitor_files = get_monitor_files(tmp_path) + assert len(monitor_files) == 1 + assert monitor_file1 in monitor_files + + monitor_env1.reset() + episode_count1 = 0 + for _ in range(1000): + _, _, done, _ = monitor_env1.step(monitor_env1.action_space.sample()) + if done: + episode_count1 += 1 + monitor_env1.reset() + + results_size1 = len(load_results(os.path.join(tmp_path)).index) + assert results_size1 == episode_count1 + + env2 = gym.make("CartPole-v1") + env2.seed(0) + monitor_file2 = os.path.join(tmp_path, "stable_baselines-test-{}.monitor.csv".format(uuid.uuid4())) + monitor_env2 = Monitor(env2, monitor_file2) + monitor_files = get_monitor_files(tmp_path) + assert len(monitor_files) 
== 2 + assert monitor_file1 in monitor_files + assert monitor_file2 in monitor_files + + monitor_env2.reset() + episode_count2 = 0 + for _ in range(1000): + _, _, done, _ = monitor_env2.step(monitor_env2.action_space.sample()) + if done: + episode_count2 += 1 + monitor_env2.reset() + + results_size2 = len(load_results(os.path.join(tmp_path)).index) + + assert results_size2 == (results_size1 + episode_count2) + + os.remove(monitor_file1) + os.remove(monitor_file2) diff --git a/tests/test_mpi_adam.py b/tests/test_mpi_adam.py index 73dc9fb77e..608df0b2ac 100644 --- a/tests/test_mpi_adam.py +++ b/tests/test_mpi_adam.py @@ -1,8 +1,11 @@ import subprocess +import pytest + from .test_common import _assert_eq +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_mpi_adam(): """Test RunningMeanStd object for MPI""" return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', @@ -10,6 +13,7 @@ def test_mpi_adam(): _assert_eq(return_code, 0) +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_mpi_adam_ppo1(): """Running test for ppo1""" return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2', diff --git a/tests/test_ppo2.py b/tests/test_ppo2.py index 43c2a88d0c..a8a3e4ff4a 100644 --- a/tests/test_ppo2.py +++ b/tests/test_ppo2.py @@ -2,9 +2,10 @@ import pytest -from stable_baselines import PPO2 +# from stable_baselines import PPO2 +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") @pytest.mark.parametrize("cliprange", [0.2, lambda x: 0.1 * x]) @pytest.mark.parametrize("cliprange_vf", [None, 0.2, lambda x: 0.3 * x, -1.0]) def test_clipping(cliprange, cliprange_vf): diff --git a/tests/test_save.py b/tests/test_save.py index 7be7edd121..494e9448a1 100644 --- a/tests/test_save.py +++ b/tests/test_save.py @@ -6,23 +6,24 @@ import pytest import numpy as np -from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO +# from stable_baselines import A2C, ACER, ACKTR, DQN, PPO1, PPO2, TRPO from stable_baselines.common.identity_env import IdentityEnv from stable_baselines.common.vec_env import DummyVecEnv from stable_baselines.common.evaluation import evaluate_policy -from stable_baselines.common.policies import MlpPolicy, FeedForwardPolicy +# from stable_baselines.common.policies import MlpPolicy, FeedForwardPolicy N_EVAL_EPISODES = 100 -MODEL_LIST = [ - A2C, - ACER, - ACKTR, - DQN, - PPO1, - PPO2, - TRPO, -] +# MODEL_LIST = [ +# A2C, +# ACER, +# ACKTR, +# DQN, +# PPO1, +# PPO2, +# TRPO, +# ] +MODEL_LIST = [] STORE_METHODS = [ "path", @@ -35,6 +36,7 @@ ] @pytest.mark.slow +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") @pytest.mark.parametrize("model_class", MODEL_LIST) @pytest.mark.parametrize("storage_method", STORE_METHODS) @pytest.mark.parametrize("store_format", STORE_FORMAT) @@ -124,12 +126,12 @@ def test_model_manipulation(request, model_class, storage_method, store_format): if os.path.exists(model_fname): os.remove(model_fname) -class CustomMlpPolicy(FeedForwardPolicy): - """A dummy "custom" policy to test out custom_objects""" - def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs): - super(CustomMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, - n_batch, reuse, feature_extraction="mlp", - **_kwargs) +# class CustomMlpPolicy(FeedForwardPolicy): +# """A dummy "custom" policy to test out custom_objects""" +# def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, 
**_kwargs): +# super(CustomMlpPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, +# n_batch, reuse, feature_extraction="mlp", +# **_kwargs) @pytest.mark.parametrize("model_class", MODEL_LIST) diff --git a/tests/test_tensorboard.py b/tests/test_tensorboard.py index a4b675268a..4f02dcb57e 100644 --- a/tests/test_tensorboard.py +++ b/tests/test_tensorboard.py @@ -3,25 +3,27 @@ import pytest -from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TD3, TRPO +# from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TD3, TRPO TENSORBOARD_DIR = '/tmp/tb_dir/' if os.path.isdir(TENSORBOARD_DIR): shutil.rmtree(TENSORBOARD_DIR) -MODEL_DICT = { - 'a2c': (A2C, 'CartPole-v1'), - 'acer': (ACER, 'CartPole-v1'), - 'acktr': (ACKTR, 'CartPole-v1'), - 'dqn': (DQN, 'CartPole-v1'), - 'ddpg': (DDPG, 'Pendulum-v0'), - 'ppo1': (PPO1, 'CartPole-v1'), - 'ppo2': (PPO2, 'CartPole-v1'), - 'sac': (SAC, 'Pendulum-v0'), - 'td3': (TD3, 'Pendulum-v0'), - 'trpo': (TRPO, 'CartPole-v1'), -} +# MODEL_DICT = { +# 'a2c': (A2C, 'CartPole-v1'), +# 'acer': (ACER, 'CartPole-v1'), +# 'acktr': (ACKTR, 'CartPole-v1'), +# 'dqn': (DQN, 'CartPole-v1'), +# 'ddpg': (DDPG, 'Pendulum-v0'), +# 'ppo1': (PPO1, 'CartPole-v1'), +# 'ppo2': (PPO2, 'CartPole-v1'), +# 'sac': (SAC, 'Pendulum-v0'), +# 'td3': (TD3, 'Pendulum-v0'), +# 'trpo': (TRPO, 'CartPole-v1'), +# } + +MODEL_DICT = {} N_STEPS = 1000 diff --git a/tests/test_tf_util.py b/tests/test_tf_util.py index d71374da03..22fb0bcd04 100644 --- a/tests/test_tf_util.py +++ b/tests/test_tf_util.py @@ -1,10 +1,12 @@ # tests for tf_util +import pytest import numpy as np import tensorflow as tf from stable_baselines.common.tf_util import function, initialize, single_threaded_session, is_image +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_function(): """ test the function function in tf_util @@ -22,6 +24,7 @@ def test_function(): assert linear_fn(2, 2) == 10 +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_multikwargs(): """ test the function function in tf_util @@ -38,7 +41,8 @@ def test_multikwargs(): assert linear_fn(2) == 6 assert linear_fn(2, 2) == 10 - + +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_image_detection(): rgb = (32, 64, 3) gray = (43, 23, 1) diff --git a/tests/test_utils.py b/tests/test_utils.py index d277471137..6febf1239c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -4,7 +4,7 @@ import pytest import gym -from stable_baselines import A2C +# from stable_baselines import A2C from stable_baselines.bench.monitor import Monitor from stable_baselines.common.evaluation import evaluate_policy from stable_baselines.common.cmd_util import make_vec_env @@ -56,6 +56,7 @@ def test_custom_vec_env(): make_vec_env('CartPole-v1', n_envs=1, vec_env_kwargs={'dummy': False}) +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_evaluate_policy(): model = A2C('MlpPolicy', 'Pendulum-v0', seed=0) n_steps_per_episode, n_eval_episodes = 200, 2 diff --git a/tests/test_vec_normalize.py b/tests/test_vec_normalize.py index 9c70482f4d..e921d2eed7 100644 --- a/tests/test_vec_normalize.py +++ b/tests/test_vec_normalize.py @@ -1,5 +1,6 @@ import subprocess +import pytest import gym import numpy as np @@ -15,6 +16,7 @@ def make_env(): return gym.make(ENV_ID) +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") def test_runningmeanstd(): """Test RunningMeanStd object""" for (x_1, x_2, x_3) in [ @@ 
-76,6 +78,55 @@ def test_vec_env(tmpdir): check_vec_norm_equal(norm_venv, deserialized) +def _make_warmstart_cartpole(): + """Warm-start VecNormalize by stepping through CartPole""" + venv = DummyVecEnv([lambda: gym.make("CartPole-v1")]) + venv = VecNormalize(venv) + venv.reset() + venv.get_original_obs() + + for _ in range(100): + actions = [venv.action_space.sample()] + venv.step(actions) + return venv + + +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") +def test_get_original(): + venv = _make_warmstart_cartpole() + for _ in range(3): + actions = [venv.action_space.sample()] + obs, rewards, _, _ = venv.step(actions) + obs = obs[0] + orig_obs = venv.get_original_obs()[0] + rewards = rewards[0] + orig_rewards = venv.get_original_reward()[0] + + assert np.all(orig_rewards == 1) + assert orig_obs.shape == obs.shape + assert orig_rewards.dtype == rewards.dtype + assert not np.array_equal(orig_obs, obs) + assert not np.array_equal(orig_rewards, rewards) + np.testing.assert_allclose(venv.normalize_obs(orig_obs), obs) + np.testing.assert_allclose(venv.normalize_reward(orig_rewards), rewards) + + +@pytest.mark.skip(reason="Not supported yet, tf2 migration in progress") +def test_normalize_external(): + venv = _make_warmstart_cartpole() + + rewards = np.array([1, 1]) + norm_rewards = venv.normalize_reward(rewards) + assert norm_rewards.shape == rewards.shape + # Episode return is almost always >= 1 in CartPole. So reward should shrink. + assert np.all(norm_rewards < 1) + + # Don't have any guarantees on obs normalization, except shape, really. + obs = np.array([0, 0, 0, 0]) + norm_obs = venv.normalize_obs(obs) + assert obs.shape == norm_obs.shape + + def test_mpi_runningmeanstd(): """Test RunningMeanStd object for MPI""" return_code = subprocess.call(['mpirun', '--allow-run-as-root', '-np', '2',