From ef50b5b5e018e7c6e751ce1a4b89f60fbf651a47 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 22:04:02 +0300
Subject: [PATCH 01/44] DQN for flow

---
 .../multiagent_traffic_light_grid.py        | 10 ++---
 examples/train.py                           | 40 ++++++++++---------
 flow/envs/multiagent/traffic_light_grid.py  |  4 +-
 3 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index b8293f638..5cfea98b2 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -1,6 +1,6 @@
 """Multi-agent traffic light example (single shared policy)."""
 
-from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
+from ray.rllib.agents.ppo.ppo_policy import DQNTFPolicy
 from flow.envs.multiagent import MultiTrafficLightGridPOEnv
 from flow.networks import TrafficLightGridNetwork
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams
@@ -88,7 +88,7 @@
         "target_velocity": 50,
         "switch_time": 3,
         "num_observed": 2,
-        "discrete": False,
+        "discrete": True,
         "tl_type": "actuated",
         "num_local_edges": 4,
         "num_local_lights": 4,
@@ -140,8 +140,8 @@
 def gen_policy():
-    """Generate a policy in RLlib."""
-    return PPOTFPolicy, obs_space, act_space, {}
+    """Generate a policy in DQN."""
+    return DQNTFPolicy, obs_space, act_space, {}
 
 
 # Setup PG with a single policy graph for all agents
@@ -149,7 +149,7 @@ def gen_policy():
 def policy_mapping_fn(_):
-    """Map a policy in RLlib."""
+    """Map a policy in DQN."""
     return 'av'

diff --git a/examples/train.py b/examples/train.py
index 1b2f22476..47f2fd93f 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -40,8 +40,8 @@ def parse_args(args):
     # optional input parameters
     parser.add_argument(
-        '--rl_trainer', type=str, default="rllib",
-        help='the RL trainer to use. either rllib or Stable-Baselines')
+        '--rl_trainer', type=str, default="DQN",
+        help='the RL trainer to use. DQN')
     parser.add_argument(
         '--num_cpus', type=int, default=1,
@@ -98,13 +98,13 @@ def run_model_stablebaseline(flow_params,
     return train_model
 
 
-def setup_exps_rllib(flow_params,
+def setup_exps_dqn(flow_params,
                      n_cpus,
                      n_rollouts,
                      policy_graphs=None,
                      policy_mapping_fn=None,
                      policies_to_train=None):
-    """Return the relevant components of an RLlib experiment.
+    """Return the relevant components of an DQN experiment.
 
     Parameters
     ----------
@@ -139,20 +139,22 @@ def setup_exps_rllib(flow_params,
     horizon = flow_params['env'].horizon
 
-    alg_run = "PPO"
+    alg_run = "DQN"
 
     agent_cls = get_agent_class(alg_run)
     config = deepcopy(agent_cls._default_config)
 
     config["num_workers"] = n_cpus
     config["train_batch_size"] = horizon * n_rollouts
-    config["gamma"] = 0.999 # discount rate
-    config["model"].update({"fcnet_hiddens": [32, 32, 32]})
-    config["use_gae"] = True
-    config["lambda"] = 0.97
-    config["kl_target"] = 0.02
-    config["num_sgd_iter"] = 10
+    config['clip_actions'] = False
     config["horizon"] = horizon
+    config["timesteps_per_iteration"] = horizon * n_rollouts
+    config["hiddens"] = [512]
+    config["lr"] = 0.0000625 # TODO: hp tune
+    config["grad_norm_clipping"] = 40 # TODO: hp tune
+    config["schedule_max_timesteps"] = 2000000 # TODO: maybe try 5e5, 1e6
+    config["buffer_size"] = 1000000 # TODO: maybe try 1e5, 5e5
+    config["target_network_update_freq"] = 8000 # TODO: this is too small
 
     # save the flow params for replay
     flow_json = json.dumps(
@@ -177,8 +179,8 @@ def setup_exps_rllib(flow_params,
     return alg_run, gym_name, config
 
 
-def train_rllib(submodule, flags):
-    """Train policies using the PPO algorithm in RLlib."""
+def train_DQN(submodule, flags):
+    """Train policies using the DQN algorithm in DQN."""
     import ray
     from ray.tune import run_experiments
@@ -189,7 +191,7 @@ def train_rllib(submodule, flags):
     policy_mapping_fn = getattr(submodule, "policy_mapping_fn", None)
     policies_to_train = getattr(submodule, "policies_to_train", None)
 
-    alg_run, gym_name, config = setup_exps_rllib(
+    alg_run, gym_name, config = setup_exps_dqn(
         flow_params, n_cpus, n_rollouts,
         policy_graphs, policy_mapping_fn, policies_to_train)
@@ -379,24 +381,24 @@ def main(args):
         multiagent = False
     elif hasattr(module_ma, flags.exp_config):
         submodule = getattr(module_ma, flags.exp_config)
-        assert flags.rl_trainer.lower() in ["rllib", "h-baselines"], \
+        assert flags.rl_trainer.lower() in ["dqn", "h-baselines"], \
             "Currently, multiagent experiments are only supported through "\
-            "RLlib. Try running this experiment using RLlib: " \
+            "DQN. Try running this experiment using DQN: " \
             "'python train.py EXP_CONFIG'"
         multiagent = True
     else:
         raise ValueError("Unable to find experiment config.")
 
     # Perform the training operation.
-    if flags.rl_trainer.lower() == "rllib":
-        train_rllib(submodule, flags)
+    if flags.rl_trainer.lower() == "dqn":
+        train_dqn(submodule, flags)
     elif flags.rl_trainer.lower() == "stable-baselines":
         train_stable_baselines(submodule, flags)
     elif flags.rl_trainer.lower() == "h-baselines":
         flow_params = submodule.flow_params
         train_h_baselines(flow_params, args, multiagent)
     else:
-        raise ValueError("rl_trainer should be either 'rllib', 'h-baselines', "
+        raise ValueError("rl_trainer should be either 'dqn', 'h-baselines', "
                          "or 'stable-baselines'.")

diff --git a/flow/envs/multiagent/traffic_light_grid.py b/flow/envs/multiagent/traffic_light_grid.py
index a0438f828..0f4ad7bb3 100644
--- a/flow/envs/multiagent/traffic_light_grid.py
+++ b/flow/envs/multiagent/traffic_light_grid.py
@@ -79,7 +79,7 @@ def observation_space(self):
     def action_space(self):
         """See class definition."""
         if self.discrete:
-            return Discrete(2)
+            return Discrete(3)
         else:
             return Box(
                 low=-1,
@@ -208,7 +208,7 @@ def _apply_rl_actions(self, rl_actions):
         for rl_id, rl_action in rl_actions.items():
             i = int(rl_id.split("center")[ID_IDX])
             if self.discrete:
-                raise NotImplementedError
+                action = rl_action
             else:
                 # convert values less than 0.0 to zero and above to 1. 0's
                 # indicate that we should not switch the direction
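DQN only handles discrete action spaces, which is why this first patch switches the grid environment to discrete mode and fills in the previously unimplemented discrete branch of `_apply_rl_actions`. A minimal sketch of the resulting control flow for one traffic-light agent (the thresholding line in the continuous branch is taken from the surrounding Flow code rather than from this hunk, and the action values shown assume the binary per-agent space that later commits in this series settle on):

    # Illustrative sketch, not part of the patch: how one agent's action is applied.
    if self.discrete:
        action = rl_action        # DQN picks an integer directly, e.g. 0 = keep phase, 1 = switch
    else:
        action = rl_action > 0.0  # continuous policies (e.g. PPO) are thresholded at zero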
From c5c5ed251eaac8294859e2ba5f4f11c971fcdbd3 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 22:21:10 +0300
Subject: [PATCH 02/44] correct typo

---
 examples/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/train.py b/examples/train.py
index 47f2fd93f..632c6c70a 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -179,7 +179,7 @@ def setup_exps_dqn(flow_params,
     return alg_run, gym_name, config
 
 
-def train_DQN(submodule, flags):
+def train_dqn(submodule, flags):
     """Train policies using the DQN algorithm in DQN."""
     import ray
     from ray.tune import run_experiments

From 74cbd10aa513f959a7fc281c09ddd4de39aac215 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 22:25:21 +0300
Subject: [PATCH 03/44] rm trailing space

---
 examples/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/train.py b/examples/train.py
index 632c6c70a..5ba36f4ff 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -146,7 +146,7 @@ def setup_exps_dqn(flow_params,
     config["num_workers"] = n_cpus
     config["train_batch_size"] = horizon * n_rollouts
-    config['clip_actions'] = False 
+    config['clip_actions'] = False
     config["horizon"] = horizon
     config["timesteps_per_iteration"] = horizon * n_rollouts

From 0829e9e59222efe7d6071343c80c39a07f00bc6a Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 22:31:48 +0300
Subject: [PATCH 04/44] add reference for dqn parameters setup

---
 examples/train.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/train.py b/examples/train.py
index 5ba36f4ff..27266b65a 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -149,12 +149,12 @@ def setup_exps_dqn(flow_params,
     config['clip_actions'] = False
     config["horizon"] = horizon
     config["timesteps_per_iteration"] = horizon * n_rollouts
-    config["hiddens"] = [512]
-    config["lr"] = 0.0000625 # TODO: hp tune
-    config["grad_norm_clipping"] = 40 # TODO: hp tune
-    config["schedule_max_timesteps"] = 2000000 # TODO: maybe try 5e5, 1e6
-    config["buffer_size"] = 1000000 # TODO: maybe try 1e5, 5e5
-    config["target_network_update_freq"] = 8000 # TODO: this is too small
+    #https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-dist-dqn.yaml
+    config["hiddens"] = [512] 
+    config["lr"] = 0.0000625
+    config["schedule_max_timesteps"] = 2000000
+    config["buffer_size"] = 1000000
+    config["target_network_update_freq"] = 8000
 
     # save the flow params for replay
     flow_json = json.dumps(
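The retained values follow the tuned Atari distributional-DQN example linked in the new comment. As a rough, self-contained sketch of what this configuration amounts to if handed directly to a DQN trainer (assuming an RLlib release of this era in which `DQNTrainer` and these config keys exist; the horizon, rollout count and environment id below are illustrative placeholders for the values computed inside `setup_exps_dqn`):

    from ray.rllib.agents.dqn import DQNTrainer  # assumed import path for older RLlib

    horizon, n_rollouts = 400, 63       # placeholders; train.py derives these from flow_params
    config = {
        "hiddens": [512],                    # fully connected head on top of the model
        "lr": 0.0000625,
        "schedule_max_timesteps": 2000000,   # exploration annealing horizon
        "buffer_size": 1000000,              # replay buffer capacity
        "target_network_update_freq": 8000,
        "timesteps_per_iteration": horizon * n_rollouts,
        "horizon": horizon,
        "clip_actions": False,
    }
    trainer = DQNTrainer(env="flow_traffic_light_grid-v0", config=config)  # placeholder env id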
From b2628e24cc9a28883f5d03886aade645e2c1c619 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 22:35:02 +0300
Subject: [PATCH 05/44] Update train.py

---
 examples/train.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/train.py b/examples/train.py
index 27266b65a..307ff2960 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -99,11 +99,11 @@ def run_model_stablebaseline(flow_params,
 def setup_exps_dqn(flow_params,
-                     n_cpus,
-                     n_rollouts,
-                     policy_graphs=None,
-                     policy_mapping_fn=None,
-                     policies_to_train=None):
+                   n_cpus,
+                   n_rollouts,
+                   policy_graphs=None,
+                   policy_mapping_fn=None,
+                   policies_to_train=None):
     """Return the relevant components of an DQN experiment.
 
     Parameters

From b59728ee84926e79cf2b3410d326a39f6d2a61cd Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 22:35:50 +0300
Subject: [PATCH 06/44] Update train.py

---
 examples/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/train.py b/examples/train.py
index 307ff2960..cae58d562 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -150,7 +150,7 @@ def setup_exps_dqn(flow_params,
     config["horizon"] = horizon
     config["timesteps_per_iteration"] = horizon * n_rollouts
     #https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-dist-dqn.yaml
-    config["hiddens"] = [512] 
+    config["hiddens"] = [512]
     config["lr"] = 0.0000625
     config["schedule_max_timesteps"] = 2000000
     config["buffer_size"] = 1000000
     config["target_network_update_freq"] = 8000

From d66f1ce149477d6a45e1e80518b7ef64ee5eecb2 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 22:40:00 +0300
Subject: [PATCH 07/44] Update train.py

---
 examples/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/train.py b/examples/train.py
index cae58d562..1f0a1d9f3 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -149,7 +149,7 @@ def setup_exps_dqn(flow_params,
     config['clip_actions'] = False
     config["horizon"] = horizon
     config["timesteps_per_iteration"] = horizon * n_rollouts
-    #https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-dist-dqn.yaml
+    # https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-dist-dqn.yaml
     config["hiddens"] = [512]
     config["lr"] = 0.0000625
     config["schedule_max_timesteps"] = 2000000
     config["buffer_size"] = 1000000
     config["target_network_update_freq"] = 8000

From 6751b56f07a33d51b8a9aeae8ecaf596228d7384 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 23:03:58 +0300
Subject: [PATCH 08/44] fix import

---
 .../exp_configs/rl/multiagent/multiagent_traffic_light_grid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index 5cfea98b2..a35041bdd 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -1,6 +1,6 @@
 """Multi-agent traffic light example (single shared policy)."""
 
-from ray.rllib.agents.ppo.ppo_policy import DQNTFPolicy
+from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
 from flow.envs.multiagent import MultiTrafficLightGridPOEnv
 from flow.networks import TrafficLightGridNetwork
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams

From 72f9fca127eb921128aefd26938030e6f942a78a Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Thu, 11 Jun 2020 23:31:13 +0300
Subject: [PATCH 09/44] add rllib back to avoid test error

---
 examples/train.py | 129 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 123 insertions(+), 6 deletions(-)

diff --git a/examples/train.py b/examples/train.py
index 1f0a1d9f3..82a53f05a 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -1,6 +1,6 @@
 """Runner script for single and multi-agent reinforcement learning experiments.
 
-This script performs an RL experiment using the PPO algorithm. Choice of
+This script performs an RL experiment using the DQN algorithm. Choice of
 hyperparameters can be seen and adjusted from the code below.
 
 Usage
@@ -40,8 +40,8 @@ def parse_args(args):
     # optional input parameters
     parser.add_argument(
-        '--rl_trainer', type=str, default="DQN",
-        help='the RL trainer to use. DQN')
+        '--rl_trainer', type=str, default="dqn",
+        help='the RL trainer to use. either dqn or rllib or Stable-Baselines')
     parser.add_argument(
         '--num_cpus', type=int, default=1,
         help='How many CPUs to use')
@@ -98,6 +98,121 @@ def run_model_stablebaseline(flow_params,
     return train_model
 
 
+def setup_exps_rllib(flow_params,
+                     n_cpus,
+                     n_rollouts,
+                     policy_graphs=None,
+                     policy_mapping_fn=None,
+                     policies_to_train=None):
+    """Return the relevant components of an RLlib experiment.
+
+    Parameters
+    ----------
+    flow_params : dict
+        flow-specific parameters (see flow/utils/registry.py)
+    n_cpus : int
+        number of CPUs to run the experiment over
+    n_rollouts : int
+        number of rollouts per training iteration
+    policy_graphs : dict, optional
+        TODO
+    policy_mapping_fn : function, optional
+        TODO
+    policies_to_train : list of str, optional
+        TODO
+
+    Returns
+    -------
+    str
+        name of the training algorithm
+    str
+        name of the gym environment to be trained
+    dict
+        training configuration parameters
+    """
+    from ray import tune
+    from ray.tune.registry import register_env
+    try:
+        from ray.rllib.agents.agent import get_agent_class
+    except ImportError:
+        from ray.rllib.agents.registry import get_agent_class
+
+    horizon = flow_params['env'].horizon
+
+    alg_run = "PPO"
+
+    agent_cls = get_agent_class(alg_run)
+    config = deepcopy(agent_cls._default_config)
+
+    config["num_workers"] = n_cpus
+    config["train_batch_size"] = horizon * n_rollouts
+    config["gamma"] = 0.999 # discount rate
+    config["model"].update({"fcnet_hiddens": [32, 32, 32]})
+    config["use_gae"] = True
+    config["lambda"] = 0.97
+    config["kl_target"] = 0.02
+    config["num_sgd_iter"] = 10
+    config["horizon"] = horizon
+
+    # save the flow params for replay
+    flow_json = json.dumps(
+        flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
+    config['env_config']['flow_params'] = flow_json
+    config['env_config']['run'] = alg_run
+
+    # multiagent configuration
+    if policy_graphs is not None:
+        print("policy_graphs", policy_graphs)
+        config['multiagent'].update({'policies': policy_graphs})
+    if policy_mapping_fn is not None:
+        config['multiagent'].update(
+            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
+    if policies_to_train is not None:
+        config['multiagent'].update({'policies_to_train': policies_to_train})
+
+    create_env, gym_name = make_create_env(params=flow_params)
+
+    # Register as rllib env
+    register_env(gym_name, create_env)
+    return alg_run, gym_name, config
+
+
+def train_rllib(submodule, flags):
+    """Train policies using the PPO algorithm in RLlib."""
+    import ray
+    from ray.tune import run_experiments
+
+    flow_params = submodule.flow_params
+    n_cpus = submodule.N_CPUS
+    n_rollouts = submodule.N_ROLLOUTS
+    policy_graphs = getattr(submodule, "POLICY_GRAPHS", None)
+    policy_mapping_fn = getattr(submodule, "policy_mapping_fn", None)
+    policies_to_train = getattr(submodule, "policies_to_train", None)
+
+    alg_run, gym_name, config = setup_exps_rllib(
+        flow_params, n_cpus, n_rollouts,
+        policy_graphs, policy_mapping_fn, policies_to_train)
+
+    ray.init(num_cpus=n_cpus + 1, object_store_memory=200 * 1024 * 1024)
+    exp_config = {
+        "run": alg_run,
+        "env": gym_name,
+        "config": {
+            **config
+        },
+        "checkpoint_freq": 20,
+        "checkpoint_at_end": True,
+        "max_failures": 999,
+        "stop": {
+            "training_iteration": flags.num_steps,
+        },
+    }
+
+    if flags.checkpoint_path is not None:
+        exp_config['restore'] = flags.checkpoint_path
+    run_experiments({flow_params["exp_tag"]: exp_config})
+
+
 def setup_exps_dqn(flow_params,
                    n_cpus,
                    n_rollouts,
@@ -379,9 +496,9 @@ def main(args):
         multiagent = False
     elif hasattr(module_ma, flags.exp_config):
         submodule = getattr(module_ma, flags.exp_config)
-        assert flags.rl_trainer.lower() in ["dqn", "h-baselines"], \
+        assert flags.rl_trainer.lower() in ["dqn", "rllib", "h-baselines"], \
             "Currently, multiagent experiments are only supported through "\
-            "DQN. Try running this experiment using DQN: " \
+            "DQN or RLlib. Try running this experiment using DQN or RLlib: " \
             "'python train.py EXP_CONFIG'"
         multiagent = True
     else:
         raise ValueError("Unable to find experiment config.")
@@ -392,13 +507,15 @@ def main(args):
     # Perform the training operation.
     if flags.rl_trainer.lower() == "dqn":
         train_dqn(submodule, flags)
+    elif flags.rl_trainer.lower() == "rllib":
+        train_rllib(submodule, flags)
     elif flags.rl_trainer.lower() == "stable-baselines":
         train_stable_baselines(submodule, flags)
     elif flags.rl_trainer.lower() == "h-baselines":
         flow_params = submodule.flow_params
         train_h_baselines(flow_params, args, multiagent)
     else:
-        raise ValueError("rl_trainer should be either 'dqn', 'h-baselines', "
+        raise ValueError("rl_trainer should be either 'dqn', 'rllib', 'h-baselines', "
                          "or 'stable-baselines'.")

From 6751b56f07a33d51b8a9aeae8ecaf596228d7384 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 00:13:46 +0300
Subject: [PATCH 10/44] change default test to dqn

---
 tests/fast_tests/test_examples.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/fast_tests/test_examples.py b/tests/fast_tests/test_examples.py
index 0b385f28a..bb5b999f4 100644
--- a/tests/fast_tests/test_examples.py
+++ b/tests/fast_tests/test_examples.py
@@ -30,6 +30,7 @@
 from examples.train import parse_args as parse_train_args
 from examples.train import run_model_stablebaseline as run_stable_baselines_model
 from examples.train import setup_exps_rllib as setup_rllib_exps
+from examples.train import setup_exps_dqn as setup_dqn_exps
 from examples.train import train_h_baselines
 from examples.exp_configs.non_rl.bay_bridge import flow_params as non_rl_bay_bridge
@@ -168,7 +169,7 @@ def test_parse_args(self):
         self.assertDictEqual(vars(args), {
             'exp_config': 'exp_config',
-            'rl_trainer': 'rllib',
+            'rl_trainer': 'dqn',
             'num_cpus': 1,
             'num_steps': 5000,
             'rollout_size': 1000,

From 4c3d9d4f345b5930bae9609dcfb6bccd862e0024 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 00:29:18 +0300
Subject: [PATCH 11/44] add TestDQNExamples for traffic light grid examples

---
 tests/fast_tests/test_examples.py | 57 +++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/fast_tests/test_examples.py b/tests/fast_tests/test_examples.py
index bb5b999f4..444463299 100644
--- a/tests/fast_tests/test_examples.py
+++ b/tests/fast_tests/test_examples.py
@@ -435,6 +435,63 @@ def run_exp(flow_params, **kwargs):
         })
 
 
+class TestDQNExamples(unittest.TestCase):
+    """Tests the example traffic light scripts in examples/exp_configs/rl/singleagent and
+    examples/exp_configs/rl/multiagent for DQN.
+
+    This is done by running each experiment in that folder for five time-steps
+    and confirming that it completes one rollout with two workers.
+    # FIXME(ev) this test adds several minutes to the testing scheme
+    """
+    def setUp(self):
+        if not ray.is_initialized():
+            ray.init(num_cpus=1)
+
+    def test_singleagent_traffic_light_grid(self):
+        self.run_exp(singleagent_traffic_light_grid)
+
+    def test_multi_traffic_light_grid(self):
+        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICY_GRAPHS as mtlpg
+        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICIES_TO_TRAIN as mtlpt
+        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import policy_mapping_fn as mtlpmf
+
+        kwargs = {
+            "policy_graphs": mtlpg,
+            "policies_to_train": mtlpt,
+            "policy_mapping_fn": mtlpmf
+        }
+        self.run_exp(multiagent_traffic_light_grid, **kwargs)
+
+    @staticmethod
+    def run_exp(flow_params, **kwargs):
+        alg_run, env_name, config = setup_dqn_exps(flow_params, 1, 1, **kwargs)
+
+        try:
+            ray.init(num_cpus=1)
+        except Exception as e:
+            print("ERROR", e)
+        config['train_batch_size'] = 50
+        config['horizon'] = 50
+        config['sample_batch_size'] = 50
+        config['num_workers'] = 0
+        config['sgd_minibatch_size'] = 32
+
+        run_experiments({
+            'test': {
+                'run': alg_run,
+                'env': env_name,
+                'config': {
+                    **config
+                },
+
+                'checkpoint_freq': 1,
+                'stop': {
+                    'training_iteration': 1,
+                },
+            }
+        })
+
+
 if __name__ == '__main__':
     try:
         ray.init(num_cpus=1)

From 3bfb9e62d807fd13720dfcedaae69f00f3ac5b4b Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 01:04:00 +0300
Subject: [PATCH 12/44] rm light grid test for PPO

---
 tests/fast_tests/test_examples.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tests/fast_tests/test_examples.py b/tests/fast_tests/test_examples.py
index 444463299..ffa10c083 100644
--- a/tests/fast_tests/test_examples.py
+++ b/tests/fast_tests/test_examples.py
@@ -263,8 +263,8 @@ def setUp(self):
     def test_singleagent_figure_eight(self):
         self.run_exp(singleagent_figure_eight)
 
-    def test_singleagent_traffic_light_grid(self):
-        self.run_exp(singleagent_traffic_light_grid)
+    # def test_singleagent_traffic_light_grid(self):
+    #     self.run_exp(singleagent_traffic_light_grid)
 
     def test_singleagent_traffic_light_grid_inflows(self):
         pass  # FIXME
@@ -330,17 +330,17 @@ def test_multiagent_merge(self):
         }
         self.run_exp(multiagent_merge, **kwargs)
 
-    def test_multi_traffic_light_grid(self):
-        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICY_GRAPHS as mtlpg
-        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICIES_TO_TRAIN as mtlpt
-        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import policy_mapping_fn as mtlpmf
-
-        kwargs = {
-            "policy_graphs": mtlpg,
-            "policies_to_train": mtlpt,
-            "policy_mapping_fn": mtlpmf
-        }
-        self.run_exp(multiagent_traffic_light_grid, **kwargs)
+    # def test_multi_traffic_light_grid(self):
+    #     from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICY_GRAPHS as mtlpg
+    #     from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICIES_TO_TRAIN as mtlpt
+    #     from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import policy_mapping_fn as mtlpmf
+    #     kwargs = {
+    #         "policy_graphs": mtlpg,
+    #         "policies_to_train": mtlpt,
+    #         "policy_mapping_fn": mtlpmf
+    #     }
+    #     self.run_exp(multiagent_traffic_light_grid, **kwargs)
 
     def test_multi_highway(self):
         from examples.exp_configs.rl.multiagent.multiagent_highway import POLICY_GRAPHS as mhpg

From c6b2fd1bd42a82c16fc6b30edb99d9e15368994f Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 01:11:12 +0300
Subject: [PATCH 13/44] Update test_examples.py

---
 tests/fast_tests/test_examples.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/tests/fast_tests/test_examples.py b/tests/fast_tests/test_examples.py
index ffa10c083..8f5df6a22 100644
--- a/tests/fast_tests/test_examples.py
+++ b/tests/fast_tests/test_examples.py
@@ -263,9 +263,6 @@ def setUp(self):
     def test_singleagent_figure_eight(self):
         self.run_exp(singleagent_figure_eight)
 
-    # def test_singleagent_traffic_light_grid(self):
-    #     self.run_exp(singleagent_traffic_light_grid)
-
     def test_singleagent_traffic_light_grid_inflows(self):
         pass  # FIXME
@@ -327,18 +327,6 @@ def test_multiagent_merge(self):
         }
         self.run_exp(multiagent_merge, **kwargs)
 
-    # def test_multi_traffic_light_grid(self):
-    #     from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICY_GRAPHS as mtlpg
-    #     from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICIES_TO_TRAIN as mtlpt
-    #     from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import policy_mapping_fn as mtlpmf
-
-    #     kwargs = {
-    #         "policy_graphs": mtlpg,
-    #         "policies_to_train": mtlpt,
-    #         "policy_mapping_fn": mtlpmf
-    #     }
-    #     self.run_exp(multiagent_traffic_light_grid, **kwargs)
-
     def test_multi_highway(self):
         from examples.exp_configs.rl.multiagent.multiagent_highway import POLICY_GRAPHS as mhpg
         from examples.exp_configs.rl.multiagent.multiagent_highway import POLICIES_TO_TRAIN as mhpt

From 310084193cab43ce586940577f1e43767230fcd9 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 01:47:00 +0300
Subject: [PATCH 14/44] trial

---
 flow/envs/multiagent/traffic_light_grid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow/envs/multiagent/traffic_light_grid.py b/flow/envs/multiagent/traffic_light_grid.py
index 0f4ad7bb3..9cf3161b9 100644
--- a/flow/envs/multiagent/traffic_light_grid.py
+++ b/flow/envs/multiagent/traffic_light_grid.py
@@ -79,7 +79,7 @@ def observation_space(self):
     def action_space(self):
         """See class definition."""
         if self.discrete:
-            return Discrete(3)
+            return Discrete(2)
         else:
             return Box(
                 low=-1,

From f74eb50f551d4f360e41de596068ac80d40e928d Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 10:06:07 +0300
Subject: [PATCH 15/44] trial2

---
 flow/envs/multiagent/traffic_light_grid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow/envs/multiagent/traffic_light_grid.py b/flow/envs/multiagent/traffic_light_grid.py
index 9cf3161b9..3731ab5d8 100644
--- a/flow/envs/multiagent/traffic_light_grid.py
+++ b/flow/envs/multiagent/traffic_light_grid.py
@@ -79,7 +79,7 @@ def observation_space(self):
     def action_space(self):
         """See class definition."""
         if self.discrete:
-            return Discrete(2)
+            return Discrete(2 ** self.num_traffic_lights)
         else:
             return Box(
                 low=-1,

From 1cd579da9389b9e2fa697152f89ddece75ecf618 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 10:28:01 +0300
Subject: [PATCH 16/44] pass ignore_reinit_error=True

---
 examples/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/train.py b/examples/train.py
index 82a53f05a..c82c4b497 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -193,7 +193,7 @@ def train_rllib(submodule, flags):
         flow_params, n_cpus, n_rollouts,
         policy_graphs, policy_mapping_fn, policies_to_train)
 
-    ray.init(num_cpus=n_cpus + 1, object_store_memory=200 * 1024 * 1024)
+    ray.init(num_cpus=n_cpus + 1, ignore_reinit_error=True, object_store_memory=200 * 1024 * 1024)
     exp_config = {
         "run": alg_run,
         "env": gym_name,
@@ -310,7 +310,7 @@ def train_dqn(submodule, flags):
         flow_params, n_cpus, n_rollouts,
         policy_graphs, policy_mapping_fn, policies_to_train)
 
-    ray.init(num_cpus=n_cpus + 1, object_store_memory=200 * 1024 * 1024)
+    ray.init(num_cpus=n_cpus + 1, ignore_reinit_error=True, object_store_memory=200 * 1024 * 1024)
     exp_config = {
         "run": alg_run,
         "env": gym_name,

From 81a203efba7d4e0f0f0f56b99038562cddee6f39 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 10:52:23 +0300
Subject: [PATCH 17/44] pass ray.shutdown() before ray.init

---
 examples/train.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/train.py b/examples/train.py
index c82c4b497..58cd240c1 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -193,6 +193,7 @@ def train_rllib(submodule, flags):
         flow_params, n_cpus, n_rollouts,
         policy_graphs, policy_mapping_fn, policies_to_train)
 
+    ray.shutdown()
     ray.init(num_cpus=n_cpus + 1, ignore_reinit_error=True, object_store_memory=200 * 1024 * 1024)
     exp_config = {
         "run": alg_run,
@@ -310,6 +311,7 @@ def train_dqn(submodule, flags):
         flow_params, n_cpus, n_rollouts,
         policy_graphs, policy_mapping_fn, policies_to_train)
 
+    ray.shutdown()
    ray.init(num_cpus=n_cpus + 1, ignore_reinit_error=True, object_store_memory=200 * 1024 * 1024)
     exp_config = {
         "run": alg_run,
         "env": gym_name,

From e77358247966f5718a659778abbc085ce820a7cf Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 12 Jun 2020 13:55:11 +0300
Subject: [PATCH 18/44] Update traffic_light_grid.py

---
 flow/envs/multiagent/traffic_light_grid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flow/envs/multiagent/traffic_light_grid.py b/flow/envs/multiagent/traffic_light_grid.py
index 3731ab5d8..9cf3161b9 100644
--- a/flow/envs/multiagent/traffic_light_grid.py
+++ b/flow/envs/multiagent/traffic_light_grid.py
@@ -79,7 +79,7 @@ def observation_space(self):
     def action_space(self):
         """See class definition."""
         if self.discrete:
-            return Discrete(2 ** self.num_traffic_lights)
+            return Discrete(2)
         else:
             return Box(
                 low=-1,
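The action-space experiments above end where they started: each traffic-light agent keeps a binary action. For contrast, a small self-contained sketch of the two encodings involved (the exponential form mirrors the centralized single-agent grid environment, where one integer encodes a joint decision for every light; the grid size below is illustrative only):

    from gym.spaces import Discrete

    num_traffic_lights = 9                            # e.g. a 3x3 grid, illustrative only
    joint_space = Discrete(2 ** num_traffic_lights)   # centralized: one action covers all lights
    per_agent_space = Discrete(2)                     # multi-agent: each light either switches or holds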
From eb12328004ec87bf26a702e55d6f741eb41cab53 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Mon, 22 Jun 2020 12:20:38 +0300
Subject: [PATCH 19/44] update

---
 .../multiagent_traffic_light_grid.py                    | 2 +-
 .../rl/singleagent/singleagent_traffic_light_grid.py    | 3 ++-
 flow/envs/traffic_light_grid.py                         | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index a35041bdd..256198227 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -8,7 +8,7 @@
 from flow.controllers import SimCarFollowingController, GridRouter
 from ray.tune.registry import register_env
 from flow.utils.registry import make_create_env
-
+from flow.core.params import TrafficLightParams
 # Experiment parameters
 N_ROLLOUTS = 63  # number of rollouts per training iteration
 N_CPUS = 63  # number of parallel workers
diff --git a/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py b/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
index 085d26be9..70f340661 100644
--- a/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
@@ -145,7 +145,8 @@ def get_non_flow_params(enter_speed, add_net_params):
     'switch_time': 3.0,
     'num_observed': 2,
     'discrete': False,
-    'tl_type': 'controlled'
+    #'tl_type': 'controlled'
+    'tl_type': 'actuated'
 }
 
 additional_net_params = {
diff --git a/flow/envs/traffic_light_grid.py b/flow/envs/traffic_light_grid.py
index 8be0cb8a5..24f813ea7 100644
--- a/flow/envs/traffic_light_grid.py
+++ b/flow/envs/traffic_light_grid.py
@@ -19,7 +19,7 @@
     "switch_time": 2.0,
     # whether the traffic lights should be actuated by sumo or RL
     # options are "controlled" and "actuated"
-    "tl_type": "controlled",
+    "tl_type": "actuated",
     # determines whether the action space is meant to be discrete or continuous
     "discrete": False,
 }

From 97f75bba5894f8d575e960eebe8d8c0975965d52 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Mon, 22 Jun 2020 13:00:26 +0300
Subject: [PATCH 20/44] update

---
 .../multiagent_traffic_light_grid.py        |   9 +-
 .../singleagent_traffic_light_grid.py       |   1 -
 examples/train.py                           | 174 ++++--------------
 3 files changed, 41 insertions(+), 143 deletions(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index 256198227..636c7dfb1 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -1,5 +1,6 @@
 """Multi-agent traffic light example (single shared policy)."""
 
+from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
 from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
 from flow.envs.multiagent import MultiTrafficLightGridPOEnv
 from flow.networks import TrafficLightGridNetwork
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams
@@ -88,7 +89,7 @@
         "target_velocity": 50,
         "switch_time": 3,
         "num_observed": 2,
-        "discrete": True,
+        "discrete": True,  # set False for DQN
         "tl_type": "actuated",
         "num_local_edges": 4,
         "num_local_lights": 4,
@@ -140,8 +141,8 @@
 def gen_policy():
-    """Generate a policy in DQN."""
-    return DQNTFPolicy, obs_space, act_space, {}
+    """Generate a policy in RLlib."""
+    return PPOTFPolicy, obs_space, act_space, {}
 
 
 # Setup PG with a single policy graph for all agents
@@ -149,7 +150,7 @@ def gen_policy():
 def policy_mapping_fn(_):
-    """Map a policy in DQN."""
+    """Map a policy in RLlib."""
     return 'av'

diff --git a/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py b/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
index 70f340661..aaf97b02e 100644
--- a/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
@@ -145,7 +145,6 @@ def get_non_flow_params(enter_speed, add_net_params):
     'switch_time': 3.0,
     'num_observed': 2,
     'discrete': False,
-    #'tl_type': 'controlled'
     'tl_type': 'actuated'
 }

diff --git a/examples/train.py b/examples/train.py
index 58cd240c1..9f5211d5c 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -1,6 +1,6 @@
 """Runner script for single and multi-agent reinforcement learning experiments.
 
-This script performs an RL experiment using the DQN algorithm. Choice of
+This script performs an RL experiment using the PPO algorithm. Choice of
 hyperparameters can be seen and adjusted from the code below.
 
 Usage
@@ -40,9 +40,13 @@ def parse_args(args):
     # optional input parameters
     parser.add_argument(
-        '--rl_trainer', type=str, default="dqn",
-        help='the RL trainer to use. either dqn or rllib or Stable-Baselines')
-
+        '--rl_trainer', type=str, default="rllib",
+        help='the RL trainer to use. either rllib or Stable-Baselines')
+    parser.add_argument(
+        '--algorithm', type=str, default="PPO",
+        help='RL algorithm to use. Options are PPO and DQN '
+             ' right now.'
+    )
     parser.add_argument(
         '--num_cpus', type=int, default=1,
         help='How many CPUs to use')
@@ -101,9 +105,11 @@ def run_model_stablebaseline(flow_params,
 def setup_exps_rllib(flow_params,
                      n_cpus,
                      n_rollouts,
+                     flags,
                      policy_graphs=None,
                      policy_mapping_fn=None,
-                     policies_to_train=None):
+                     policies_to_train=None,
+                     ):
     """Return the relevant components of an RLlib experiment.
 
     Parameters
@@ -114,6 +120,8 @@ def setup_exps_rllib(flow_params,
         number of CPUs to run the experiment over
     n_rollouts : int
         number of rollouts per training iteration
+    flags:
+        custom arguments
     policy_graphs : dict, optional
         TODO
     policy_mapping_fn : function, optional
@@ -139,20 +147,32 @@ def setup_exps_rllib(flow_params,
     horizon = flow_params['env'].horizon
 
-    alg_run = "PPO"
+    alg_run = flags.algorithm.upper()
 
-    agent_cls = get_agent_class(alg_run)
-    config = deepcopy(agent_cls._default_config)
-
-    config["num_workers"] = n_cpus
-    config["train_batch_size"] = horizon * n_rollouts
-    config["gamma"] = 0.999  # discount rate
-    config["model"].update({"fcnet_hiddens": [32, 32, 32]})
-    config["use_gae"] = True
-    config["lambda"] = 0.97
-    config["kl_target"] = 0.02
-    config["num_sgd_iter"] = 10
-    config["horizon"] = horizon
+    if alg_run == "PPO":
+
+        config["gamma"] = 0.999  # discount rate
+        config["model"].update({"fcnet_hiddens": [32, 32, 32]})
+        config["use_gae"] = True
+        config["lambda"] = 0.97
+        config["kl_target"] = 0.02
+        config["num_sgd_iter"] = 10
+    elif alg_run == "DQN":
+        config['clip_actions'] = False
+        config["timesteps_per_iteration"] = horizon * n_rollouts
+        # https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-dist-dqn.yaml
+        config["hiddens"] = [512]
+        config["lr"] = 0.0000625
+        config["schedule_max_timesteps"] = 2000000
+        config["buffer_size"] = 1000000
+        config["target_network_update_freq"] = 8000
+
+    config["num_workers"] = n_cpus
+    config["train_batch_size"] = horizon * n_rollouts
+    config["horizon"] = horizon
 
     # save the flow params for replay
     flow_json = json.dumps(
@@ -177,8 +179,8 @@
-def train_rllib(submodule, flags):
-    """Train policies using the PPO algorithm in RLlib."""
+def train_rllib(submodule, flags):
+    """Train policies using the PPO algorithm in RLlib."""
     import ray
     from ray.tune import run_experiments
 
     flow_params = submodule.flow_params
     n_cpus = submodule.N_CPUS
     n_rollouts = submodule.N_ROLLOUTS
     policy_graphs = getattr(submodule, "POLICY_GRAPHS", None)
     policy_mapping_fn = getattr(submodule, "policy_mapping_fn", None)
     policies_to_train = getattr(submodule, "policies_to_train", None)
 
     alg_run, gym_name, config = setup_exps_rllib(
         flow_params, n_cpus, n_rollouts,
         policy_graphs, policy_mapping_fn, policies_to_train)
 
-    ray.shutdown()
-    ray.init(num_cpus=n_cpus + 1, ignore_reinit_error=True, object_store_memory=200 * 1024 * 1024)
-    exp_config = {
-        "run": alg_run,
-        "env": gym_name,
-        "config": {
-            **config
-        },
-        "checkpoint_freq": 20,
-        "checkpoint_at_end": True,
-        "max_failures": 999,
-        "stop": {
-            "training_iteration": flags.num_steps,
-        },
-    }
-
-    if flags.checkpoint_path is not None:
-        exp_config['restore'] = flags.checkpoint_path
-    run_experiments({flow_params["exp_tag"]: exp_config})
-
-
-def setup_exps_dqn(flow_params,
-                   n_cpus,
-                   n_rollouts,
-                   policy_graphs=None,
-                   policy_mapping_fn=None,
-                   policies_to_train=None):
-    """Return the relevant components of an DQN experiment.
-
-    Parameters
-    ----------
-    flow_params : dict
-        flow-specific parameters (see flow/utils/registry.py)
-    n_cpus : int
-        number of CPUs to run the experiment over
-    n_rollouts : int
-        number of rollouts per training iteration
-    policy_graphs : dict, optional
-        TODO
-    policy_mapping_fn : function, optional
-        TODO
-    policies_to_train : list of str, optional
-        TODO
-
-    Returns
-    -------
-    str
-        name of the training algorithm
-    str
-        name of the gym environment to be trained
-    dict
-        training configuration parameters
-    """
-    from ray import tune
-    from ray.tune.registry import register_env
-    try:
-        from ray.rllib.agents.agent import get_agent_class
-    except ImportError:
-        from ray.rllib.agents.registry import get_agent_class
-
-    horizon = flow_params['env'].horizon
-
-    alg_run = "DQN"
-
-    agent_cls = get_agent_class(alg_run)
-    config = deepcopy(agent_cls._default_config)
-
-    config["num_workers"] = n_cpus
-    config["train_batch_size"] = horizon * n_rollouts
-    config['clip_actions'] = False
-    config["horizon"] = horizon
-    config["timesteps_per_iteration"] = horizon * n_rollouts
-    # https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-dist-dqn.yaml
-    config["hiddens"] = [512]
-    config["lr"] = 0.0000625
-    config["schedule_max_timesteps"] = 2000000
-    config["buffer_size"] = 1000000
-    config["target_network_update_freq"] = 8000
-
-    # save the flow params for replay
-    flow_json = json.dumps(
-        flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
-    config['env_config']['flow_params'] = flow_json
-    config['env_config']['run'] = alg_run
-
-    # multiagent configuration
-    if policy_graphs is not None:
-        print("policy_graphs", policy_graphs)
-        config['multiagent'].update({'policies': policy_graphs})
-    if policy_mapping_fn is not None:
-        config['multiagent'].update(
-            {'policy_mapping_fn': tune.function(policy_mapping_fn)})
-    if policies_to_train is not None:
-        config['multiagent'].update({'policies_to_train': policies_to_train})
-
-    create_env, gym_name = make_create_env(params=flow_params)
-
-    # Register as rllib env
-    register_env(gym_name, create_env)
-    return alg_run, gym_name, config
-
-
-def train_dqn(submodule, flags):
-    """Train policies using the DQN algorithm in DQN."""
-    import ray
-    from ray.tune import run_experiments
-
-    flow_params = submodule.flow_params
-    n_cpus = submodule.N_CPUS
-    n_rollouts = submodule.N_ROLLOUTS
-    policy_graphs = getattr(submodule, "POLICY_GRAPHS", None)
-    policy_mapping_fn = getattr(submodule, "policy_mapping_fn", None)
-    policies_to_train = getattr(submodule, "policies_to_train", None)
-
-    alg_run, gym_name, config = setup_exps_dqn(
-        flow_params, n_cpus, n_rollouts,
-        policy_graphs, policy_mapping_fn, policies_to_train)
-
-    ray.shutdown()
     ray.init(num_cpus=n_cpus + 1, ignore_reinit_error=True, object_store_memory=200 * 1024 * 1024)
     exp_config = {
         "run": alg_run,
         "env": gym_name,
         "config": {
             **config
         },
         "checkpoint_freq": 20,
         "checkpoint_at_end": True,
         "max_failures": 999,
         "stop": {
             "training_iteration": flags.num_steps,
         },
     }
 
     if flags.checkpoint_path is not None:
         exp_config['restore'] = flags.checkpoint_path
     run_experiments({flow_params["exp_tag"]: exp_config})
@@ -398,16 +398,14 @@ def main(args):
         multiagent = False
     elif hasattr(module_ma, flags.exp_config):
         submodule = getattr(module_ma, flags.exp_config)
-        assert flags.rl_trainer.lower() in ["dqn", "rllib", "h-baselines"], \
+        assert flags.rl_trainer.lower() in ["rllib", "h-baselines"], \
             "Currently, multiagent experiments are only supported through "\
-            "DQN or RLlib. Try running this experiment using DQN or RLlib: " \
+            "RLlib. Try running this experiment using RLlib: " \
             "'python train.py EXP_CONFIG'"
         multiagent = True
     else:
         raise ValueError("Unable to find experiment config.")
 
     # Perform the training operation.
-    if flags.rl_trainer.lower() == "dqn":
-        train_dqn(submodule, flags)
-    elif flags.rl_trainer.lower() == "rllib":
+    if flags.rl_trainer.lower() == "rllib":
         train_rllib(submodule, flags)
     elif flags.rl_trainer.lower() == "stable-baselines":
         train_stable_baselines(submodule, flags)
     elif flags.rl_trainer.lower() == "h-baselines":
         flow_params = submodule.flow_params
         train_h_baselines(flow_params, args, multiagent)
     else:
-        raise ValueError("rl_trainer should be either 'dqn', 'rllib', 'h-baselines', "
+        raise ValueError("rl_trainer should be either 'rllib', 'h-baselines', "
                          "or 'stable-baselines'.")
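With the DQN settings folded into `setup_exps_rllib` behind the new `--algorithm` flag, a DQN run of the traffic-light-grid examples goes through the unchanged entry point. For instance (flag names as defined in `parse_args` above, and with `discrete` set to True in the chosen exp config, as its inline comment notes):

    python examples/train.py singleagent_traffic_light_grid --rl_trainer rllib --algorithm DQN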
From e7c7ea0e7848a884fa5f964d735aaf7d95671eab Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Mon, 22 Jun 2020 13:09:14 +0300
Subject: [PATCH 21/44] update

---
 examples/train.py                 |  4 +-
 tests/fast_tests/test_examples.py | 75 +++++++------------------------
 2 files changed, 17 insertions(+), 62 deletions(-)

diff --git a/examples/train.py b/examples/train.py
index 9f5211d5c..20dc00a82 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -44,9 +44,7 @@ def parse_args(args):
         help='the RL trainer to use. either rllib or Stable-Baselines')
     parser.add_argument(
         '--algorithm', type=str, default="PPO",
-        help='RL algorithm to use. Options are PPO and DQN '
-             ' right now.'
-    )
+        help='RL algorithm to use. Options are PPO and DQN right now.')
     parser.add_argument(
         '--num_cpus', type=int, default=1,
         help='How many CPUs to use')
diff --git a/tests/fast_tests/test_examples.py b/tests/fast_tests/test_examples.py
index 8f5df6a22..0b385f28a 100644
--- a/tests/fast_tests/test_examples.py
+++ b/tests/fast_tests/test_examples.py
@@ -30,7 +30,6 @@
 from examples.train import parse_args as parse_train_args
 from examples.train import run_model_stablebaseline as run_stable_baselines_model
 from examples.train import setup_exps_rllib as setup_rllib_exps
-from examples.train import setup_exps_dqn as setup_dqn_exps
 from examples.train import train_h_baselines
 from examples.exp_configs.non_rl.bay_bridge import flow_params as non_rl_bay_bridge
@@ -168,7 +168,7 @@ def test_parse_args(self):
         self.assertDictEqual(vars(args), {
             'exp_config': 'exp_config',
-            'rl_trainer': 'dqn',
+            'rl_trainer': 'rllib',
             'num_cpus': 1,
             'num_steps': 5000,
             'rollout_size': 1000,
@@ -262,6 +262,9 @@ def setUp(self):
     def test_singleagent_figure_eight(self):
         self.run_exp(singleagent_figure_eight)
 
+    def test_singleagent_traffic_light_grid(self):
+        self.run_exp(singleagent_traffic_light_grid)
+
     def test_singleagent_traffic_light_grid_inflows(self):
         pass  # FIXME
@@ -327,6 +330,18 @@ def test_multiagent_merge(self):
         }
         self.run_exp(multiagent_merge, **kwargs)
 
+    def test_multi_traffic_light_grid(self):
+        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICY_GRAPHS as mtlpg
+        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICIES_TO_TRAIN as mtlpt
+        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import policy_mapping_fn as mtlpmf
+
+        kwargs = {
+            "policy_graphs": mtlpg,
+            "policies_to_train": mtlpt,
+            "policy_mapping_fn": mtlpmf
+        }
+        self.run_exp(multiagent_traffic_light_grid, **kwargs)
+
     def test_multi_highway(self):
         from examples.exp_configs.rl.multiagent.multiagent_highway import POLICY_GRAPHS as mhpg
         from examples.exp_configs.rl.multiagent.multiagent_highway import POLICIES_TO_TRAIN as mhpt
@@ -420,63 +434,6 @@ def run_exp(flow_params, **kwargs):
         })
 
 
-class TestDQNExamples(unittest.TestCase):
-    """Tests the example traffic light scripts in examples/exp_configs/rl/singleagent and
-    examples/exp_configs/rl/multiagent for DQN.
-
-    This is done by running each experiment in that folder for five time-steps
-    and confirming that it completes one rollout with two workers.
-    # FIXME(ev) this test adds several minutes to the testing scheme
-    """
-    def setUp(self):
-        if not ray.is_initialized():
-            ray.init(num_cpus=1)
-
-    def test_singleagent_traffic_light_grid(self):
-        self.run_exp(singleagent_traffic_light_grid)
-
-    def test_multi_traffic_light_grid(self):
-        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICY_GRAPHS as mtlpg
-        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import POLICIES_TO_TRAIN as mtlpt
-        from examples.exp_configs.rl.multiagent.multiagent_traffic_light_grid import policy_mapping_fn as mtlpmf
-
-        kwargs = {
-            "policy_graphs": mtlpg,
-            "policies_to_train": mtlpt,
-            "policy_mapping_fn": mtlpmf
-        }
-        self.run_exp(multiagent_traffic_light_grid, **kwargs)
-
-    @staticmethod
-    def run_exp(flow_params, **kwargs):
-        alg_run, env_name, config = setup_dqn_exps(flow_params, 1, 1, **kwargs)
-
-        try:
-            ray.init(num_cpus=1)
-        except Exception as e:
-            print("ERROR", e)
-        config['train_batch_size'] = 50
-        config['horizon'] = 50
-        config['sample_batch_size'] = 50
-        config['num_workers'] = 0
-        config['sgd_minibatch_size'] = 32
-
-        run_experiments({
-            'test': {
-                'run': alg_run,
-                'env': env_name,
-                'config': {
-                    **config
-                },
-
-                'checkpoint_freq': 1,
-                'stop': {
-                    'training_iteration': 1,
-                },
-            }
-        })
-
-
 if __name__ == '__main__':
     try:
         ray.init(num_cpus=1)

From bdb9ccc70be3399fca6577b4156133702ebd6ce6 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Mon, 22 Jun 2020 13:11:05 +0300
Subject: [PATCH 22/44] update

---
 .../exp_configs/rl/multiagent/multiagent_traffic_light_grid.py | 2 +-
 .../rl/singleagent/singleagent_traffic_light_grid.py           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index 636c7dfb1..7fbdaf703 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -9,7 +9,7 @@
 from flow.controllers import SimCarFollowingController, GridRouter
 from ray.tune.registry import register_env
 from flow.utils.registry import make_create_env
-from flow.core.params import TrafficLightParams
+
 # Experiment parameters
 N_ROLLOUTS = 63  # number of rollouts per training iteration
 N_CPUS = 63  # number of parallel workers
diff --git a/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py b/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
index aaf97b02e..53a474452 100644
--- a/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
@@ -144,7 +144,7 @@ def get_non_flow_params(enter_speed, add_net_params):
     'target_velocity': 50,
     'switch_time': 3.0,
     'num_observed': 2,
-    'discrete': False,
+    'discrete': False,  # set True for DQN
     'tl_type': 'actuated'
 }

From c36d2235d03d570f61687f42808256eb737d1382 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Mon, 22 Jun 2020 13:13:31 +0300
Subject: [PATCH 23/44] typo

---
 .../exp_configs/rl/multiagent/multiagent_traffic_light_grid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index 7fbdaf703..017ebd649 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -89,7 +89,7 @@
         "target_velocity": 50,
         "switch_time": 3,
         "num_observed": 2,
-        "discrete": True,  # set False for DQN
+        "discrete": False,  # set True for DQN
         "tl_type": "actuated",
         "num_local_edges": 4,
         "num_local_lights": 4,

From 720bcad4710e4b2947fc5ea0c5c7cbbb0c50feec Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Mon, 22 Jun 2020 13:31:58 +0300
Subject: [PATCH 24/44] update

---
 .../multiagent_traffic_light_grid.py |  5 +++--
 examples/train.py                    | 18 ++++++++++--------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index 017ebd649..cf556c231 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -89,7 +89,7 @@
         "target_velocity": 50,
         "switch_time": 3,
         "num_observed": 2,
-        "discrete": False,  # set True for DQN
+        "discrete": True,  # set True for DQN
         "tl_type": "actuated",
         "num_local_edges": 4,
         "num_local_lights": 4,
@@ -142,7 +142,8 @@
 def gen_policy():
     """Generate a policy in RLlib."""
-    return PPOTFPolicy, obs_space, act_space, {}
+    #return PPOTFPolicy, obs_space, act_space, {}
+    return DQNTFPolicy, obs_space, act_space, {}
 
 
 # Setup PG with a single policy graph for all agents
diff --git a/examples/train.py b/examples/train.py
index 20dc00a82..d2d2ece50 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -138,6 +138,7 @@ def setup_exps_rllib(flow_params,
     """
     from ray import tune
     from ray.tune.registry import register_env
+    from ray.rllib.env.group_agents_wrapper import _GroupAgentsWrapper
     try:
         from ray.rllib.agents.agent import get_agent_class
     except ImportError:
@@ -147,14 +148,9 @@ def setup_exps_rllib(flow_params,
     alg_run = flags.algorithm.upper()
 
-    agent_cls = get_agent_class(alg_run)
-    config = deepcopy(agent_cls._default_config)
-    config["num_workers"] = n_cpus
-    config["train_batch_size"] = horizon * n_rollouts
-    config["horizon"] = horizon
-    if alg_run == "PPO":
-
+    if alg_run == "PPO":
+        agent_cls = get_agent_class(alg_run)
+        config = deepcopy(agent_cls._default_config)
         config["gamma"] = 0.999  # discount rate
         config["model"].update({"fcnet_hiddens": [32, 32, 32]})
         config["use_gae"] = True
@@ -158,6 +158,8 @@
         config["kl_target"] = 0.02
         config["num_sgd_iter"] = 10
     elif alg_run == "DQN":
+        agent_cls = get_agent_class(alg_run)
+        config = deepcopy(agent_cls._default_config)
         config['clip_actions'] = False
         config["timesteps_per_iteration"] = horizon * n_rollouts
         # https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/dqn/atari-dist-dqn.yaml
         config["hiddens"] = [512]
         config["lr"] = 0.0000625
         config["schedule_max_timesteps"] = 2000000
         config["buffer_size"] = 1000000
         config["target_network_update_freq"] = 8000
 
+    config["num_workers"] = n_cpus
+    config["train_batch_size"] = horizon * n_rollouts
+    config["horizon"] = horizon
+
     # save the flow params for replay
     flow_json = json.dumps(
@@ -207,7 +209,7 @@
     policies_to_train = getattr(submodule, "policies_to_train", None)
 
     alg_run, gym_name, config = setup_exps_rllib(
-        flow_params, n_cpus, n_rollouts,
+        flow_params, n_cpus, n_rollouts, flags,
         policy_graphs, policy_mapping_fn, policies_to_train)
 
     ray.init(num_cpus=n_cpus + 1, ignore_reinit_error=True, object_store_memory=200 * 1024 * 1024)

From 91a0a0ba5af451496a7b35c6851e6a8842c895f3 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Mon, 22 Jun 2020 13:36:25 +0300
Subject: [PATCH 25/44] Update singleagent_traffic_light_grid.py

---
 .../rl/singleagent/singleagent_traffic_light_grid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py b/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
index 53a474452..11f94023d 100644
--- a/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/singleagent/singleagent_traffic_light_grid.py
@@ -144,7 +144,7 @@ def get_non_flow_params(enter_speed, add_net_params):
     'target_velocity': 50,
     'switch_time': 3.0,
     'num_observed': 2,
-    'discrete': False, # set True for DQN
+    'discrete': False,  # set True for DQN
     'tl_type': 'actuated'
 }

From 50be0ae772acfd7730249def4afd40956e846ef0 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Mon, 22 Jun 2020 13:45:38 +0300
Subject: [PATCH 26/44] Update multiagent_traffic_light_grid.py

---
 .../exp_configs/rl/multiagent/multiagent_traffic_light_grid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index cf556c231..ae6a4040a 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -142,7 +142,7 @@
 def gen_policy():
     """Generate a policy in RLlib."""
-    #return PPOTFPolicy, obs_space, act_space, {}
+    # return PPOTFPolicy, obs_space, act_space, {}
     return DQNTFPolicy, obs_space, act_space, {}

From 6303be2bdeb735cb3a57e4fedc7a4ba64bb07b3b Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Tue, 23 Jun 2020 21:59:10 +0300
Subject: [PATCH 27/44] Update train.py

---
 examples/train.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/train.py b/examples/train.py
index d2d2ece50..529875dc5 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -138,7 +138,6 @@ def setup_exps_rllib(flow_params,
     """
     from ray import tune
     from ray.tune.registry import register_env
-    from ray.rllib.env.group_agents_wrapper import _GroupAgentsWrapper
     try:
         from ray.rllib.agents.agent import get_agent_class
     except ImportError:

From 658b9cb735f5a0834eacc49cce6c8e0ef58ec9fc Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Tue, 23 Jun 2020 22:01:26 +0300
Subject: [PATCH 28/44] Update multiagent_traffic_light_grid.py

---
 .../rl/multiagent/multiagent_traffic_light_grid.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index ae6a4040a..589206319 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -1,7 +1,7 @@
 """Multi-agent traffic light example (single shared policy)."""
 
 from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
-from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
+# from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
 from flow.envs.multiagent import MultiTrafficLightGridPOEnv
 from flow.networks import TrafficLightGridNetwork
 from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams
@@ -89,7 +89,7 @@
         "target_velocity": 50,
         "switch_time": 3,
         "num_observed": 2,
-        "discrete": True,  # set True for DQN
+        "discrete": False,  # set True for DQN
         "tl_type": "actuated",
         "num_local_edges": 4,
         "num_local_lights": 4,
@@ -142,8 +142,8 @@
 def gen_policy():
     """Generate a policy in RLlib."""
-    # return PPOTFPolicy, obs_space, act_space, {}
-    return DQNTFPolicy, obs_space, act_space, {}
+    return PPOTFPolicy, obs_space, act_space, {}
+    # return DQNTFPolicy, obs_space, act_space, {}
 
 
 # Setup PG with a single policy graph for all agents

From 4984ffc3623237be502dfbbb3a66f0ef09150480 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Tue, 23 Jun 2020 22:38:49 +0300
Subject: [PATCH 29/44] rm flow-project from travis.yml

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 297281bc7..82c9a7283 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,13 +41,13 @@ before_install:
   - source activate flow
 
   # [sumo] dependencies and binaries
-  - pushd $HOME/build/flow-project
+  - pushd $HOME/build
   - ./flow/scripts/setup_sumo_ubuntu1604.sh
   - popd
   - source ~/.bashrc
 
   # [aimsun] install the conda env and update the path to the env
-  - pushd $HOME/build/flow-project
+  - pushd $HOME/build
   - ./flow/scripts/setup_aimsun.sh
   - popd
   - source ~/.bashrc

From 2535b0c7bac5a88fb652eaabb1bea1a875cf6ac4 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Tue, 8 Sep 2020 13:06:36 +0300
Subject: [PATCH 30/44] Update .travis.yml

---
 .travis.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 82c9a7283..fd98761f7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,13 +41,11 @@ before_install:
   - source activate flow
 
   # [sumo] dependencies and binaries
-  - pushd $HOME/build
   - ./flow/scripts/setup_sumo_ubuntu1604.sh
   - popd
   - source ~/.bashrc
 
   # [aimsun] install the conda env and update the path to the env
-  - pushd $HOME/build
   - ./flow/scripts/setup_aimsun.sh
   - popd
   - source ~/.bashrc

From 95d9182f503bf5c9034bd365066e49bd2605adea Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Tue, 8 Sep 2020 13:07:33 +0300
Subject: [PATCH 31/44] Update multiagent_traffic_light_grid.py

---
 .../exp_configs/rl/multiagent/multiagent_traffic_light_grid.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
index 589206319..44dec14c1 100644
--- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
+++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py
@@ -143,7 +143,6 @@
 def gen_policy():
     """Generate a policy in RLlib."""
     return PPOTFPolicy, obs_space, act_space, {}
-    # return DQNTFPolicy, obs_space, act_space, {}
 
 
 # Setup PG with a single policy graph for all agents

From b3876467df47dfea5f4ca1dbfd6cefcbd5738d0b Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Tue, 8 Sep 2020 13:08:48 +0300
Subject: [PATCH 32/44] Update train.py

---
 examples/train.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/train.py b/examples/train.py
index 0a20ff8e1..be72c34eb 100644
--- a/examples/train.py
+++ b/examples/train.py
@@ -106,8 +106,7 @@ def setup_exps_rllib(flow_params,
                      flags,
                      policy_graphs=None,
                      policy_mapping_fn=None,
-                     policies_to_train=None,
-                     ):
+                     policies_to_train=None):
     """Return the relevant components of an RLlib experiment.
 
     Parameters

From c0d41cf1ebfd7d0f8a66ea4c2c4aca058ab8c236 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 18 Dec 2020 17:48:52 +0200
Subject: [PATCH 33/44] Update README.md

---
 README.md | 45 ++++++++++-----------------------------------
 1 file changed, 10 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index 7d37223c5..1adc1e0e6 100644
--- a/README.md
+++ b/README.md
@@ -1,45 +1,20 @@
-
+# Decentralized Reinforcement Learning Traffic Light Control
 
-[![Build Status](https://travis-ci.com/flow-project/flow.svg?branch=master)](https://travis-ci.com/flow-project/flow)
-[![Docs](https://readthedocs.org/projects/flow/badge)](http://flow.readthedocs.org/en/latest/)
-[![Coverage Status](https://coveralls.io/repos/github/flow-project/flow/badge.svg?branch=master)](https://coveralls.io/github/flow-project/flow?branch=master)
-[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/flow-project/flow/binder)
-[![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/flow-project/flow/blob/master/LICENSE.md)
+DQN branch contains the code for multi-agent DQN controlled intelligent traffic lights.
 
-# Flow
+Currently DQN branch has been approved by the original Flow community and waiting to be merged.
 
-[Flow](https://flow-project.github.io/) is a computational framework for deep RL and control experiments for traffic microsimulation.
+For detailed changes compared to original Flow code, please refer to the [PR](https://github.com/flow-project/flow/pull/964)
 
-See [our website](https://flow-project.github.io/) for more information on the application of Flow to several mixed-autonomy traffic scenarios. Other [results and videos](https://sites.google.com/view/ieee-tro-flow/home) are available as well.
+# Citing
 
-# More information
+For more theoretical details such as optima proof, system design, and performance comparison, please refer and cite our paper:
 
-- [Documentation](https://flow.readthedocs.org/en/latest/)
-- [Installation instructions](http://flow.readthedocs.io/en/latest/flow_setup.html)
-- [Tutorials](https://github.com/flow-project/flow/tree/master/tutorials)
-- [Binder Build (beta)](https://mybinder.org/v2/gh/flow-project/flow/binder)
+P. Zhou, X. Chen, Z. Liu, T. Braud, P. Hui and J. Kangasharju, "DRLE: Decentralized Reinforcement Learning at the Edge for Traffic Light Control in the IoV," in IEEE Transactions on Intelligent Transportation Systems, doi: 10.1109/TITS.2020.3035841.
 
-# Technical questions
+or
 
-If you have a bug, please report it. Otherwise, join the [Flow Users group](https://join.slack.com/t/flow-users/shared_invite/enQtODQ0NDYxMTQyNDY2LTY1ZDVjZTljM2U0ODIxNTY5NTQ2MmUxMzYzNzc5NzU4ZTlmNGI2ZjFmNGU4YjVhNzE3NjcwZTBjNzIxYTg5ZmY) on Slack!
+Zhou, P., Chen, X., Liu, Z., Braud, T., Hui, P. and Kangasharju, J., 2020. DRLE: Decentralized Reinforcement Learning at the Edge for Traffic Light Control. arXiv preprint arXiv:2009.01502.
 
-# Getting involved
 
-We welcome your contributions.
-
-- Please report bugs and improvements by submitting [GitHub issue](https://github.com/flow-project/flow/issues).
-- Submit your contributions using [pull requests](https://github.com/flow-project/flow/pulls). Please use [this template](https://github.com/flow-project/flow/blob/master/.github/PULL_REQUEST_TEMPLATE.md) for your pull requests.
-
-# Citing Flow
-
-If you use Flow for academic research, you are highly encouraged to cite our paper:
-
-C. Wu, A. Kreidieh, K. Parvate, E. Vinitsky, A. Bayen, "Flow: Architecture and Benchmarking for Reinforcement Learning in Traffic Control," CoRR, vol. abs/1710.05465, 2017. [Online]. Available: https://arxiv.org/abs/1710.05465
-
-If you use the benchmarks, you are highly encouraged to cite our paper:
-
-Vinitsky, E., Kreidieh, A., Le Flem, L., Kheterpal, N., Jang, K., Wu, F., ... & Bayen, A. M, Benchmarks for reinforcement learning in mixed-autonomy traffic. In Conference on Robot Learning (pp. 399-409). Available: http://proceedings.mlr.press/v87/vinitsky18a.html
-
-# Contributors
-
-Flow is supported by the [Mobile Sensing Lab](http://bayen.eecs.berkeley.edu/) at UC Berkeley and Amazon AWS Machine Learning research grants. The contributors are listed in [Flow Team Page](https://flow-project.github.io/team.html).
+Original instructions, please refer to [Flow](https://flow-project.github.io/)

From 00c5c6c3de8598feb89aabaa7699784c51575528 Mon Sep 17 00:00:00 2001
From: Pengyuan Zhou
Date: Fri, 18 Dec 2020 17:49:26 +0200
Subject: [PATCH 34/44] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1adc1e0e6..2f66265b2 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
 # Citing
 
-For more theoretical details such as optima proof, system design, and performance comparison, please refer and cite our paper:
+For more theoretical details such as optima proof, system design, and performance comparison, please refer and cite our [paper](https://arxiv.org/pdf/2009.01502.pdf):
 
 P. Zhou, X. Chen, Z. Liu, T. Braud, P. Hui and J. Kangasharju, "DRLE: Decentralized Reinforcement Learning at the Edge for Traffic Light Control in the IoV," in IEEE Transactions on Intelligent Transportation Systems, doi: 10.1109/TITS.2020.3035841.
From 85a84e4d472af07d86378d143723100f6b91e7d2 Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Sat, 19 Dec 2020 02:07:53 +0200 Subject: [PATCH 35/44] Set theme jekyll-theme-midnight --- _config.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 _config.yml diff --git a/_config.yml b/_config.yml new file mode 100644 index 000000000..18854876c --- /dev/null +++ b/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-midnight \ No newline at end of file From ef777f737496bb4bcded1d4e457fd3508b4960a8 Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Sat, 19 Dec 2020 02:08:27 +0200 Subject: [PATCH 36/44] Set theme jekyll-theme-midnight From 04631ad00cd2ea2389006f831b38f1ee7255828c Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Sat, 19 Dec 2020 02:10:58 +0200 Subject: [PATCH 37/44] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2f66265b2..c17f4bf38 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Currently DQN branch has been approved by the original Flow community and waitin For detailed changes compared to original Flow code, please refer to the [PR](https://github.com/flow-project/flow/pull/964) +A quick [Demo](https://youtu.be/p2sMtN_mW8s) + # Citing For more theoretical details such as optima proof, system design, and performance comparison, please refer and cite our [paper](https://arxiv.org/pdf/2009.01502.pdf): From bdf80f2496b81307a5d4f4a2077a862cd0ced757 Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Sat, 19 Dec 2020 02:14:08 +0200 Subject: [PATCH 38/44] Set theme jekyll-theme-cayman --- _config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_config.yml b/_config.yml index 18854876c..c4192631f 100644 --- a/_config.yml +++ b/_config.yml @@ -1 +1 @@ -theme: jekyll-theme-midnight \ No newline at end of file +theme: jekyll-theme-cayman \ No newline at end of file From 3b998de416be3f5795fa09ab65524371cac976b0 Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Sat, 19 Dec 2020 02:28:08 +0200 Subject: [PATCH 39/44] Update README.md --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index c17f4bf38..afd498947 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,5 @@ For more theoretical details such as optima proof, system design, and performanc P. Zhou, X. Chen, Z. Liu, T. Braud, P. Hui and J. Kangasharju, "DRLE: Decentralized Reinforcement Learning at the Edge for Traffic Light Control in the IoV," in IEEE Transactions on Intelligent Transportation Systems, doi: 10.1109/TITS.2020.3035841. -or - -Zhou, P., Chen, X., Liu, Z., Braud, T., Hui, P. and Kangasharju, J., 2020. DRLE: Decentralized Reinforcement Learning at the Edge for Traffic Light Control. arXiv preprint arXiv:2009.01502. 
- Original instructions, please refer to [Flow](https://flow-project.github.io/) From c9a7377ea9f7fb6ebe481528e642701fd7ae092b Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Sat, 19 Dec 2020 02:53:16 +0200 Subject: [PATCH 40/44] Update README.md --- README.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index afd498947..b4738081f 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,20 @@ Currently DQN branch has been approved by the original Flow community and waitin For detailed changes compared to original Flow code, please refer to the [PR](https://github.com/flow-project/flow/pull/964) -A quick [Demo](https://youtu.be/p2sMtN_mW8s) - # Citing For more theoretical details such as optima proof, system design, and performance comparison, please refer and cite our [paper](https://arxiv.org/pdf/2009.01502.pdf): - -P. Zhou, X. Chen, Z. Liu, T. Braud, P. Hui and J. Kangasharju, "DRLE: Decentralized Reinforcement Learning at the Edge for Traffic Light Control in the IoV," in IEEE Transactions on Intelligent Transportation Systems, doi: 10.1109/TITS.2020.3035841. +``` +@ARTICLE{9275391, + author={P. {Zhou} and X. {Chen} and Z. {Liu} and T. {Braud} and P. {Hui} and J. {Kangasharju}}, + journal={IEEE Transactions on Intelligent Transportation Systems}, + title={DRLE: Decentralized Reinforcement Learning at the Edge for Traffic Light Control in the IoV}, + year={2020}, + doi={10.1109/TITS.2020.3035841}} +``` + +A quick Demo: +[![Video](https://i9.ytimg.com/vi/p2sMtN_mW8s/maxresdefault.jpg?time=1608339000000&sqp=CLic9f4F&rs=AOn4CLDlfAcq4ONYwic9lK3Bx7MDsLbq1A)](https://youtu.be/p2sMtN_mW8s) Original instructions, please refer to [Flow](https://flow-project.github.io/) From d0ab1b07fd23f29bae1f0f2e2db97a8355a0eb7c Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Sat, 19 Dec 2020 06:46:48 +0200 Subject: [PATCH 41/44] Update README.md --- README.md | 54 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index b4738081f..7d37223c5 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,45 @@ -# Decentralized Reinforcement Learning Traffic Light Control + -DQN branch contains the code for multi-agent DQN controlled intelligent traffic lights. +[![Build Status](https://travis-ci.com/flow-project/flow.svg?branch=master)](https://travis-ci.com/flow-project/flow) +[![Docs](https://readthedocs.org/projects/flow/badge)](http://flow.readthedocs.org/en/latest/) +[![Coverage Status](https://coveralls.io/repos/github/flow-project/flow/badge.svg?branch=master)](https://coveralls.io/github/flow-project/flow?branch=master) +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/flow-project/flow/binder) +[![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/flow-project/flow/blob/master/LICENSE.md) -Currently DQN branch has been approved by the original Flow community and waiting to be merged. +# Flow -For detailed changes compared to original Flow code, please refer to the [PR](https://github.com/flow-project/flow/pull/964) +[Flow](https://flow-project.github.io/) is a computational framework for deep RL and control experiments for traffic microsimulation. -# Citing +See [our website](https://flow-project.github.io/) for more information on the application of Flow to several mixed-autonomy traffic scenarios. Other [results and videos](https://sites.google.com/view/ieee-tro-flow/home) are available as well. 
-For more theoretical details such as optima proof, system design, and performance comparison, please refer and cite our [paper](https://arxiv.org/pdf/2009.01502.pdf): -``` -@ARTICLE{9275391, - author={P. {Zhou} and X. {Chen} and Z. {Liu} and T. {Braud} and P. {Hui} and J. {Kangasharju}}, - journal={IEEE Transactions on Intelligent Transportation Systems}, - title={DRLE: Decentralized Reinforcement Learning at the Edge for Traffic Light Control in the IoV}, - year={2020}, - doi={10.1109/TITS.2020.3035841}} -``` +# More information -A quick Demo: -[![Video](https://i9.ytimg.com/vi/p2sMtN_mW8s/maxresdefault.jpg?time=1608339000000&sqp=CLic9f4F&rs=AOn4CLDlfAcq4ONYwic9lK3Bx7MDsLbq1A)](https://youtu.be/p2sMtN_mW8s) +- [Documentation](https://flow.readthedocs.org/en/latest/) +- [Installation instructions](http://flow.readthedocs.io/en/latest/flow_setup.html) +- [Tutorials](https://github.com/flow-project/flow/tree/master/tutorials) +- [Binder Build (beta)](https://mybinder.org/v2/gh/flow-project/flow/binder) +# Technical questions -Original instructions, please refer to [Flow](https://flow-project.github.io/) +If you have a bug, please report it. Otherwise, join the [Flow Users group](https://join.slack.com/t/flow-users/shared_invite/enQtODQ0NDYxMTQyNDY2LTY1ZDVjZTljM2U0ODIxNTY5NTQ2MmUxMzYzNzc5NzU4ZTlmNGI2ZjFmNGU4YjVhNzE3NjcwZTBjNzIxYTg5ZmY) on Slack! + +# Getting involved + +We welcome your contributions. + +- Please report bugs and improvements by submitting [GitHub issue](https://github.com/flow-project/flow/issues). +- Submit your contributions using [pull requests](https://github.com/flow-project/flow/pulls). Please use [this template](https://github.com/flow-project/flow/blob/master/.github/PULL_REQUEST_TEMPLATE.md) for your pull requests. + +# Citing Flow + +If you use Flow for academic research, you are highly encouraged to cite our paper: + +C. Wu, A. Kreidieh, K. Parvate, E. Vinitsky, A. Bayen, "Flow: Architecture and Benchmarking for Reinforcement Learning in Traffic Control," CoRR, vol. abs/1710.05465, 2017. [Online]. Available: https://arxiv.org/abs/1710.05465 + +If you use the benchmarks, you are highly encouraged to cite our paper: + +Vinitsky, E., Kreidieh, A., Le Flem, L., Kheterpal, N., Jang, K., Wu, F., ... & Bayen, A. M, Benchmarks for reinforcement learning in mixed-autonomy traffic. In Conference on Robot Learning (pp. 399-409). Available: http://proceedings.mlr.press/v87/vinitsky18a.html + +# Contributors + +Flow is supported by the [Mobile Sensing Lab](http://bayen.eecs.berkeley.edu/) at UC Berkeley and Amazon AWS Machine Learning research grants. The contributors are listed in [Flow Team Page](https://flow-project.github.io/team.html). 
From 1172cb3440563b173f252a1d9298a0d0db32457e Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Mon, 21 Dec 2020 01:48:55 +0200 Subject: [PATCH 42/44] Update .travis.yml --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index fd98761f7..297281bc7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -41,11 +41,13 @@ before_install: - source activate flow # [sumo] dependencies and binaries + - pushd $HOME/build/flow-project - ./flow/scripts/setup_sumo_ubuntu1604.sh - popd - source ~/.bashrc # [aimsun] install the conda env and update the path to the env + - pushd $HOME/build/flow-project - ./flow/scripts/setup_aimsun.sh - popd - source ~/.bashrc From c01258b20b9515689230f9ccc89874a3589d6360 Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Mon, 21 Dec 2020 01:51:50 +0200 Subject: [PATCH 43/44] Delete _config.yml --- _config.yml | 1 - 1 file changed, 1 deletion(-) delete mode 100644 _config.yml diff --git a/_config.yml b/_config.yml deleted file mode 100644 index c4192631f..000000000 --- a/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-cayman \ No newline at end of file From 1282c67a6d114f08aa38f44e12ac0465a70bc3eb Mon Sep 17 00:00:00 2001 From: Pengyuan Zhou Date: Mon, 21 Dec 2020 01:52:18 +0200 Subject: [PATCH 44/44] Update multiagent_traffic_light_grid.py --- .../exp_configs/rl/multiagent/multiagent_traffic_light_grid.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py index 44dec14c1..88c412946 100644 --- a/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py +++ b/examples/exp_configs/rl/multiagent/multiagent_traffic_light_grid.py @@ -1,7 +1,6 @@ """Multi-agent traffic light example (single shared policy).""" from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy -# from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy from flow.envs.multiagent import MultiTrafficLightGridPOEnv from flow.networks import TrafficLightGridNetwork from flow.core.params import SumoParams, EnvParams, InitialConfig, NetParams