diff --git a/examples/game_of_tag/README.md b/examples/game_of_tag/README.md
new file mode 100644
index 0000000000..03cd28d98e
--- /dev/null
+++ b/examples/game_of_tag/README.md
@@ -0,0 +1,49 @@
+# Game of Tag
+This directory contains a multi-agent adversarial training demo. The demo has a predator vehicle and a prey vehicle.
+The predator vehicle's goal is to catch the prey, and the prey vehicle's goal is to avoid getting caught.
+
+## Run training
+python examples/game_of_tag/game_of_tag.py examples/game_of_tag/scenarios/game_of_tag_demo_map/
+
+## Run checkpoint
+python examples/game_of_tag/run_checkpoint.py examples/game_of_tag/scenarios/game_of_tag_demo_map/
+
+## Setup
+### Rewards
+The distance-based reward is 0.5/(distance - COLLIDE_DISTANCE)^2, capped at 10.
+
+- COLLIDE_DISTANCE is the observed distance between two vehicles when they collide. Since each vehicle's position is measured at its center, the distance at the moment of collision is not exactly 0.
+
+### Common reward
+   Off road: -10
+
+#### Prey
+   Collision with predator: -10
+   Distance to predator (d): -0.5/(d - COLLIDE_DISTANCE)^2
+#### Predator
+   Collision with prey: +10
+   Distance to prey (d): 0.5/(d - COLLIDE_DISTANCE)^2
+
+### Action
+Speed selection in m/s: [0, 3, 6, 9]
+
+Lane change selection relative to the current lane: [-1, 0, 1]
+
+## Output a model
+Currently RLlib does not provide an implementation for exporting a PyTorch model.
+
+Replace `export_model`'s implementation in `ray/rllib/policy/torch_policy.py` with the following:
+```
+torch.save(self.model.state_dict(), f"{export_dir}/model.pt")
+```
+Then follow the steps in `game_of_tag.py` to export the model.
+
+## Possible next steps
+- Increase the number of agents to 2 predators and 2 prey.
+This requires modelling the reward so that it remains a zero-sum game. To see the complication, consider
+how to model the distance reward between 2 predators and 1 prey: if each agent is only rewarded based on its
+nearest rival, the predator and prey rewards no longer sum to 0, because 2 predators would each collect the full
+reward from 1 prey while that prey would only collect the full reward from 1 predator. Handling this requires the
+predators to know about each other (and likewise the prey), and each prey to know about multiple predators.
+- Add an attribute to the observations indicating whether the ego vehicle is in front of or behind the target vehicle;
+this may help the ego vehicle decide whether it should slow down or speed up.
\ No newline at end of file
diff --git a/examples/game_of_tag/game_of_tag.py b/examples/game_of_tag/game_of_tag.py
new file mode 100644
index 0000000000..fafd3620cb
--- /dev/null
+++ b/examples/game_of_tag/game_of_tag.py
@@ -0,0 +1,270 @@
+"""Let's play tag!
+
+A predator-prey multi-agent example built on top of RLlib to facilitate further
+developments on multi-agent support for HiWay (including design, performance,
+research, and scaling).
+
+The predator and prey use separate policies. A predator "catches" its prey when
+it collides with the prey vehicle. There can be multiple predators and
+multiple prey in a map. Social vehicles act as obstacles that both the
+predator and prey must avoid.
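+
+Usage (the commands below mirror this example's README):
+
+    # train the predator and prey policies
+    python examples/game_of_tag/game_of_tag.py examples/game_of_tag/scenarios/game_of_tag_demo_map/
+
+    # roll out a saved Ray checkpoint
+    python examples/game_of_tag/run_checkpoint.py examples/game_of_tag/scenarios/game_of_tag_demo_map/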
+""" +import argparse +import os +import random +import multiprocessing +import ray + + +import numpy as np +from typing import List +from ray import tune +from ray.rllib.utils import try_import_tf +from ray.rllib.models import ModelCatalog +from ray.tune import Stopper +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork +from ray.tune.schedulers import PopulationBasedTraining +from ray.rllib.agents.ppo import PPOTrainer +from pathlib import Path + +from smarts.env.rllib_hiway_env import RLlibHiWayEnv +from smarts.core.agent import AgentSpec, Agent +from smarts.core.controllers import ActionSpaceType +from smarts.core.agent_interface import AgentInterface, AgentType, DoneCriteria +from smarts.core.utils.file import copy_tree + + +from examples.game_of_tag.tag_adapters import * +from examples.game_of_tag.model import CustomFCModel + + +# Add custom metrics to your tensorboard using these callbacks +# see: https://ray.readthedocs.io/en/latest/rllib-training.html#callbacks-and-custom-metrics +def on_episode_start(info): + episode = info["episode"] + print("episode {} started".format(episode.episode_id)) + + +def on_episode_step(info): + episode = info["episode"] + single_agent_id = list(episode._agent_to_last_obs)[0] + obs = episode.last_raw_obs_for(single_agent_id) + + +def on_episode_end(info): + episode = info["episode"] + + +def explore(config): + # ensure we collect enough timesteps to do sgd + if config["train_batch_size"] < config["sgd_minibatch_size"] * 2: + config["train_batch_size"] = config["sgd_minibatch_size"] * 2 + # ensure we run at least one sgd iter + if config["num_sgd_iter"] < 1: + config["num_sgd_iter"] = 1 + return config + + +PREDATOR_POLICY = "predator_policy" +PREY_POLICY = "prey_policy" + + +def policy_mapper(agent_id): + if agent_id in PREDATOR_IDS: + return PREDATOR_POLICY + elif agent_id in PREY_IDS: + return PREY_POLICY + + +class TimeStopper(Stopper): + def __init__(self): + self._start = time.time() + # Currently will see obvious tag behaviour in 6 hours + self._deadline = 48 * 60 * 60 # train for 48 hours + + def __call__(self, trial_id, result): + return False + + def stop_all(self): + return time.time() - self._start > self._deadline + + +tf = try_import_tf() + +ModelCatalog.register_custom_model("CustomFCModel", CustomFCModel) + +rllib_agents = {} + +shared_interface = AgentInterface( + max_episode_steps=1500, + neighborhood_vehicles=True, + waypoints=True, + action=ActionSpaceType.LaneWithContinuousSpeed, +) +shared_interface.done_criteria = DoneCriteria( + off_route=False, + wrong_way=False, + collision=True, +) + +for agent_id in PREDATOR_IDS: + rllib_agents[agent_id] = { + "agent_spec": AgentSpec( + interface=shared_interface, + agent_builder=lambda: TagModelAgent( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "model"), + OBSERVATION_SPACE, + ), + observation_adapter=observation_adapter, + reward_adapter=predator_reward_adapter, + action_adapter=action_adapter, + ), + "observation_space": OBSERVATION_SPACE, + "action_space": ACTION_SPACE, + } + +for agent_id in PREY_IDS: + rllib_agents[agent_id] = { + "agent_spec": AgentSpec( + interface=shared_interface, + agent_builder=lambda: TagModelAgent( + os.path.join(os.path.dirname(os.path.realpath(__file__)), "model"), + OBSERVATION_SPACE, + ), + observation_adapter=observation_adapter, + reward_adapter=prey_reward_adapter, + action_adapter=action_adapter, + ), + "observation_space": OBSERVATION_SPACE, + "action_space": ACTION_SPACE, + } + + +def build_tune_config(scenario, headless=True, 
sumo_headless=False): + rllib_policies = { + policy_mapper(agent_id): ( + None, + rllib_agent["observation_space"], + rllib_agent["action_space"], + {"model": {"custom_model": "CustomFCModel"}}, + ) + for agent_id, rllib_agent in rllib_agents.items() + } + + tune_config = { + "env": RLlibHiWayEnv, + "framework": "torch", + "log_level": "WARN", + "num_workers": 3, + "explore": True, + "horizon": 10000, + "env_config": { + "seed": 42, + "sim_name": "game_of_tag_works?", + "scenarios": [os.path.abspath(scenario)], + "headless": headless, + "sumo_headless": sumo_headless, + "agent_specs": { + agent_id: rllib_agent["agent_spec"] + for agent_id, rllib_agent in rllib_agents.items() + }, + }, + "multiagent": { + "policies": rllib_policies, + "policies_to_train": [PREDATOR_POLICY, PREY_POLICY], + "policy_mapping_fn": policy_mapper, + }, + "callbacks": { + "on_episode_start": on_episode_start, + "on_episode_step": on_episode_step, + "on_episode_end": on_episode_end, + }, + } + return tune_config + + +def main(args): + pbt = PopulationBasedTraining( + time_attr="time_total_s", + metric="episode_reward_mean", + mode="max", + perturbation_interval=300, + resample_probability=0.25, + # Specifies the mutations of these hyperparams + hyperparam_mutations={ + "lambda": lambda: random.uniform(0.9, 1.0), + "clip_param": lambda: random.uniform(0.01, 0.5), + "kl_coeff": lambda: 0.3, + "lr": [1e-3], + "sgd_minibatch_size": lambda: 128, + "train_batch_size": lambda: 4000, + "num_sgd_iter": lambda: 30, + }, + custom_explore_fn=explore, + ) + local_dir = os.path.expanduser(args.result_dir) + + tune_config = build_tune_config(args.scenario) + + tune.run( + PPOTrainer, # Rllib supports using PPO in multi-agent setting + name="lets_play_tag", + stop=TimeStopper(), + # XXX: Every X iterations perform a _ray actor_ checkpoint (this is + # different than _exporting_ a TF/PT checkpoint). + checkpoint_freq=5, + checkpoint_at_end=True, + # XXX: Beware, resuming after changing tune params will not pick up + # the new arguments as they are stored alongside the checkpoint. 
+ resume=args.resume_training, + # restore="path_to_training_checkpoint/checkpoint_x/checkpoint-x", + local_dir=local_dir, + reuse_actors=True, + max_failures=0, + export_formats=["model", "checkpoint"], + config=tune_config, + scheduler=pbt, + ) + + # # To output a model + # # 1: comment out tune.run and uncomment the following code + # # 2: replace checkpoint path to training checkpoint path + # # 3: inject code in rllib according to README.md and run + # checkpoint_path = os.path.join( + # os.path.dirname(os.path.realpath(__file__)), "models/checkpoint_360/checkpoint-360" + # ) + # ray.init(num_cpus=2) + # training_agent = PPOTrainer(env=RLlibHiWayEnv,config=tune_config) + # training_agent.restore(checkpoint_path) + # prefix = "model.ckpt" + # model_dir = os.path.join( + # os.path.dirname(os.path.realpath(__file__)), "models/predator_model" + # ) + # training_agent.export_policy_model(model_dir, PREDATOR_POLICY) + # model_dir = os.path.join( + # os.path.dirname(os.path.realpath(__file__)), "models/prey_model" + # ) + # training_agent.export_policy_model(model_dir, PREY_POLICY) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("rllib-example") + parser.add_argument( + "scenario", + type=str, + help="Scenario to run (see scenarios/ for some samples you can use)", + ) + parser.add_argument( + "--resume_training", + default=False, + action="store_true", + help="Resume the last trained example", + ) + parser.add_argument( + "--result_dir", + type=str, + default="~/ray_results", + help="Directory containing results (and checkpointing)", + ) + args = parser.parse_args() + main(args) diff --git a/examples/game_of_tag/model.py b/examples/game_of_tag/model.py new file mode 100644 index 0000000000..17d0f595bb --- /dev/null +++ b/examples/game_of_tag/model.py @@ -0,0 +1,37 @@ +import torch, gym +from torch import nn +from torch.distributions.normal import Normal +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet + + +class CustomFCModel(TorchModelV2, nn.Module): + """Example of interpreting repeated observations.""" + + def __init__( + self, + obs_space: gym.spaces.Space, + action_space: gym.spaces.Space, + num_outputs: int, + model_config, + name: str, + ): + super(CustomFCModel, self).__init__( + obs_space=obs_space, + action_space=action_space, + num_outputs=num_outputs, + model_config=model_config, + name=name, + ) + nn.Module.__init__(self) + + self.model = TorchFCNet( + obs_space, action_space, num_outputs, model_config, name + ) + + def forward(self, input_dict, state, seq_lens): + + return self.model.forward(input_dict, state, seq_lens) + + def value_function(self): + return self.model.value_function() diff --git a/examples/game_of_tag/models/checkpoint_360/.is_checkpoint b/examples/game_of_tag/models/checkpoint_360/.is_checkpoint new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/game_of_tag/models/checkpoint_360/checkpoint-360 b/examples/game_of_tag/models/checkpoint_360/checkpoint-360 new file mode 100644 index 0000000000..5be92c78cf Binary files /dev/null and b/examples/game_of_tag/models/checkpoint_360/checkpoint-360 differ diff --git a/examples/game_of_tag/models/checkpoint_360/checkpoint-360.tune_metadata b/examples/game_of_tag/models/checkpoint_360/checkpoint-360.tune_metadata new file mode 100644 index 0000000000..29566817d4 Binary files /dev/null and b/examples/game_of_tag/models/checkpoint_360/checkpoint-360.tune_metadata differ diff --git 
a/examples/game_of_tag/models/predator_model/model.pt b/examples/game_of_tag/models/predator_model/model.pt new file mode 100644 index 0000000000..54671a2bff Binary files /dev/null and b/examples/game_of_tag/models/predator_model/model.pt differ diff --git a/examples/game_of_tag/models/prey_model/model.pt b/examples/game_of_tag/models/prey_model/model.pt new file mode 100644 index 0000000000..54671a2bff Binary files /dev/null and b/examples/game_of_tag/models/prey_model/model.pt differ diff --git a/examples/game_of_tag/run_checkpoint.py b/examples/game_of_tag/run_checkpoint.py new file mode 100644 index 0000000000..fcd6b62b8f --- /dev/null +++ b/examples/game_of_tag/run_checkpoint.py @@ -0,0 +1,213 @@ +"""Let's play tag! + +A predator-prey multi-agent example built on top of RLlib to facilitate further +developments on multi-agent support for HiWay (including design, performance, +research, and scaling). + +The predator and prey use separate policies. A predator "catches" its prey when +it collides into the other vehicle. There can be multiple predators and +multiple prey in a map. Social vehicles act as obstacles where both the +predator and prey must avoid them. +""" +import argparse +import os +import random +import multiprocessing + +import gym +import numpy as np +import ray +from ray import tune +from ray.rllib.models import ModelCatalog +from ray.rllib.utils import try_import_tf +from ray.tune.schedulers import PopulationBasedTraining +from ray.rllib.models.tf.fcnet import FullyConnectedNetwork +from ray.rllib.agents.ppo import PPOTrainer + +from examples.game_of_tag.game_of_tag import shared_interface, build_tune_config +from examples.game_of_tag.model import CustomFCModel +from examples.game_of_tag.tag_adapters import ( + OBSERVATION_SPACE, + PREDATOR_IDS, + PREY_IDS, + observation_adapter, + predator_reward_adapter, + prey_reward_adapter, +) + +from smarts.env.rllib_hiway_env import RLlibHiWayEnv +from smarts.core.agent import AgentSpec, Agent +from smarts.core.agent_interface import AgentInterface, AgentType, DoneCriteria +from smarts.core.utils.episodes import episodes +from smarts.core.controllers import ActionSpaceType + +tf = try_import_tf()[1] + +# must use >3 cpus since training used 3 workers +ray.init(num_cpus=4) + + +ModelCatalog.register_custom_model("CustomFCModel", CustomFCModel) + + +def action_adapter(model_action): + """Take in the action calculated by the model, and transform it to something that + SMARTS can understand. + + The model returns a batched action (since it received a batched input). That is, the + action consists of actions for however many observations were passed to it in the + batch of observations it was given. We only gave it a batch of 1 observation in the + act(...) method of TagModelAgent. + + The model outputs an action in the form of: + ( + ( + array([...]), # The speed. + array([...]), # The lane change. + ), + [], + { + '...': array([...]), + '...': array([[...]]), + '...': array([...]), + '...': array([...]) + } + ) + + The action we care about is the first element of this tuple, get it with + model_action[0], so that speed = array([...]) and laneChange = array([...]). Convert + these arrays to scalars to index into speeds or subtract from it. 
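+
+    For example (illustrative values, not taken from a real rollout): if
+    model_action[0] == (array([2]), array([0])), then speeds[2] == 6 m/s and the
+    lane change is 0 - 1 == -1, so the adapted action is [6, -1].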
+ """ + speed, laneChange = model_action[0] + speeds = [0, 3, 6, 9] + adapted_action = [speeds[speed.item()], laneChange.item() - 1] + return adapted_action + + +class TagModelAgent(Agent): + def __init__(self, checkpoint_path, scenario, headless, policy_name): + assert os.path.isfile(checkpoint_path) + tune_config = build_tune_config(scenario, headless=headless) + self.agent = PPOTrainer(env=RLlibHiWayEnv, config=tune_config) + self.agent.restore(checkpoint_path) + self._policy_name = policy_name + self._prep = ModelCatalog.get_preprocessor_for_space(OBSERVATION_SPACE) + + def act(self, observations): + """Receive an observation from the environment, and compute the agent's action. + + The observation is a dictionary of an observation for a single agent. However, + the model expects a batched observation, that is, a list of observations. To fix + this, expand the dimensions of the observation from (n,) to (1, n) so that the + observation fits into the model's expected input size. + """ + obs = self._prep.transform(observations) + obs = np.expand_dims(obs, 0) + action = self.agent.get_policy(self._policy_name).compute_actions(obs) + return action + + +def main(scenario, headless, checkpoint_path, seed, num_episodes): + agent_specs = {} + + for agent_id in PREDATOR_IDS: + agent_specs[agent_id] = AgentSpec( + interface=shared_interface, + agent_builder=lambda: TagModelAgent( + checkpoint_path, # assumes checkpoint exists + scenario, + headless, + "predator_policy", + ), + observation_adapter=observation_adapter, + reward_adapter=predator_reward_adapter, + action_adapter=action_adapter, + ) + + for agent_id in PREY_IDS: + agent_specs[agent_id] = AgentSpec( + interface=shared_interface, + agent_builder=lambda: TagModelAgent( + checkpoint_path, # assumes checkpoint exists + scenario, + headless, + "prey_policy", + ), + observation_adapter=observation_adapter, + reward_adapter=prey_reward_adapter, + action_adapter=action_adapter, + ) + + env = gym.make( + "smarts.env:hiway-v0", + scenarios=[scenario], + agent_specs=agent_specs, + sim_name="test_game_of_tag", + headless=True, + sumo_headless=False, + seed=seed, + ) + + agents = { + agent_id: agent_spec.build_agent() + for agent_id, agent_spec in agent_specs.items() + } + + for episode in episodes(n=num_episodes): + observations = env.reset() + episode.record_scenario(env.scenario_log) + + dones = {"__all__": False} + while not dones["__all__"]: + actions = { + agent_id: agents[agent_id].act(agent_obs) + for agent_id, agent_obs in observations.items() + } + + observations, rewards, dones, infos = env.step(actions) + episode.record_step(observations, rewards, dones, infos) + # TODO temporary solution for game of tag: stop the episode when 1 vehicle is done + # so that the other vehicle does not train when the opponent is not present, which + # causes noisy in training + for key in dones: + if dones[key]: + dones["__all__"] = True + + env.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("game-of-tag-example") + parser.add_argument( + "scenario", + type=str, + help="Scenario to run (see scenarios/ for some samples you can use)", + ) + parser.add_argument( + "--headless", help="run simulation in headless mode", action="store_true" + ) + parser.add_argument( + "--checkpoint_path", + help="run simulation in headless mode", + type=str, + default=os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "models/checkpoint_360/checkpoint-360", + ), + ) + parser.add_argument( + "--num_episodes", + help="number of episodes to 
show", + type=int, + default=10, + ) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args() + + main( + scenario=args.scenario, + headless=args.headless, + checkpoint_path=args.checkpoint_path, + seed=args.seed, + num_episodes=args.num_episodes, + ) diff --git a/examples/game_of_tag/scenarios/game_of_tag_demo_map/map.net.xml b/examples/game_of_tag/scenarios/game_of_tag_demo_map/map.net.xml new file mode 100644 index 0000000000..f4fb40dc79 --- /dev/null +++ b/examples/game_of_tag/scenarios/game_of_tag_demo_map/map.net.xml @@ -0,0 +1,267 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/game_of_tag/scenarios/game_of_tag_demo_map/scenario.py b/examples/game_of_tag/scenarios/game_of_tag_demo_map/scenario.py new file mode 100644 index 0000000000..e0487755fb --- /dev/null +++ b/examples/game_of_tag/scenarios/game_of_tag_demo_map/scenario.py @@ -0,0 +1,42 @@ +import random +from pathlib import Path + +from smarts.sstudio import gen_scenario +from smarts.sstudio import types as t +from smarts.core import seed + +seed(42) + +# traffic = t.Traffic( +# flows=[ +# t.Flow( +# route=t.Route( +# begin=("-gneE69", 0, 10), +# end=("gneE77", 0, 0), +# ), +# rate=60*60, +# actors={ +# t.TrafficActor( +# name="car", +# vehicle_type=random.choice( +# ["passenger", "bus", "coach", "truck", "trailer"] +# ), +# ): 1 +# }, +# ) +# ] +# ) + +# training missions +ego_missions = [ + t.EndlessMission(begin=("top", 2, 5)), # pred + t.EndlessMission(begin=("top", 2, 30)), # prey +] + + +scenario = t.Scenario( + # traffic={"all": traffic}, + ego_missions=ego_missions, +) + +gen_scenario(scenario, output_dir=str(Path(__file__).parent)) diff --git a/examples/game_of_tag/scenarios/game_of_tag_demo_map/shifted_map-AUTOGEN.net.xml b/examples/game_of_tag/scenarios/game_of_tag_demo_map/shifted_map-AUTOGEN.net.xml new file mode 100644 index 0000000000..d7eade2f4c --- /dev/null +++ b/examples/game_of_tag/scenarios/game_of_tag_demo_map/shifted_map-AUTOGEN.net.xml @@ -0,0 +1,267 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/game_of_tag/tag_adapters.py b/examples/game_of_tag/tag_adapters.py new file mode 100644 index 0000000000..5ddb0ed997 --- /dev/null +++ b/examples/game_of_tag/tag_adapters.py @@ -0,0 +1,224 @@ +import gym +import numpy as np +import random +import math +from typing import List +import time +from dataclasses import dataclass + +PREDATOR_IDS = ["PRED1"] +PREY_IDS = ["PREY1"] + + +@dataclass +class Rewards: + collesion_with_target: float = 10 + offroad: float = 10 + collesion_with_other_deduction: float = -1.5 + + +global_rewards = Rewards() + +# vehicles collide at around 3.8 if from behind +# colldie at 2.11 if from 
side +COLLIDE_DISTANCE = 3.8 + +ACTION_SPACE = gym.spaces.Tuple( + ( + gym.spaces.Discrete(4), # 4 types of speed + gym.spaces.Discrete(3), # -1 0 or 1 for lane change + ) +) + +NEIGHBORHOOD_VEHICLE_STATES = gym.spaces.Dict( + { + "heading": gym.spaces.Box(low=-2 * np.pi, high=2 * np.pi, shape=(1,)), + "speed": gym.spaces.Box(low=-2e2, high=2e2, shape=(1,)), + "position": gym.spaces.Box(low=-1e4, high=1e4, shape=(2,)), + "distance": gym.spaces.Box(low=0, high=1e3, shape=(1,)), + "lane_index": gym.spaces.Discrete(5), + } +) + +OBSERVATION_SPACE = gym.spaces.Dict( + { + "heading": gym.spaces.Box(low=-1 * np.pi, high=np.pi, shape=(1,)), + "speed": gym.spaces.Box(low=0, high=1e3, shape=(1,)), + "position": gym.spaces.Box(low=-1e3, high=1e3, shape=(2,)), + "lane_index": gym.spaces.Discrete(5), + "target_vehicles": gym.spaces.Tuple( + tuple([NEIGHBORHOOD_VEHICLE_STATES] * len(PREDATOR_IDS)) + ), + } +) + + +def action_adapter(model_action): + speed, laneChange = model_action + speeds = [0, 3, 6, 9] + adapted_action = [speeds[speed], laneChange - 1] + return adapted_action + + +def _is_vehicle_wanted(id, wanted_ids: List[str]): + """This function is needed since agent-id during training would be + 'PREY1-xxxxxxxx' instead of 'PREY1' + """ + for wanted_id in wanted_ids: + if wanted_id in id: + return True + return False + + +def get_specific_vehicle_states(nv_states, wanted_ids: List[str], ego_state): + """return vehicle states of vehicle that has id in wanted_ids""" + states = [ + { + "heading": np.array([v.heading]), + "speed": np.array([v.speed]), + "position": np.array(v.position[:2]), + "lane_index": v.lane_index, + "distance": np.array( + [np.linalg.norm(v.position[:2] - ego_state.position[:2])] + ), + } + for v in nv_states + if _is_vehicle_wanted(v.id, wanted_ids) + ] + # ego is predator, prey went off road + if wanted_ids == PREY_IDS: + # make the last observation bad for prey to discourage off road + states += [ + { + "heading": np.array([0]), + "speed": np.array([0]), + "position": ego_state.position[:2], + "lane_index": ego_state.lane_index, + "distance": np.array([COLLIDE_DISTANCE]), # give max reward to predator + } + ] * (len(wanted_ids) - len(states)) + elif wanted_ids == PREDATOR_IDS: + # ego is prey, predator went off road + # make the last observation bad for predator + states += [ + { + "heading": np.array([0]), + "speed": np.array([0]), + "position": np.array([1000, 1000]), + "lane_index": ego_state.lane_index, + "distance": np.array([1e3 - 1]), # makes position far from predator + } + ] * (len(wanted_ids) - len(states)) + + return states + + +def min_distance_to_rival(ego_position, rival_ids, neighbour_states): + rival_vehicles = filter( + lambda v: _is_vehicle_wanted(v.id, rival_ids), neighbour_states + ) + rival_positions = [p.position for p in rival_vehicles] + + return min( + [np.linalg.norm(ego_position - prey_pos) for prey_pos in rival_positions], + default=0, + ) + + +def observation_adapter(observations): + nv_states = observations.neighborhood_vehicle_states + ego = observations.ego_vehicle_state + + target_vehicles = None + if _is_vehicle_wanted(ego.id, PREY_IDS): + target_vehicles = get_specific_vehicle_states(nv_states, PREDATOR_IDS, ego) + elif _is_vehicle_wanted(ego.id, PREDATOR_IDS): + target_vehicles = get_specific_vehicle_states(nv_states, PREY_IDS, ego) + + return { + "heading": np.array([ego.heading]), + "speed": np.array([ego.speed]), + "position": np.array(ego.position[:2]), + "lane_index": ego.lane_index, + "target_vehicles": tuple(target_vehicles), + } 
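+
+# A rough feel for the distance-based reward defined below (illustrative numbers only,
+# computed from 0.5 / (d - COLLIDE_DISTANCE)**2 with COLLIDE_DISTANCE == 3.8, capped at 10):
+#   d == 3.8 -> treated as a collision, returns the cap of 10
+#   d == 4.0 -> 0.5 / 0.2**2 == 12.5, clipped to 10
+#   d == 4.8 -> 0.5 / 1.0**2 == 0.5
+#   d == 8.8 -> 0.5 / 5.0**2 == 0.02
+# The predator adds this value to its reward while the prey subtracts it, so the
+# distance terms of the two agents cancel out (zero sum).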
+
+
+def dominant_reward(distance):
+    if distance == COLLIDE_DISTANCE:
+        return 10
+    return min(0.5 / ((distance - COLLIDE_DISTANCE) ** 2), 10)
+
+
+def predator_reward_adapter(observations, env_reward_signal):
+    rew = 0
+    ego = observations.ego_vehicle_state
+
+    # Primary reward
+    distance_to_target = min_distance_to_rival(
+        ego.position,
+        PREY_IDS,
+        observations.neighborhood_vehicle_states,
+    )
+
+    rew += dominant_reward(distance_to_target)
+
+    events = observations.events
+    for c in observations.events.collisions:
+        if _is_vehicle_wanted(c.collidee_id, PREY_IDS):
+            rew += global_rewards.collesion_with_target
+            print(
+                f"predator {ego.id} collided with prey {c.collidee_id} at distance {distance_to_target}"
+            )
+        # # keeping this commented-out code for expanding to multiple prey and predators in the future
+        # else:
+        #     # Collided with something other than the prey
+        #     rew += global_rewards.collesion_with_other_deduction
+        #     print(f"predator {ego.id} collided with others {c.collidee_id}")
+
+    if events.off_road:
+        rew -= global_rewards.offroad
+
+    # if no prey vehicle is available, return a reward of 0 instead
+    # TODO: Test to see if this is necessary
+    prey_vehicles = list(filter(
+        lambda v: _is_vehicle_wanted(v.id, PREY_IDS), observations.neighborhood_vehicle_states,
+    ))
+    return rew if len(prey_vehicles) > 0 else 0
+
+
+def prey_reward_adapter(observations, env_reward_signal):
+
+    rew = 0
+    ego = observations.ego_vehicle_state
+
+    # Primary reward
+    distance_to_target = min_distance_to_rival(
+        ego.position,
+        PREDATOR_IDS,
+        observations.neighborhood_vehicle_states,
+    )
+    rew -= dominant_reward(distance_to_target)
+
+    events = observations.events
+    for c in events.collisions:
+        if _is_vehicle_wanted(c.collidee_id, PREDATOR_IDS):
+            rew -= global_rewards.collesion_with_target
+            print(
+                f"prey {ego.id} collided with predator {c.collidee_id} at distance {distance_to_target}"
+            )
+        # # keeping this commented-out code for expanding to multiple prey and predators in the future
+        # else:
+        #     # Collided with something other than the predator
+        #     rew += global_rewards.collesion_with_other_deduction
+        #     print(f"prey {ego.id} collided with other vehicle {c.collidee_id}")
+
+    if events.off_road:
+        rew -= global_rewards.offroad
+
+    # if no predator vehicle is available, return a reward of 0 instead
+    # TODO: Test to see if this is necessary
+    predator_vehicles = list(filter(
+        lambda v: _is_vehicle_wanted(v.id, PREDATOR_IDS), observations.neighborhood_vehicle_states,
+    ))
+    return rew if len(predator_vehicles) > 0 else 0
diff --git a/smarts/core/agent_interface.py b/smarts/core/agent_interface.py
index 35d16840f7..c75e353241 100644
--- a/smarts/core/agent_interface.py
+++ b/smarts/core/agent_interface.py
@@ -109,7 +109,7 @@ class AgentType(IntEnum):
     """All observations and continuous action space"""
     Standard = 2
     """Minimal observations for dealing with waypoints and other vehicles and
-    continuous action space.
+    ActuatorDynamic action space.
""" Laner = 3 """Agent sees waypoints and performs lane actions""" diff --git a/smarts/core/controllers/lane_following_controller.py b/smarts/core/controllers/lane_following_controller.py index 411effa4b0..24dd9d21a5 100644 --- a/smarts/core/controllers/lane_following_controller.py +++ b/smarts/core/controllers/lane_following_controller.py @@ -75,6 +75,15 @@ def perform_lane_following( lane_change=0, ): assert isinstance(vehicle.chassis, AckermannChassis) + assert isinstance(lane_change, int) or isinstance( + lane_change, np.integer + ), "lane_change action should be an integer" + assert ( + lane_change == 1 or lane_change == 0 or lane_change == -1 + ), """lane_change action should be any of the following: +-1: change to right right +0: stay on same lane, +1: change to left lane""" state = controller_state # This lookahead value is coupled with a few calculations below, changing it # may affect stability of the controller. @@ -179,7 +188,7 @@ def perform_lane_following( # directly related to the steering angle, this is added to further # enhance the speed tracking performance. TODO: currently, the bullet # does not provide the lateral acceleration which is needed for - # calculating the front laterl force. we need to replace the coefficent + # calculating the front lateral force. we need to replace the coefficent # with better approximation of the front lateral forces using explicit # differention. lateral_force_coefficient = 1.5 diff --git a/smarts/env/rllib_hiway_env.py b/smarts/env/rllib_hiway_env.py index 0660e0c30d..9ab781b32d 100644 --- a/smarts/env/rllib_hiway_env.py +++ b/smarts/env/rllib_hiway_env.py @@ -80,9 +80,11 @@ def __init__(self, config): ) self._sim_name = config.get("sim_name", None) - self._headless = config.get("headless", False) + # Warnining: running rllib with envision will cause memory to run out very quickly. + # It is recommanded to set headless to true during training and use sumo-gui (sumo_headless=False) + self._headless = config.get("headless", True) self._num_external_sumo_clients = config.get("num_external_sumo_clients", 0) - self._sumo_headless = config.get("sumo_headless", True) + self._sumo_headless = config.get("sumo_headless", False) self._sumo_port = config.get("sumo_port") self._sumo_auto_start = config.get("sumo_auto_start", True) self._endless_traffic = config.get("endless_traffic", True) @@ -147,7 +149,13 @@ def step(self, agent_actions): for done in dones.values(): self._dones_registered += 1 if done else 0 - dones["__all__"] = self._dones_registered == len(self._agent_specs) + # TODO temporary solution for game of tag: stop the episode when 1 vehicle is done + # so that the other vehicle does not train when the opponent is not present, which + # causes noisy in training + dones["__all__"] = self._dones_registered > 0 + if dones["__all__"]: + for id in dones: + dones[id] = True return observations, rewards, dones, infos