Add learning agent #31

Merged · 280 commits · Apr 22, 2024

Changes from all commits (280 commits)
59b626f
Fix formatting
camall3n May 13, 2023
7a4b4f4
Change var name discs -> discreps
camall3n May 13, 2023
647ed3f
Add discrep progress plot for queue search
camall3n May 13, 2023
7d96616
Refactor optimize_memory_optuna & prepare for other optimizers
camall3n May 13, 2023
1b80306
Refactor learning agent out of search node
camall3n May 13, 2023
cffb1a8
Refactor search node into utils/discrete_search
camall3n May 13, 2023
06f60d9
Refactor fifo-queue search into learning agent
camall3n May 13, 2023
ac8dd88
Refactor simulated annealing into learning agent
camall3n May 13, 2023
a78d295
Refactor all optimizers into optimize_memory()
camall3n May 13, 2023
f7f9671
Hook discrete mem optimization script up to args.mem_optimizer
camall3n May 13, 2023
8d8d386
Clean up discrete results folder & hook into study_name
camall3n May 13, 2023
46cefb5
Fix simulated annealing bug where improvements led to overflow
camall3n May 13, 2023
61676b5
Fix bug in simulated annealing where obs index could go out of bounds
camall3n May 13, 2023
e315044
Add plot script for discrete experiments
camall3n May 13, 2023
b316256
Fix bug where study_dir wasn't getting set
camall3n May 13, 2023
b62c9a8
Fix bug where study_dir wasn't getting created
camall3n May 13, 2023
0fe6ccb
Make policy iteration sane again
camall3n May 13, 2023
7f27917
Switch to hardmax
camall3n May 13, 2023
9a03988
Add tqdm for search-queue and annealing
camall3n May 13, 2023
92d481f
Consolidate tqdm bars for filling replay buffer
camall3n May 13, 2023
0b6959a
Add option for pruning queue search when parent node is suboptimal
camall3n May 13, 2023
01b27fa
Add hash function to SearchNode and test with PriorityQueue
camall3n May 13, 2023
cdc12be
Integrate priority queue into queue-based mem search
camall3n May 13, 2023
a78afce
Disable priority queue by default
camall3n May 13, 2023
75bb211
[WIP] tune annealing to work for tmaze & ~cheese
camall3n May 14, 2023
d08dc11
[WIP] slightly better settings(?) and add n_repeats
camall3n May 14, 2023
6184457
Stop analytical policy iteration if policy stops changing
camall3n May 14, 2023
9c26bc2
[snapshot] slightly too short to get good cheese performance
camall3n May 14, 2023
121775e
[snapshot] works for tmaze, cheese, 4x3, shuttle; scale by initial la…
camall3n May 14, 2023
3bdd55d
Remove unnecessary deepcopy
camall3n May 14, 2023
cbc4847
Save policy history, online PI, hook up annealing params, fix mem res…
camall3n May 15, 2023
a635b00
Optimize deterministic binary memory to avoid np.random.choice
camall3n May 15, 2023
080781e
Restore reset_before_converging=True
camall3n May 15, 2023
3d57fe5
Move annealing args to argparse
camall3n May 15, 2023
5b12879
Manually normalize reward scale
camall3n May 15, 2023
e2ad82d
Fix bug in best_discrep reporting
camall3n May 15, 2023
e187494
Add seed argument
camall3n May 15, 2023
403ab26
Add annealing args
camall3n May 15, 2023
9edd17d
Add new plotting scripts
camall3n May 15, 2023
7da6720
Update plotting script
camall3n May 15, 2023
5464b17
Remove test code
camall3n May 15, 2023
07d7415
Restore option to run with priority queue
camall3n May 16, 2023
4dbc02e
Fix broken arg name
camall3n Jun 10, 2023
42d373c
Add reward ranges for slippery tmaze
camall3n Jun 10, 2023
7281402
integrate POPGym, with integration tests.
Jun 27, 2023
a8531ad
error type
Jun 27, 2023
86537a3
addressing all PR comments except popgym test assertions
Jun 27, 2023
7937247
fix popgym tests
taodav Jul 7, 2023
8280ebc
yapf
taodav Jul 8, 2023
9f08e0d
temporarily remove [navigation] from popgym, since mazelib doesn't wo…
taodav Jul 8, 2023
097958d
bump yapf version
taodav Jul 8, 2023
1787a7d
env init bug
taodav Jul 8, 2023
83fed95
[WIP] add hyperparam file for POPGym, need to deal with other action …
taodav Jul 10, 2023
baf0333
[WIP] add wrappers for different obs spaces. Missing final wrapper fo…
taodav Jul 12, 2023
6455fef
leave out Pendulum for now, due to continuous action space
taodav Jul 13, 2023
002b65f
popgym tests
taodav Jul 13, 2023
ab0b34c
refactor observation and action wrappers
taodav Jul 17, 2023
98adfb3
reduce number of runs
taodav Jul 17, 2023
e9950d5
add back in flatten multi discrete wrapper
taodav Jul 17, 2023
c92d070
change popgym sweep td seeds down to 3
taodav Jul 17, 2023
ec31152
add back array casting observation wrapper
taodav Jul 17, 2023
b39afb9
add 3 envs to pesky memory leak
taodav Jul 19, 2023
9186d4f
add binning optimization for cache misses for online training
taodav Jul 20, 2023
11d87bd
allow cache misses for len(buffer) < 10
taodav Jul 20, 2023
43e1770
revert popgym_sweep_mc to all envs
taodav Jul 20, 2023
c7390a2
add reduce_eval_size script
taodav Jul 24, 2023
36c6d38
reduce filesize for results
taodav Jul 24, 2023
facdd07
GET RID OF ONLINE REWARD SAVINGgit add -u .
taodav Jul 28, 2023
b4e044b
add script for reduce online logging size
taodav Jul 28, 2023
caca4ec
add memoryless runs
taodav Aug 30, 2023
df14346
add new and improved write_jobs script
taodav Aug 30, 2023
5882f4a
remove --hparam from write_job scripts
taodav Aug 30, 2023
6ae76aa
set entry to grl.run
taodav Aug 30, 2023
52c8b4b
Merge pull request #15 from taodav/integrate_popgym
taodav Sep 1, 2023
e11accc
remove pynbs, back to hydrogen
taodav Sep 1, 2023
ac48d6f
Fix undefined var error
camall3n Sep 13, 2023
26843ff
Add new forms of memory function locality
camall3n Sep 13, 2023
aeffe79
Reconnect annealing params for discrete optim (see 403ab26 and 07d7415)
camall3n Sep 15, 2023
f126741
Ensure mem fn doesn't get worse after optimization
camall3n Sep 15, 2023
3f88cdf
Disable override forcing use of annealing optimizer
camall3n Sep 15, 2023
bd38da1
Disable override forcing n_memory_trials=400
camall3n Sep 15, 2023
0ae7dc2
Add exit when annealing tmin > tmax
camall3n Sep 15, 2023
e04ce26
Fix pandas pivot named argument bug
camall3n Sep 16, 2023
3871714
Update paths, print n_runs and param_counts
camall3n Sep 16, 2023
21d7368
Add barplot averaging over all runs in each env
camall3n Sep 19, 2023
5dc13e3
[WIP] Add script to debug discrete optimization progress
camall3n Sep 19, 2023
65a8211
[Sync] with origin/main
camall3n Sep 19, 2023
c6513fb
Fix class/module names: AMDP->POMDP; agents->agent
camall3n Sep 19, 2023
8a6be00
Fix formatting (and force push to retry stochastic RTOL test)
camall3n Sep 19, 2023
5f83d71
add discrete and random uniform memory
taodav Sep 21, 2023
3c605eb
Add value-error implementation
camall3n Sep 19, 2023
613fbad
Fix formatting
camall3n Sep 19, 2023
eae2717
Standardize and fix comments for zeroing terminal state counts
camall3n Sep 19, 2023
c5c6ff6
[Merge] branch 'value-error' into learning-agent
camall3n Sep 21, 2023
957cfc5
Fix AMDP->POMDP comment
camall3n Sep 21, 2023
ebc3dff
Refactor n_annealing_repeats outer loop
camall3n Sep 21, 2023
71f0bfb
Add option to sample annealing hyperparams for each annealing repetition
camall3n Sep 21, 2023
6fa043c
Squash bar plot and change colors to cool / warm
camall3n Jun 10, 2023
ca99e16
Update colors to be more muted
camall3n Jun 12, 2023
afe9a3c
Update .py file for custom barplot ordering
camall3n Sep 23, 2023
a4afd2c
[Merge] 'improved-bar-plot' into learning-agent
camall3n Sep 23, 2023
dc95ee8
Add other bar plot result dirs + comments
camall3n Sep 23, 2023
df680b6
Add discrete optimization results as hatched bars in mi_performance
camall3n Sep 24, 2023
5cafd3a
Fix formatting
camall3n Sep 24, 2023
09e4ed7
[WIP] Add policy_optimization arg (td/mc/none)
camall3n Sep 23, 2023
93ea329
Add arg --n_random_policies; add script to determine default value
camall3n Sep 24, 2023
888bc05
[Merge] 'random-policies' into learning-agent
camall3n Sep 24, 2023
02c941a
Add learning agent support for non-binary memory
camall3n Sep 25, 2023
64fbaef
Move reward scaling into env loop
camall3n Sep 25, 2023
00ce59a
Log n_memory_states
camall3n Sep 25, 2023
622a5a8
revert back to mi_perf plotting
taodav Sep 25, 2023
66f6a32
fix policy_grad
taodav Sep 25, 2023
356306c
add optimal tiger memory, and change --account to --partition in onag…
taodav Sep 25, 2023
59e0a96
Update plot script to add more hashed lines
camall3n Sep 25, 2023
fc24a06
remove double network
taodav Sep 25, 2023
11a5d2b
Add arg support for policy gradient
camall3n Sep 26, 2023
e66f21d
fix policy_grad
taodav Sep 25, 2023
b5e5df6
Add reset_pi_params?
camall3n Sep 26, 2023
4f1976b
Disable jit
camall3n Sep 26, 2023
e0772a4
Add commented-out jax debug breakpoints
camall3n Sep 26, 2023
6271e45
set default hyperparams for analytical memory iteration to final_anal…
taodav Sep 25, 2023
5ae1abc
Switch to 64-bit jax to fix NaN issue
camall3n Sep 26, 2023
164520f
Scale did_change calculation with policy LR; increase n_pi_iterations
camall3n Sep 26, 2023
c4b74cb
Remove pi_improvement did_change stopping condition entirely
camall3n Sep 26, 2023
c86d069
Change pi_lr to 0.01
camall3n Sep 26, 2023
0dece79
[Merge] 'debug-pg' into learning-agent
camall3n Sep 26, 2023
320022a
Disable reward scaling by default
camall3n Sep 26, 2023
975ae88
Change number of policy iterations from 100k -> 5k
camall3n Sep 27, 2023
58ad570
[Bugfix] Update mem_aug_mdp after memory optimization
camall3n Sep 27, 2023
14749ca
Take larger LD policy of PI/PG-optimal and random high-LD
camall3n Sep 27, 2023
3afdf7d
Update mi_performance plotting script for locality06
camall3n Sep 27, 2023
3b862d5
Update mi_performance script for locality07
camall3n Sep 28, 2023
4c3a11e
Add support for running tiger with known LD policy
camall3n Sep 25, 2023
bdecb12
add optimal tiger memory, and change --account to --partition in onag…
taodav Sep 25, 2023
435d700
Add script to see if tiger optimal mem improves LD
camall3n Sep 28, 2023
4322373
[Merge] 'tiger-known-ld' into learning-agent
camall3n Sep 28, 2023
2fbc6b3
Add tmaze5-fixed.POMDP
camall3n Sep 28, 2023
cb08184
Fix calibration of tmaze
camall3n Sep 28, 2023
1059fc5
Update barplot hack indices & fix formatting
camall3n Sep 28, 2023
7cc968f
Clean up bar plots by policy_optim_alg
camall3n Sep 28, 2023
eb6ed2e
Fix pomdp solver results
taodav Sep 28, 2023
8f81a8f
add random kitchen sink initialization
taodav Sep 27, 2023
93fa86f
Merge pull request #16 from taodav/analytical_kitchen_sink
taodav Sep 28, 2023
96bb4bb
Update plots for kitchen sink experiment
camall3n Sep 28, 2023
40e7bb0
Improve barplot
camall3n Sep 28, 2023
aa3df9d
Fix title / ylabel
camall3n Sep 28, 2023
4ccad3a
Reset hatch.color to black
camall3n Sep 28, 2023
bc00273
allow for mi_steps = 0, and add memoryless kitchen sink hyperparams
taodav Oct 9, 2023
fd1c0ef
mi_performance updated for memoryless random kitchen sinks.
taodav Oct 10, 2023
0dd9bef
add tiger counting memory
taodav Oct 16, 2023
35e1978
trajectory logging
taodav Oct 17, 2023
e180264
working (with many samples) mem_traj_logging, with memory cross produ…
taodav Oct 18, 2023
550e1e6
Add value error logging
camall3n Nov 12, 2023
d0ad107
Improve LD vs. value_err correlation plots
camall3n Nov 13, 2023
f3143d0
Add conversions to/from augmented policies (OM->AM = OMA->M * OM->A)
camall3n Nov 13, 2023
54c80f2
Auto-calculate shapes for augmented policy conversions
camall3n Nov 13, 2023
aa9e6e7
[WIP] Add draft for augmented policy gradient objective function
camall3n Nov 13, 2023
a1e8164
Tighten ylims for correlation plots
camall3n Nov 13, 2023
ff77a58
add more things for .POMDP file parsing (for parsing heaving + hell)
taodav Nov 13, 2023
bd16557
add hallway kitchen sinks
taodav Nov 14, 2023
a3116f7
move hallway kitchen sinks
taodav Nov 14, 2023
6ba225b
bump plotting
taodav Nov 14, 2023
dd449c9
Switch to log-probs to avoid div by zero; add David's test case
camall3n Nov 14, 2023
a0a3b57
Add debug code to visualize mem fn probs
camall3n Nov 14, 2023
6c45de1
halfway through implementing pg for mem augmented pi
taodav Nov 14, 2023
52630f6
add mem_pg loss
taodav Nov 15, 2023
b2ed32f
running, but not working policy optimization
taodav Nov 15, 2023
704d7d8
still a strange bug with mem_pg
taodav Nov 15, 2023
6ab774c
fix policy_mem_grad, tested on tmaze, cheese and hallway
taodav Nov 15, 2023
da169a1
Add comment explaining Hallway domain
camall3n Nov 16, 2023
dfbde37
Remove trailing whitespaces
camall3n Nov 16, 2023
bcfa068
Fix citation
camall3n Nov 16, 2023
ab6ac8e
Rename observations to match indexing
camall3n Nov 16, 2023
8ae78d3
Add comment about unused observations
camall3n Nov 16, 2023
4b7dda3
running unrolled policy_mem_grad
taodav Nov 18, 2023
9f04bc9
working pg_mem_unrolled
taodav Nov 18, 2023
f3857f5
Add size annotations for unrolled_mem_pg
camall3n Nov 20, 2023
462bf03
add final_discrep_kitchen_sinks_pg
taodav Nov 20, 2023
eceef03
working (?) td(0)
taodav Dec 4, 2023
4eec97f
add magnitude pg runs
taodav Dec 4, 2023
dfa8e19
change name to bellman residual, add alpha = 0 run.
taodav Dec 6, 2023
e301b29
add 'residual' argument for bellman err and mstd err
taodav Dec 6, 2023
dfea2df
implemented TD error
taodav Dec 6, 2023
1988ed7
add kitchen sink policies for other objective types, also add tde_kit…
taodav Dec 19, 2023
97c8f83
add script for testing multiple-step bellman vs multiple-step bellman…
taodav Jan 11, 2024
4b0194a
Fix MSTDE (technically mean-squared sarsa error)
camall3n Jan 13, 2024
d3d3103
remove unused arguments for MSTDE
taodav Jan 16, 2024
7cb517d
add functionality for error_type in MSTDE
taodav Jan 16, 2024
73ce282
Merge pull request #17 from camall3n/fix-mstde
taodav Jan 16, 2024
f1f4ce8
remove alpha from kitchen sinks MSTDE runs
taodav Jan 16, 2024
c813741
add mem_lambda_tde_pg
taodav Jan 22, 2024
278532e
fix missing passing of optimizer params!
taodav Jan 23, 2024
9f2a350
fix terminals
taodav Jan 30, 2024
c0f32d1
add stuff
taodav Jan 31, 2024
e90f77a
pass in terminal_mask for tree_unflatten in POMDP
taodav Jan 31, 2024
0679ded
refactored and working multi-dir plotting
taodav Feb 1, 2024
06230bc
Merge pull request #18 from taodav/refactor_plotting
taodav Feb 1, 2024
659f47d
first step of batch_run
taodav Jan 26, 2024
7efabf8
running batch_run! need to still write tests
taodav Jan 28, 2024
761f529
about to add logging info
taodav Jan 29, 2024
1fa19c4
just missing value error
taodav Jan 29, 2024
9986b7b
finished implementing measures
taodav Jan 30, 2024
b8e7866
fix value error count probability dist
taodav Jan 30, 2024
ddfde4f
add logging measures
taodav Jan 31, 2024
be1cbf8
running batch_run with logging
taodav Jan 31, 2024
07c3150
renamed to batch_run_pg
taodav Jan 31, 2024
e66371a
split 8 mem state runs into two seperate runs
taodav Jan 31, 2024
97dee03
missing comma
taodav Jan 31, 2024
447a322
split seeds up even more
taodav Feb 1, 2024
7cd0fcd
okay first... only do 2 and 4 mem bits
taodav Feb 1, 2024
5d20e83
change run_gpu_locally for batch_run_pg
taodav Feb 1, 2024
20256ce
about to reparse results from batch_run
taodav Feb 2, 2024
3da11e0
add reorg script, with WIP parse_experiments for batch_runs
taodav Feb 5, 2024
b9595e7
parsing experiments
taodav Feb 5, 2024
3271e3e
add parser and plotter for batch_run
taodav Feb 5, 2024
e644cda
plotting progress
taodav Feb 6, 2024
5d0358b
need to normalize
taodav Feb 6, 2024
2e8ac49
debugging initial policy in batch_run
taodav Feb 6, 2024
67f084d
fix plotting
taodav Feb 6, 2024
b1ee203
change the initial policy improvement step to pg
taodav Feb 6, 2024
c25a79f
add 3 bit runs for batch_run_pg
taodav Feb 6, 2024
eedcdf3
[WIP] Add script for comparing TDE vs LD vs Value Error
camall3n Jan 13, 2024
039c2d9
Update compare script
camall3n Feb 12, 2024
92213e6
revert back to non-jitted memory improvement, and add mem_tde_01 runs
taodav Feb 12, 2024
a1af23b
Clean up TD-error script, add bar plots
camall3n Feb 13, 2024
c493163
add interleave hyperparams
taodav Feb 21, 2024
582622d
remove unneeded arguments
taodav Feb 21, 2024
4b990d3
add log_every
taodav Feb 21, 2024
b36b272
add --objective
taodav Feb 21, 2024
bbb71f2
new interleave runs
taodav Feb 21, 2024
47aa192
add batch_run fixes
taodav Feb 21, 2024
df7b0ad
Merge pull request #19 from taodav/interleave
taodav Feb 21, 2024
542d613
fix interleave
taodav Feb 21, 2024
d2dbdc7
fix batch_run
taodav Feb 22, 2024
94cb140
add residual arg
taodav Feb 22, 2024
91be4d3
add plotting grads script
taodav Feb 23, 2024
dcdec22
fix policy iteration
taodav Feb 23, 2024
b52422b
Add example 26 (and alternate)
camall3n Mar 8, 2024
d00d808
Add script to check example 26
camall3n Mar 8, 2024
596084c
add initial implementation for four_tmaze
taodav Mar 8, 2024
715baee
add compass world
taodav Mar 11, 2024
181cf10
move goal position to middle of left wall for compass world
taodav Mar 11, 2024
19ceab2
add batch_run_kitchen_sinks
taodav Mar 15, 2024
0ab6922
running batch kitchen sinks
taodav Mar 15, 2024
ddff824
add batch_run_kitchen
taodav Mar 15, 2024
e714078
[Merge] taodav/main into learning-agent
camall3n Apr 11, 2024
38a9533
Remove duplicate value_error implementation
camall3n Apr 11, 2024
e579f3e
[Merge] PR #20 - Add script to compare error signals
camall3n Apr 19, 2024
737a50a
[Sync] with taodav/main
camall3n Apr 19, 2024
ac05b27
Add hill climbing for MSTDE
camall3n Apr 22, 2024
277 changes: 257 additions & 20 deletions grl/agent/actorcritic.py

Large diffs are not rendered by default.

120 changes: 84 additions & 36 deletions grl/agent/analytical.py
@@ -8,12 +8,28 @@
import optax

from grl.mdp import POMDP
from grl.utils.loss import policy_discrep_loss, pg_objective_func
from grl.utils.loss import mem_discrep_loss, mem_magnitude_td_loss, obs_space_mem_discrep_loss
from grl.utils.math import glorot_init
from grl.utils.policy import construct_aug_policy
from grl.utils.loss import policy_discrep_loss, pg_objective_func, \
mem_pg_objective_func, unrolled_mem_pg_objective_func
from grl.utils.loss import mem_discrep_loss, mem_bellman_loss, mem_tde_loss, obs_space_mem_discrep_loss
from grl.utils.math import glorot_init, reverse_softmax
from grl.utils.optimizer import get_optimizer
from grl.vi import policy_iteration_step

def new_pi_over_mem(pi_params: jnp.ndarray, add_n_mem_states: int,
new_mem_pi: str = 'repeat'):
old_pi_params_shape = pi_params.shape

pi_params = pi_params.repeat(add_n_mem_states, axis=0)

if new_mem_pi == 'random':
# randomly init policy for new memory state
new_mem_params = glorot_init(old_pi_params_shape)
pi_params = pi_params.at[1::2].set(new_mem_params)

return pi_params


class AnalyticalAgent:
"""
Analytical agent that learns optimal policy params based on an
@@ -29,6 +45,7 @@ def __init__(self,
value_type: str = 'v',
error_type: str = 'l2',
objective: str = 'discrep',
residual: bool = False,
lambda_0: float = 0.,
lambda_1: float = 1.,
alpha: float = 1.,
@@ -43,7 +60,7 @@ def __init__(self,
:param mem_params: Memory parameters (optional)
:param value_type: If we optimize lambda discrepancy, what type of lambda discrepancy do we optimize? (v | q)
:param error_type: lambda discrepancy error type (l2 | abs)
:param objective: What objective are we trying to minimize? (discrep | magnitude)
:param objective: What objective are we trying to minimize? (discrep | bellman | tde)
:param pi_softmax_temp: When we take the softmax over pi_params, what is the softmax temperature?
:param policy_optim_alg: What type of policy optimization do we do? (pi | pg)
(discrep_max: discrepancy maximization | discrep_min: discrepancy minimization
@@ -58,13 +75,18 @@ def __init__(self,
self.og_n_obs = self.pi_params.shape[0]

self.pg_objective_func = jit(pg_objective_func)
if self.policy_optim_alg == 'policy_mem_grad':
self.pg_objective_func = jit(mem_pg_objective_func)
elif self.policy_optim_alg == 'policy_mem_grad_unrolled':
self.pg_objective_func = jit(unrolled_mem_pg_objective_func)

self.policy_iteration_update = jit(policy_iteration_step, static_argnames=['eps'])
self.epsilon = epsilon

self.val_type = value_type
self.error_type = error_type
self.objective = objective
self.residual = residual
self.lambda_0 = lambda_0
self.lambda_1 = lambda_1
self.alpha = alpha
@@ -77,19 +99,29 @@ def __init__(self,

self.new_mem_pi = new_mem_pi

self.optim_str = optim_str
# initialize optimizers
self.pi_lr = pi_lr
self.pi_optim = get_optimizer(optim_str, self.pi_lr)
self.pi_optim_state = self.pi_optim.init(self.pi_params)

self.mem_params = None
if mem_params is not None:
self.mem_params = mem_params

if self.policy_optim_alg in ['policy_mem_grad', 'policy_mem_grad_unrolled']:
mem_probs, pi_probs = softmax(self.mem_params, -1), softmax(self.pi_params, -1)
aug_policy = construct_aug_policy(mem_probs, pi_probs)
self.pi_aug_params = reverse_softmax(aug_policy)

self.mi_lr = mi_lr
self.mem_optim = get_optimizer(optim_str, self.mi_lr)
self.mem_optim_state = self.mem_optim.init(self.mem_params)

# initialize optimizers
self.optim_str = optim_str
self.pi_lr = pi_lr
self.pi_optim = get_optimizer(optim_str, self.pi_lr)

pi_params_to_optimize = self.pi_params
if self.policy_optim_alg in ['policy_mem_grad', 'policy_mem_grad_unrolled']:
pi_params_to_optimize = self.pi_aug_params
self.pi_optim_state = self.pi_optim.init(pi_params_to_optimize)

self.pi_softmax_temp = pi_softmax_temp

self.rand_key = rand_key
@@ -113,19 +145,25 @@ def init_and_jit_objectives(self):
self.policy_discrep_objective_func = jit(partial_policy_discrep_loss)

mem_loss_fn = mem_discrep_loss
partial_kwargs = {
'value_type': self.val_type,
'error_type': self.error_type,
'lambda_0': self.lambda_0,
'lambda_1': self.lambda_1,
'alpha': self.alpha,
'flip_count_prob': self.flip_count_prob
}
if hasattr(self, 'objective'):
if self.objective == 'magnitude':
mem_loss_fn = mem_magnitude_td_loss
if self.objective == 'bellman':
mem_loss_fn = mem_bellman_loss
partial_kwargs['residual'] = self.residual
elif self.objective == 'tde':
mem_loss_fn = mem_tde_loss
partial_kwargs['residual'] = self.residual
elif self.objective == 'obs_space':
mem_loss_fn = obs_space_mem_discrep_loss

partial_mem_discrep_loss = partial(mem_loss_fn,
value_type=self.val_type,
error_type=self.error_type,
lambda_0=self.lambda_0,
lambda_1=self.lambda_1,
alpha=self.alpha,
flip_count_prob=self.flip_count_prob)
partial_mem_discrep_loss = partial(mem_loss_fn, **partial_kwargs)
self.memory_objective_func = jit(partial_mem_discrep_loss)

@property
@@ -143,21 +181,17 @@ def reset_pi_params(self, pi_shape: Sequence[int] = None):
if pi_shape is None:
pi_shape = self.pi_params.shape
self.pi_params = glorot_init(pi_shape)
self.pi_optim_state = self.pi_optim.init(self.pi_params)

def new_pi_over_mem(self):
if self.pi_params.shape[0] != self.og_n_obs:
raise NotImplementedError(
"Have not implemented adding bits to already existing memory.")

add_n_mem_states = self.mem_params.shape[-1]
old_pi_params_shape = self.pi_params.shape

self.pi_params = self.pi_params.repeat(add_n_mem_states, axis=0)

if self.new_mem_pi == 'random':
# randomly init policy for new memory state
new_mem_params = glorot_init(old_pi_params_shape)
self.pi_params = self.pi_params.at[1::2].set(new_mem_params)
self.pi_params = new_pi_over_mem(self.pi_params,
add_n_mem_states=add_n_mem_states,
new_mem_pi=self.new_mem_pi)

@partial(jit, static_argnames=['self'])
def policy_gradient_update(self, params: jnp.ndarray, optim_state: jnp.ndarray, pomdp: POMDP):
@@ -169,7 +203,7 @@ def policy_gradient_update(self, params: jnp.ndarray, optim_state: jnp.ndarray,
params_grad = -params_grad
updates, optimizer_state = self.pi_optim.update(params_grad, optim_state, params)
params = optax.apply_updates(params, updates)
return v_0, td_v_vals, td_q_vals, params
return v_0, td_v_vals, td_q_vals, params, optimizer_state

@partial(jit, static_argnames=['self', 'sign'])
def policy_discrep_update(self,
@@ -187,12 +221,15 @@ def policy_discrep_update(self,
updates, optimizer_state = self.pi_optim.update(params_grad, optim_state, params)
params = optax.apply_updates(params, updates)

return loss, mc_vals, td_vals, params
return loss, mc_vals, td_vals, params, optimizer_state

def policy_improvement(self, pomdp: POMDP):
if self.policy_optim_alg == 'policy_grad':
v_0, prev_td_v_vals, prev_td_q_vals, new_pi_params = \
self.policy_gradient_update(self.pi_params, self.pi_optim_state, pomdp)
if self.policy_optim_alg in ['policy_grad', 'policy_mem_grad', 'policy_mem_grad_unrolled']:
policy_params = self.pi_params
if self.policy_optim_alg in ['policy_mem_grad', 'policy_mem_grad_unrolled']:
policy_params = self.pi_aug_params
v_0, prev_td_v_vals, prev_td_q_vals, new_pi_params, new_optim_state= \
self.policy_gradient_update(policy_params, self.pi_optim_state, pomdp)
output = {
'v_0': v_0,
'prev_td_q_vals': prev_td_q_vals,
Expand All @@ -201,17 +238,23 @@ def policy_improvement(self, pomdp: POMDP):
elif self.policy_optim_alg == 'policy_iter':
new_pi_params, prev_td_v_vals, prev_td_q_vals = self.policy_iteration_update(
self.pi_params, pomdp, eps=self.epsilon)
new_optim_state = self.pi_optim_state
output = {'prev_td_q_vals': prev_td_q_vals, 'prev_td_v_vals': prev_td_v_vals}
elif self.policy_optim_alg == 'discrep_max' or self.policy_optim_alg == 'discrep_min':
loss, mc_vals, td_vals, new_pi_params = self.policy_discrep_update(
loss, mc_vals, td_vals, new_pi_params, new_optim_state = self.policy_discrep_update(
self.pi_params,
self.pi_optim_state,
pomdp,
sign=(self.policy_optim_alg == 'discrep_max'))
output = {'loss': loss, 'mc_vals': mc_vals, 'td_vals': td_vals}
else:
raise NotImplementedError
self.pi_params = new_pi_params

if self.policy_optim_alg in ['policy_mem_grad', 'policy_mem_grad_unrolled']:
self.pi_aug_params = new_pi_params
else:
self.pi_params = new_pi_params
self.pi_optim_state = new_optim_state
return output

@partial(jit, static_argnames=['self'])
@@ -224,13 +267,14 @@ def memory_update(self, params: jnp.ndarray, optim_state: jnp.ndarray, pi_params
updates, optimizer_state = self.mem_optim.update(params_grad, optim_state, params)
params = optax.apply_updates(params, updates)

return loss, params
return loss, params, optimizer_state

def memory_improvement(self, pomdp: POMDP):
assert self.mem_params is not None, 'I have no memory params'
loss, new_mem_params = self.memory_update(self.mem_params, self.mem_optim_state,
loss, new_mem_params, new_mem_optim_state = self.memory_update(self.mem_params, self.mem_optim_state,
self.pi_params, pomdp)
self.mem_params = new_mem_params
self.mem_optim_state = new_mem_optim_state
return loss

def __getstate__(self) -> dict:
@@ -254,6 +298,10 @@ def __setstate__(self, state: dict):

# restore jitted functions
self.pg_objective_func = jit(pg_objective_func)
if self.policy_optim_alg == 'policy_mem_grad':
self.pg_objective_func = jit(mem_pg_objective_func)
elif self.policy_optim_alg == 'policy_mem_grad_unrolled':
self.pg_objective_func = jit(unrolled_mem_pg_objective_func)
self.policy_iteration_update = jit(policy_iteration_step, static_argnames=['eps'])

if 'optim_str' not in state:
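A note on the recurring pattern in the analytical.py diff above: `policy_gradient_update`, `policy_discrep_update`, and `memory_update` now return the optax optimizer state alongside the loss and params, and `policy_improvement` / `memory_improvement` store it. Below is a minimal, self-contained sketch of that pattern; the quadratic loss, shapes, and learning rate are placeholders for illustration, not this repo's actual objectives.

```python
import jax
import jax.numpy as jnp
import optax

def loss_fn(params, batch):
    # Placeholder objective; stands in for e.g. the memory discrepancy loss.
    return jnp.mean((batch @ params) ** 2)

optimizer = optax.adam(1e-2)

@jax.jit
def update(params, opt_state, batch):
    loss, grads = jax.value_and_grad(loss_fn)(params, batch)
    updates, new_opt_state = optimizer.update(grads, opt_state, params)
    new_params = optax.apply_updates(params, updates)
    # Returning new_opt_state is the point: if the caller keeps reusing the
    # initial state, Adam's moment estimates are silently reset every step.
    return loss, new_params, new_opt_state

params = jnp.zeros(4)
opt_state = optimizer.init(params)
batch = jnp.ones((8, 4))
for _ in range(3):
    loss, params, opt_state = update(params, opt_state, batch)
```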
19 changes: 17 additions & 2 deletions grl/agent/td_lambda.py
@@ -39,7 +39,17 @@ def _reset_q_values(self):
def _reset_eligibility(self):
self.eligibility = np.zeros((self.n_actions, self.n_obs))

def update(self, obs, action, reward, terminal, next_obs, next_action):
def update(
self,
obs,
action,
reward,
terminal,
next_obs,
next_action,
aug_obs=None, # memory-augmented observation
next_aug_obs=None, # and next observation (O x M)
):
# Because mdp.step() terminates with probability (1-γ),
# we have already factored in the γ that we would normally
# use to decay the eligibility.
@@ -50,6 +60,11 @@ def update(self, obs, action, reward, terminal, next_obs, next_action):
# probability γ.
#
# Thus we simply decay eligibility by λ.
if aug_obs is not None:
obs = aug_obs
if next_aug_obs is not None:
next_obs = next_aug_obs

self.eligibility *= self.lambda_
if self.trace_type == 'accumulating':
self.eligibility[action, obs] += 1
@@ -84,7 +99,7 @@ def run_td_lambda_on_mdp(
alpha=1,
n_episodes=1000,
):
# If AMDP, convert to pi_ground
# If POMDP, convert to pi_ground
if hasattr(mdp, 'phi'):
pi_ground = mdp.get_ground_policy(pi)
else:
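The td_lambda.py diff adds optional `aug_obs` / `next_aug_obs` arguments to `update()`, which replace `obs` / `next_obs` so the Q-table and eligibility trace are indexed by memory-augmented observations. A hypothetical caller-side sketch follows; the flat `(obs, mem)` index encoding is an assumption for illustration, not necessarily the convention used elsewhere in this PR.

```python
# Assumed sizes, for illustration only.
n_obs, n_mem_states = 5, 2

def augment(obs: int, mem: int) -> int:
    """Flatten an (observation, memory-state) pair into one index over O x M."""
    assert 0 <= obs < n_obs and 0 <= mem < n_mem_states
    return obs * n_mem_states + mem

# A caller tracking the current/next memory state could then pass, e.g.:
#   agent.update(obs, action, reward, terminal, next_obs, next_action,
#                aug_obs=augment(obs, mem),
#                next_aug_obs=augment(next_obs, next_mem))
print(augment(3, 1))  # -> 7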
71 changes: 60 additions & 11 deletions grl/environment/__init__.py
@@ -1,28 +1,77 @@
from argparse import Namespace

import jax
import gymnasium as gym
import numpy as np
from numpy import random
import popgym
from popgym.wrappers import Flatten

from .rocksample import RockSample
from .spec import load_spec, load_pomdp
from .wrappers import OneHotObservationWrapper, OneHotActionConcatWrapper
from .wrappers import OneHotObservationWrapper, OneHotActionConcatWrapper, \
FlattenMultiDiscreteActionWrapper, DiscreteObservationWrapper, \
ContinuousToDiscrete, ArrayObservationWrapper

def get_popgym_env(args: Namespace, rand_key: random.RandomState = None, **kwargs):
# check to see if name exists
env_names = set([e["id"] for e in popgym.envs.ALL.values()])
if args.spec not in env_names:
raise AttributeError(f"spec {args.spec} not found")
# wrappers fail unless disable_env_checker=True
env = gym.make(args.spec, disable_env_checker=True)
env.reset(seed=args.seed)
env.rand_key = rand_key
env.gamma = args.gamma

return env

def get_env(args: Namespace,
rand_state: np.random.RandomState = None,
rand_key: jax.random.PRNGKey = None,
action_bins: int = 6,
**kwargs):
"""
:param action_bins: If we have a continous action space, how many bins do we discretize to?
"""
# First we check our POMDP specs
try:
env, _ = load_pomdp(args.spec, rand_key=rand_state, **kwargs)

# TODO: some features are already encoded in a one-hot manner.
if args.feature_encoding == 'one_hot':
env = OneHotObservationWrapper(env)
except AttributeError:
if args.spec == 'rocksample':
env = RockSample(rand_key=rand_key, **kwargs)
else:
raise NotImplementedError
# try to load from popgym
# validate input: we need a custom gamma for popgym args as they don't come with a gamma
if args.gamma is None:
raise AttributeError("Can't load non-native environments without passing in gamma!")
try:
env, _ = load_pomdp(args.spec, rand_key=rand_state, **kwargs)

except AttributeError:
# try to load from popgym
# validate input: we need a custom gamma for popgym args as they don't come with a gamma
if args.gamma is None:
raise AttributeError(
"Can't load non-native environments without passing in gamma!")
try:
env = get_popgym_env(args, rand_key=rand_state, **kwargs)

env = Flatten(env)
# also might need to preprocess our observation spaces
if isinstance(env.observation_space, gym.spaces.Discrete)\
and args.feature_encoding != 'one_hot':
env = DiscreteObservationWrapper(env)
if isinstance(env.observation_space, gym.spaces.Tuple):
env = ArrayObservationWrapper(env)

# preprocess continous action spaces
if isinstance(env.action_space, gym.spaces.Box):
env = ContinuousToDiscrete(env, action_bins)
elif isinstance(env.action_space, gym.spaces.MultiDiscrete):
env = FlattenMultiDiscreteActionWrapper(env)

except AttributeError:
# don't have anything else implemented
raise NotImplementedError

if args.feature_encoding == 'one_hot':
env = OneHotObservationWrapper(env)

if args.action_cond == 'cat':
env = OneHotActionConcatWrapper(env)
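Rough usage sketch for the updated `get_env()`: POPGym specs are only reached when the native POMDP loader fails, and they require an explicit `gamma` since POPGym environments do not define one. The spec id and argument values below are illustrative and untested against this branch.

```python
from argparse import Namespace

from grl.environment import get_env

args = Namespace(
    spec='popgym-RepeatPreviousEasy-v0',  # assumed id; check popgym.envs.ALL for valid names
    gamma=0.99,                           # required for non-native environments
    seed=2024,
    feature_encoding='one_hot',           # wraps with OneHotObservationWrapper
    action_cond=None,                     # 'cat' would add OneHotActionConcatWrapper
)

env = get_env(args)
obs, info = env.reset()
```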