Gym debugging environments from Andy Jones's blog [https://andyljones.com/posts/rl-debugging.html]
""" | |
based on https://andyljones.com/posts/rl-debugging.html | |
Documentation is quoted from that blogpost. | |
The usual advice to people writing RL algorithms is to use a simple environment | |
like the classic control ones from the Gym. Thing is, these envs have the same | |
problem as looking at loss curves: at best they give you a noisy indicator, and | |
if the noisy indicator looks poor you don't know why it looks poor. They don't | |
localise errors. | |
Instead, construct environments that do localise errors. | |
""" | |
from abc import ABC | |
import gymnasium as gym | |
import numpy as np | |


class BasicEnv(gym.Env, ABC):
    """
    Base class of a basic env with a discrete action space and a 1d-numpy
    discrete observation space.
    The default observation is random, and there is no default reward.
    After reset(), info["history"] holds the full history of the episode that
    just ended.
    Tracks useful attributes:
    self.timestep, self.observation_history, self.action_history,
    self.reward_history, self.terminated
    """

    def __init__(
        self,
        n_actions=1,
        n_observations=1,
        episode_len=1,
        obs_dtype=np.float32,
        reward_type=float,
    ):
        super().__init__()
        self.action_space = gym.spaces.Discrete(n_actions)
        # we use MultiDiscrete with size 1 to get a np.array with size 1
        self.observation_space = gym.spaces.MultiDiscrete([n_observations])
        self.n_actions = n_actions
        self.n_observations = n_observations
        self.episode_len = episode_len
        self.obs_dtype = obs_dtype
        self.reward_type = reward_type
        # initialized here so the first reset() can report an empty history
        self.timestep = 0
        self.observation_history = []
        self.action_history = []
        self.reward_history = []
        self.terminated = False

    @property
    def possible_observations(self):
        """List of all possible observations"""
        return [
            np.array([i]).astype(self.obs_dtype) for i in range(self.n_observations)
        ]

    def _get_obs(self, is_reset: bool):
        return self.observation_space.sample().astype(self.obs_dtype)

    def _get_reward(self, action):
        raise NotImplementedError

    def _is_terminated(self):
        return self.timestep == self.episode_len

    def _get_info(self):
        return {}

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.timestep = 0
        observation = self._get_obs(is_reset=True)
        info = self._get_info()
        # expose the history of the episode that just ended
        info["history"] = {
            "observation": self.observation_history,
            "action": self.action_history,
            "reward": self.reward_history,
        }
        self.observation_history = [observation]
        self.action_history = []
        self.reward_history = []
        self.terminated = False
        return observation, info

    def step(self, action):
        if self.terminated:
            raise RuntimeError("don't step a terminated environment")
        self.timestep += 1
        observation = self._get_obs(is_reset=False)
        # the reward is computed before the histories are updated, so
        # self.observation_history[-1] is the observation the action was based on
        reward = self._get_reward(action)
        terminated = self._is_terminated()
        info = self._get_info()
        self.observation_history.append(observation)
        self.action_history.append(action)
        self.reward_history.append(reward)
        self.terminated = terminated
        return observation, reward, terminated, False, info

    def render(self):
        pass


class ConstRewardEnv(BasicEnv):
    """
    +1 reward every time
    This isolates the value network.
    If my agent can't learn that the value of the only observation it ever sees
    is 1, there's a problem with the value loss calculation or the optimizer.
    Expected value (obs -> value for each action):
    [0] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, n_observations=1, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        return self.reward_type(1.0)


class ObservedRewardEnv(BasicEnv):
    """
    obs-dependent reward every time
    If my agent can learn the value in ConstRewardEnv but not this one, meaning
    it can learn a constant reward but not a predictable one, it must be that
    backpropagation through my network is broken.
    Expected value (obs -> value for each action):
    [0] -> [0.0]
    [1] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, n_observations=2, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        # reward is 1 when the observation the agent last saw was nonzero
        return self.reward_type(self.observation_history[-1].item() != 0)


class FutureRewardEnv(BasicEnv):
    """
    two timesteps long, +1 reward at the end
    If my agent can learn the value in ObservedRewardEnv but not this one, it
    must be that my reward discounting is broken.
    Expected value (obs -> value for each action):
    [0] -> [1.0 * discount_rate]
    [1] -> [1.0]
    """

    version = 0

    def __init__(self, n_actions=1, episode_len=2):
        super().__init__(
            n_actions=n_actions, n_observations=episode_len, episode_len=episode_len
        )

    def _get_obs(self, is_reset: bool):
        return np.array([self.timestep], dtype=self.obs_dtype)

    def _get_reward(self, action):
        return self.reward_type(self._is_terminated())


class ActionRewardEnv(BasicEnv):
    """
    action-dependent reward
    The first env to exercise the policy. If my agent can't learn to pick the
    better action, there's something wrong with either my advantage
    calculations, my policy loss, or my policy update.
    That's three things, but it's easy to work out by hand the expected values
    for each one and check that the values produced by your actual code line up
    with them.
    Expected value (obs -> value for each action):
    [0] -> [0.0, 1.0]
    """

    version = 0

    def __init__(self, n_actions=2, n_observations=1, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        return self.reward_type(action != 0)


class ActionObservationRewardEnv(BasicEnv):
    """
    action-and-obs dependent reward
    Now we've got a dependence on both obs and action. The policy and value
    networks interact here, so there are a couple of things to verify:
    that the policy network learns to pick the right action in each of the two
    states, and that the value network learns that the value of each state
    is +1.
    If everything's worked up until now, then if - for example - the value
    network fails to learn here, it likely means your batching process is
    feeding the value network stale experience.
    Expected value (obs -> value for each action):
    [0] -> [0.0, 1.0]
    [1] -> [1.0, 0.0]
    """

    version = 0

    def __init__(self, n_actions=2, n_observations=2, episode_len=1):
        super().__init__(
            n_actions=n_actions, n_observations=n_observations, episode_len=episode_len
        )

    def _get_reward(self, action):
        # reward is 1 when the action differs from the observation the agent saw
        return self.reward_type(action != self.observation_history[-1].item())
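

# Not part of the original gist: a sketch of how the "Expected value" tables in
# the docstrings above might be checked against a trained agent. `value_fn` is a
# hypothetical stand-in for whatever maps an observation to predicted per-action
# values in your own code.
def check_expected_values(env, value_fn, expected, atol=0.1):
    """Compare value_fn's predictions on every possible observation with
    `expected`, a dict mapping observation index -> list of per-action values."""
    for i, obs in enumerate(env.possible_observations):
        predicted = np.asarray(value_fn(obs), dtype=float)
        target = np.asarray(expected[i], dtype=float)
        assert np.allclose(predicted, target, atol=atol), (
            f"obs {obs}: predicted {predicted}, expected {target}"
        )


# example (with a hypothetical value_fn):
# check_expected_values(ActionRewardEnv(), value_fn, {0: [0.0, 1.0]})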


for env in [
    ConstRewardEnv,
    ObservedRewardEnv,
    FutureRewardEnv,
    ActionRewardEnv,
    ActionObservationRewardEnv,
]:
    gym.register(id=f"gym_probes/{env.__name__}-v{env.version}", entry_point=env)
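

# Not part of the original gist: a minimal smoke-test sketch showing how a probe
# env might be rolled out with random actions, and how the per-episode history
# described in BasicEnv can be inspected. The class is instantiated directly;
# gym.make("gym_probes/ActionRewardEnv-v0") would also work via the
# registration above.
if __name__ == "__main__":
    env = ActionRewardEnv()
    observation, info = env.reset(seed=0)
    terminated = False
    while not terminated:
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        print(f"obs={observation}, action={action}, reward={reward}")
    # after the next reset, info["history"] holds the episode that just ended
    observation, info = env.reset()
    print(info["history"])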