
Commit 3ce485f

Update seeding process on env.reset(); update np_random to be _np_random for the envs' PRNG; update Mujoco env versions from v3 to v4; all in accordance with Gymnasium v0.29.0
1 parent febad86 commit 3ce485f
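For context, this commit migrates to the reset-based seeding used from Gymnasium v26 (per the comments in the diffs below): environments are seeded through reset() rather than a separate env.seed() call. A minimal sketch of the pattern (assuming a standard Gymnasium >= 0.26 install; this is not code from the commit):

```python
import gymnasium as gym

env = gym.make("CartPole-v1")

# Seeding happens in reset() and initialises the env's internal _np_random PRNG.
obs, info = env.reset(seed=0)

# Later resets without a seed continue the same PRNG stream.
obs, info = env.reset()
```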

File tree: 6 files changed (+150, -71 lines)


README.md

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ pip install -r requirements.txt
 pip install -e .[extras_disc]
 ```

-Please follow the following commands to install for the continuous and complex experiments. **IMPORTANT**: In case, you do not have MuJoCo, please ignore any mujoco-py related installation errors below:
+Please use the following commands to install for the continuous and complex experiments. **IMPORTANT**: In case you do not have MuJoCo, please ignore any mujoco-related installation errors below:
 ```bash
 conda create -n py36_toy_rl_cont_comp python=3.6
 conda activate py36_toy_rl_cont_comp

example.py

Lines changed: 79 additions & 12 deletions
@@ -11,8 +11,8 @@
 one for grid environments with image representations
 one for wrapping Atari env qbert
 one for wrapping Mujoco env HalfCheetah
-one for wrapping MiniGrid env
-one for wrapping ProcGen env
+one for wrapping MiniGrid env  # Currently commented out due to some errors
+one for wrapping ProcGen env  # Currently commented out due to some errors
 two examples at the end showing how to create toy envs using gym.make()

 Many further examples can be found in test_mdp_playground.py.
@@ -383,32 +383,98 @@ def atari_wrapper_example():
         display_image(next_state)


-def mujoco_wrapper_example():
+def mujoco_wrapper_examples():

+    # For Mujoco envs, a few specific dimensions need to be changed by fiddling with
+    # attributes of the MujocoEnv class. This is achieved through a Mujoco
+    # wrapper that subclasses the Mujoco env and modifies relevant properties.
+    # Please see the documentation of mujoco_env_wrapper.py for more details.
+    # Below, we specify 2 dicts: one for the specific dimensions that are changed
+    # using the Mujoco wrapper and the other for the general dimensions that are
+    # changed using a GymEnvWrapper.
+
+    # 1: Mujoco wrapper config:
     # The scalar values for the dimensions passed in this dict are used to
     # multiply the base environments' values. For these Mujoco envs, the
     # time_unit is achieved by multiplying the Gym Mujoco env's frame_skip and
-    # thus will be the integer part of time_unit * frame_skip. The time_unit
+    # thus will be the integer part of time_unit * frame_skip. (For HalfCheetah-v4
+    # and Pusher-v4, frame_skip is 5; for Reacher-v4, it is 2.) The time_unit
     # is NOT achieved by changing Mujoco's timestep because that would change
     # the numerical integration done by Mujoco and thus the environment
     # dynamics.
-    config = {
-        "seed": 0,
+    mujoco_wrap_config = {
         "action_space_max": 0.5,
         "time_unit": 0.5,
     }

-    # This actually makes a subclass and not a wrapper. Because, some
+    # 2: Gym wrapper config:
+    gym_wrap_config = {
+        "seed": 0,
+        "state_space_type": "continuous",
+        "transition_noise": 0.25,
+    }
+
+    # This makes a subclass and not a wrapper because some
     # frameworks might need an instance of this class to also be an instance
     # of the Mujoco base_class.
     try:
         from mdp_playground.envs import get_mujoco_wrapper
-        from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv

+        # HalfCheetah example
+        from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv
         wrapped_mujoco_env = get_mujoco_wrapper(HalfCheetahEnv)

-        env = wrapped_mujoco_env(**config)
-        state = env.reset()[0]
+        env = wrapped_mujoco_env(**mujoco_wrap_config)
+
+        from mdp_playground.envs import GymEnvWrapper
+        import gymnasium as gym
+        env = GymEnvWrapper(env, **gym_wrap_config)
+
+        # From Gymnasium v26, the seed is set in the reset method.
+        state = env.reset(seed=gym_wrap_config["seed"])[0]
+
+        print(
+            "Taking a step in the environment with a random action and printing the transition:"
+        )
+        action = env.action_space.sample()
+        next_state, reward, done, trunc, info = env.step(action)
+        print("sars', done =", state, action, reward, next_state, done)
+
+        env.close()
+
+        # Pusher example
+        from gymnasium.envs.mujoco.pusher_v4 import PusherEnv
+        wrapped_mujoco_env = get_mujoco_wrapper(PusherEnv)
+
+        env = wrapped_mujoco_env(**mujoco_wrap_config)
+
+        from mdp_playground.envs import GymEnvWrapper
+        import gymnasium as gym
+        env = GymEnvWrapper(env, **gym_wrap_config)
+
+        state = env.reset(seed=gym_wrap_config["seed"])[0]
+
+        print(
+            "Taking a step in the environment with a random action and printing the transition:"
+        )
+        action = env.action_space.sample()
+        next_state, reward, done, trunc, info = env.step(action)
+        print("sars', done =", state, action, reward, next_state, done)
+
+        env.close()
+
+        # Reacher example
+        from gymnasium.envs.mujoco.reacher_v4 import ReacherEnv
+        wrapped_mujoco_env = get_mujoco_wrapper(ReacherEnv)
+
+        env = wrapped_mujoco_env(**mujoco_wrap_config)
+
+        from mdp_playground.envs import GymEnvWrapper
+        import gymnasium as gym
+        env = GymEnvWrapper(env, **gym_wrap_config)
+
+        state = env.reset(seed=gym_wrap_config["seed"])[0]

         print(
             "Taking a step in the environment with a random action and printing the transition:"
@@ -424,7 +490,7 @@ def mujoco_wrapper_example():
             "Exception:",
             type(e),
             e,
-            "caught. You may need to install mujoco-py. NOT running mujoco_wrapper_example.",
+            "caught. You may need to install mujoco with pip. NOT running mujoco_wrapper_examples.",
         )
         return

@@ -567,7 +633,7 @@ def procgen_wrapper_example():
     atari_wrapper_example()

     print(set_ansi_escape + "\nRunning Mujoco wrapper example:\n" + reset_ansi_escape)
-    mujoco_wrapper_example()
+    mujoco_wrapper_examples()

     print(set_ansi_escape + "\nRunning MiniGrid wrapper example:\n" + reset_ansi_escape)
     # minigrid_wrapper_example()
@@ -579,6 +645,7 @@ def procgen_wrapper_example():
     import mdp_playground
     import gymnasium as gym

+    # The following are with seed=None:
     gym.make("RLToy-v0")

     env = gym.make(
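The gym.make() calls above can also be combined with the new reset-based seeding; a minimal sketch (assuming mdp_playground is installed and registers RLToy-v0 on import, as the example above implies):

```python
import mdp_playground  # registers RLToy-v0 with Gymnasium
import gymnasium as gym

env = gym.make("RLToy-v0")
obs, info = env.reset(seed=0)  # seed passed via reset, per Gymnasium v26+
```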

mdp_playground/envs/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -5,4 +5,4 @@
     from mdp_playground.envs.gym_env_wrapper import GymEnvWrapper
     from mdp_playground.envs.mujoco_env_wrapper import get_mujoco_wrapper
 except error.DependencyNotInstalled as e:
-    print("Exception:", type(e), e, "caught. You may need to install Ray or mujoco-py.")
+    print("Exception:", type(e), e, "caught. You may need to install Ray or mujoco with pip.")

mdp_playground/envs/gym_env_wrapper.py

Lines changed: 28 additions & 23 deletions
@@ -55,11 +55,12 @@ def __init__(self, env, **config):
         # during the run of an env, the expectation is that all obs., act. space,
         # etc. seeds are set during that call? Only Atari in Gym seems to do something
         # similar, the others I saw there don't seem to set seed for obs., act. spaces.
-        self.env.seed(
-            seed_int
-        )  # #seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env?
-        obs_space_seed = self.np_random.integers(sys.maxsize).item()  # random
-        act_space_seed = self.np_random.integers(sys.maxsize).item()  # random
+        if "seed" in dir(self.env):  # hack
+            self.env.seed(
+                seed_int
+            )  # #seed ###IMP Apparently Atari also has a seed. :/ Without this, for beam_rider(?), about 1 in 5 times I got reward of 88.0 and 44.0 the remaining times with the same action sequence!! With setting this seed, I got the same reward of 44.0 when I ran about 20 times.; ##TODO If this is really a wrapper, should it be modifying the seed of the env?
+        obs_space_seed = self._np_random.integers(sys.maxsize).item()  # random
+        act_space_seed = self._np_random.integers(sys.maxsize).item()  # random
         self.env.observation_space.seed(obs_space_seed)  # seed
         self.env.action_space.seed(act_space_seed)  # seed
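The hunk above derives observation- and action-space seeds from the wrapper's own PRNG so that space sampling is reproducible as well. The same pattern in isolation (a sketch; a plain NumPy generator stands in for the wrapper's _np_random):

```python
import gymnasium as gym
import numpy as np

env = gym.make("CartPole-v1")
rng = np.random.default_rng(0)  # stand-in for self._np_random

# Derive independent seeds for the spaces from the single env PRNG.
env.observation_space.seed(int(rng.integers(2**31 - 1)))
env.action_space.seed(int(rng.integers(2**31 - 1)))

print(env.action_space.sample())  # deterministic given the seeds above
```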

@@ -207,7 +208,7 @@ def __init__(self, env, **config):
             # self.irrelevant_features = config["irrelevant_features"]
             irr_toy_env_conf = config["irrelevant_features"]
             if "seed" not in irr_toy_env_conf:
-                irr_toy_env_conf["seed"] = self.np_random.integers(sys.maxsize).item()  # random
+                irr_toy_env_conf["seed"] = self._np_random.integers(sys.maxsize).item()  # random

             if config["state_space_type"] == "discrete":
                 pass
@@ -340,15 +341,15 @@ def step(self, action):
                 probs[action] = 1 - self.transition_noise
                 old_action = action
                 action = int(
-                    self.np_random.choice(self.env.action_space.n, size=1, p=probs)
+                    self._np_random.choice(self.env.action_space.n, size=1, p=probs)
                 )  # random
                 if old_action != action:
                     # print("NOISE inserted", old_action, action)
                     self.total_noisy_transitions_episode += 1
         else:  # cont. envs
             if self.transition_noise is not None:
                 noise_in_transition = (
-                    self.transition_noise(self.np_random)
+                    self.transition_noise(self._np_random)
                     if self.transition_noise
                     else 0
                 )  # #random
@@ -400,7 +401,7 @@ def step(self, action):
         # action and time_step as well. Would need to change implementation to
         # have a queue for the rewards achieved and then pick the reward that was
         # generated delay timesteps ago.
-        noise_in_reward = self.reward_noise(self.np_random) if self.reward_noise else 0
+        noise_in_reward = self.reward_noise(self._np_random) if self.reward_noise else 0
         self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward)
         self.total_reward_episode += reward
         reward += noise_in_reward
@@ -409,7 +410,11 @@ def step(self, action):

         return next_state, reward, done, trunc, info

-    def reset(self):
+    def reset(self, seed=None):
+        '''
+        From Gymnasium v26, the reset method has a seed parameter.
+        '''
+
         # on episode "end" stuff (to not be invoked when reset() called when
         # self.total_episodes = 0; end is in quotes because it may not be a true
         # episode end reached by reaching a terminal state, but reset() may have
@@ -445,18 +450,18 @@ def reset(self):

         if "irrelevant_features" in self.config:
             if self.config["state_space_type"] == "discrete":
-                reset_state = self.env.reset()[0]
-                reset_state_irr = self.irr_toy_env.reset()[0]
-                reset_state = tuple([reset_state, reset_state_irr])
+                reset_state, reset_state_info = self.env.reset(seed=seed)
+                reset_state_irr, reset_state_irr_info = self.irr_toy_env.reset(seed=seed)
+                reset_state = tuple([reset_state, reset_state_irr]), tuple([reset_state_info, reset_state_irr_info])
             else:
-                reset_state = self.env.reset()[0]
-                reset_state_irr = self.irr_toy_env.reset()[0]
-                reset_state = np.concatenate((reset_state, reset_state_irr))
+                reset_state, reset_state_info = self.env.reset(seed=seed)
+                reset_state_irr, reset_state_irr_info = self.irr_toy_env.reset(seed=seed)
+                reset_state = np.concatenate((reset_state, reset_state_irr)), tuple([reset_state_info, reset_state_irr_info])
         else:
-            reset_state = self.env.reset()[0]
+            reset_state = self.env.reset(seed=seed)

         if self.image_transforms:
-            reset_state = self.get_transformed_image(reset_state)
+            reset_state = (self.get_transformed_image(reset_state[0]), reset_state[1])

         return reset_state
         # return super(GymEnvWrapper, self).reset()
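The essence of the new reset() is to accept a seed, forward it to the wrapped env, and handle the (obs, info) pair that Gymnasium's reset() now returns. A standalone sketch of that pattern (hypothetical class, not the repo's wrapper):

```python
import gymnasium as gym


class SeedForwardingWrapper(gym.Wrapper):
    # Forwards reset(seed=...) to the wrapped env, as GymEnvWrapper.reset() does above.
    def reset(self, seed=None, **kwargs):
        obs, info = self.env.reset(seed=seed, **kwargs)
        return obs, info


env = SeedForwardingWrapper(gym.make("CartPole-v1"))
obs, info = env.reset(seed=0)
```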
@@ -467,15 +472,15 @@ def seed(self, seed=None):
         Parameters
         ----------
         seed : int
-            seed to initialise the np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it.
+            seed to initialise the _np_random instance held by the environment. Cannot use numpy.int64 or similar because Gym doesn't accept it.

         Returns
         -------
         int
             The seed returned by Gym
         """
         # If seed is None, you get a randomly generated seed from gymnasium.utils...
-        self.np_random, self.seed_ = gym.utils.seeding.np_random(seed)  # random
+        self._np_random, self.seed_ = gym.utils.seeding.np_random(seed)  # random
         print(
             "Env SEED set to: "
             + str(seed)
540545
# + str(min_R)
541546
# )
542547
# min_R = np.log(min_R)
543-
# log_sample = min_R + self.np_random.random() * (max_R - min_R)
548+
# log_sample = min_R + self._np_random.random() * (max_R - min_R)
544549
# sample_ = np.exp(log_sample)
545550
# R = int(sample_)
546551
# # print("R", min_R, max_R)
547552
#
548553
if "shift" in self.image_transforms:
549554
max_shift_w = (tot_width - R) // 2
550555
max_shift_h = (tot_height - R) // 2
551-
add_shift_w = self.np_random.integers(-max_shift_w + 1, max_shift_w).item()
552-
add_shift_h = self.np_random.integers(-max_shift_h + 1, max_shift_h).item()
556+
add_shift_w = self._np_random.integers(-max_shift_w + 1, max_shift_w).item()
557+
add_shift_h = self._np_random.integers(-max_shift_h + 1, max_shift_h).item()
553558
# print("add_shift_w, add_shift_h", add_shift_w, add_shift_h)
554559
add_shift_w = int(add_shift_w / sh_quant) * sh_quant
555560
add_shift_h = int(add_shift_h / sh_quant) * sh_quant

mdp_playground/envs/mujoco_env_wrapper.py

Lines changed: 11 additions & 5 deletions
@@ -1,20 +1,20 @@
 # from gymnasium.envs.mujoco.mujoco_env import MujocoEnv
-from gymnasium.envs.mujoco.half_cheetah_v3 import HalfCheetahEnv
-from gymnasium.envs.mujoco.pusher import PusherEnv
-from gymnasium.envs.mujoco.reacher import ReacherEnv
+from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv
+from gymnasium.envs.mujoco.pusher_v4 import PusherEnv
+from gymnasium.envs.mujoco.reacher_v4 import ReacherEnv
 import copy


 def get_mujoco_wrapper(base_class):
-    """Wraps a mujoco-py environment to be able to modify its low-level Mujoco XML attributes and inject the dimensions of MDP Playground. Please see [`example.py`](example.py) for some simple examples of how to use this class. The values for these dimensions are passed in a config dict as for mdp_playground.envs.RLToyEnv. The description for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py.
+    """Wraps a mujoco environment, by subclassing it, to be able to modify its low-level Mujoco XML attributes and inject the dimensions of MDP Playground. Please see [`example.py`](example.py) for some simple examples of how to use this class. The values for these dimensions are passed in a config dict as for mdp_playground.envs.RLToyEnv. The description for the supported dimensions below can be found in mdp_playground/envs/rl_toy_env.py.

     Currently supported dimensions:
         time_unit
         action_space_max

     For both of these dimensions, the scalar value passed in the dict is used to multiply the base environments' values.

-    For the Mujoco environments, the time_unit is achieved by multiplying the Gym Mujoco environments's frame_skip and thus needs to be such that time_unit * frame_skip is an integer. The time_unit is NOT achieved by changing Mujoco's timestep because that would change the numerical integration done by Mujoco and thus the objective of the environment. The _ctrl_cost_weight and _forward_reward_weight used by the underlying mujoco-py class to calculate rewards in th e environment are proportionally multiplied by the time_unit, so that the rewards are on the same scale across different time_units on average.
+    For the Mujoco environments, the time_unit is achieved by multiplying the Gym Mujoco environment's frame_skip and thus needs to be such that time_unit * frame_skip is an integer. The time_unit is NOT achieved by changing Mujoco's timestep because that would change the numerical integration done by Mujoco and thus the objective of the environment. The _ctrl_cost_weight and _forward_reward_weight used by the underlying MujocoEnv class to calculate rewards in the environment are proportionally multiplied by the time_unit, so that the rewards are on the same scale across different time_units on average.

     Similarly for the action_space_max (which controls the action range), the new action range is achieved by multiplying the Gym Mujoco environment's action_max and action_min by the action_space_max passed in the dict.
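Based on this docstring and the usage in example.py above, the factory returns a subclass of the passed base class, so isinstance checks against the Mujoco env still hold. A sketch (assuming mujoco is installed; config keys per the docstring):

```python
from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv
from mdp_playground.envs import get_mujoco_wrapper

WrappedCls = get_mujoco_wrapper(HalfCheetahEnv)
env = WrappedCls(time_unit=0.5, action_space_max=0.5)

assert isinstance(env, HalfCheetahEnv)  # a subclass, not a wrapper
```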
@@ -102,6 +102,12 @@ def __init__(self, **config): # Gets passed env_config from run_experiments.py
                     self._forward_reward_weight,
                     "corresponding to time_unit in config.",
                 )
+            else:
+                print("The current Mujoco env is not HalfCheetah-v4, so only frame_skip was modified when changing time_unit. "
+                      "Not changing the _ctrl_cost_weight or _forward_reward_weight. It may make sense to also modify "
+                      "these variables depending on their relation with the time_unit. You will need to look deeper into "
+                      "how the reward function is defined to know whether this is needed.")

     def step(self, action):  # hack
         obs, reward, done, trunc, info = super(MujocoEnvWrapper, self).step(action)
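A worked example of the rescaling described in the docstring, using HalfCheetah-v4's documented defaults (assumed values; check Gymnasium's docs if they change):

```python
time_unit = 0.5

# Assumed HalfCheetah-v4 defaults: frame_skip=5, ctrl_cost_weight=0.1,
# forward_reward_weight=1.0.
frame_skip = int(time_unit * 5)  # -> 2
ctrl_cost_weight = 0.1 * time_unit  # -> 0.05
forward_reward_weight = 1.0 * time_unit  # -> 0.5
```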
