
Commit febad86

Fixed many of the failing tests; fixed terminal state testing for grid envs
1 parent 4ee76c1 commit febad86

File tree

3 files changed, +279 -186 lines

example.py

Lines changed: 1 addition & 1 deletion
@@ -570,7 +570,7 @@ def procgen_wrapper_example():
     mujoco_wrapper_example()

     print(set_ansi_escape + "\nRunning MiniGrid wrapper example:\n" + reset_ansi_escape)
-    minigrid_wrapper_example()
+    # minigrid_wrapper_example()

     # print(set_ansi_escape + "\nRunning ProcGen wrapper example:\n" + reset_ansi_escape)
     # procgen_wrapper_example()

mdp_playground/envs/rl_toy_env.py

Lines changed: 36 additions & 15 deletions
@@ -53,7 +53,7 @@ class RLToyEnv(gym.Env):
     diameter : int > 0
         For discrete environments, if diameter = d, the set of states is set to be a d-partite graph (and NOT a complete d-partite graph), where, if we order the d sets as 1, 2, .., d, states from set 1 will have actions leading to states in set 2 and so on, with the final set d having actions leading to states in set 1. Number of actions for each state will, thus, be = (number of states) / (d). Default value: 1 for discrete environments. For continuous environments, this dimension is set automatically based on the state_space_max value.
     terminal_state_density : float in range [0, 1]
-        For discrete environments, the fraction of states that are terminal; the terminal states are fixed to the "last" states when we consider them to be ordered by their numerical value. This is w.l.o.g. because discrete states are categorical. For continuous environments, please see terminal_states and term_state_edge for how to control terminal states. Default value: 0.25.
+        For discrete environments, the fraction of states that are terminal; the terminal states are fixed to the "last" states when we consider them to be ordered by their numerical value. This is w.l.o.g. because discrete states are categorical. For continuous environments, please see terminal_states and term_state_edge for how to control terminal states. For grid environments, please see terminal_states only. Default value: 0.25.
     term_state_reward : float
         Adds this to the reward if a terminal state was reached at the current time step. Default value: 0.
     image_representations : boolean
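
The keys touched in this docstring hunk are plain entries in the env's config dict. A minimal, hypothetical fragment for a discrete env is sketched below; only terminal_state_density and term_state_reward are taken from the docstring above, the surrounding keys and values are illustrative, not a verified recipe.

# Hypothetical config fragment (a sketch, not verified against the library):
config_fragment = {
    "state_space_type": "discrete",
    "terminal_state_density": 0.25,  # last 25% of states (ordered by index) become terminal
    "term_state_reward": 0.0,        # added to the reward when a terminal state is reached
    # continuous envs use "terminal_states" and "term_state_edge" instead;
    # grid envs use "terminal_states" only, per the docstring change above
}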
@@ -134,7 +134,7 @@ class RLToyEnv(gym.Env):
     target_point : numpy.ndarray
         The target point in case move_to_a_point is the reward_function. If make_denser is false, reward is only handed out when the target point is reached.
     terminal_states : Python function(state) or 1-D numpy.ndarray
-        Same description as for terminal_states under discrete envs
+        Same description as for terminal_states under discrete envs, except that the state is a grid state, e.g., a list of [x, y] coordinates for a 2-D grid.

     Other important config:
     Specific to discrete environments:
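
Since the docstring allows terminal_states to be a Python function of the (grid) state, a minimal sketch of such a predicate for a 2-D grid might look like the following; the coordinates are made up for illustration.

# Sketch of a terminal-state predicate for a 2-D grid env (hypothetical coordinates):
def terminal_states(state):
    # state is a grid state, e.g. [x, y] for a 2-D grid
    return list(state) in ([2, 3], [5, 0])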
@@ -253,6 +253,8 @@ def __init__(self, **config):
         # sh = logging.StreamHandler()
         # sh.setFormatter(fmt=fmtr)
         self.logger = logging.getLogger(__name__)
+        # print("Logging stuff:", self.logger, self.logger.handlers, __name__)
+        # Example output of above: <Logger mdp_playground.envs.rl_toy_env (INFO)> [] mdp_playground.envs.rl_toy_env
         # self.logger.addHandler(sh)

         if "log_filename" in config:
@@ -516,6 +518,7 @@ def __init__(self, **config):
         elif config["state_space_type"] == "grid":
             assert "grid_shape" in config
             self.grid_shape = config["grid_shape"]
+            self.grid_np_data_type = np.int64
         else:
             raise ValueError("Unknown state_space_type")

@@ -678,7 +681,7 @@ def __init__(self, **config):
                 ) # #seed
             else:
                 self.action_space = self.action_spaces[0]
-        else:
+        else: # not image_representations for discrete env
             if self.irrelevant_features:
                 self.observation_space = TupleExtended(
                     self.observation_spaces, seed=self.seed_dict["state_space"]
@@ -919,7 +922,7 @@ def init_terminal_states(self):
                 highs = term_state # #hardcoded
                 self.term_spaces.append(
                     BoxExtended(
-                        low=lows, high=highs, seed=self.seed_, dtype=np.int64
+                        low=lows, high=highs, seed=self.seed_, dtype=self.grid_np_data_type
                     )
                 ) # #seed #hack #TODO

@@ -1098,7 +1101,7 @@ def init_transition_function(self):
        # meaningful even if someone doesn't check for
        # 'done' being = True

-       # #irrelevant dimensions part
+       # #irrelevant dimensions part for discrete env
        if self.irrelevant_features: # #test
            self.config["transition_function_irrelevant"] = np.zeros(
                shape=(self.state_space_size[1], self.action_space_size[1]),
@@ -1617,10 +1620,13 @@ def transition_function(self, state, action):
             )
             # if "transition_noise" in self.config:
             noise_in_transition = (
-                self.transition_noise(self.np_random) if self.transition_noise else 0
+                self.transition_noise(self.np_random) if self.transition_noise else
+                np.zeros(self.state_space_dim)
             ) # #random
             self.total_abs_noise_in_transition_episode += np.abs(noise_in_transition)
             next_state += noise_in_transition # ##IMP Noise is only applied to
+            # Store the noise in transition for easier testing
+            self.noise_in_transition = noise_in_transition
             # state and not to higher order derivatives
             # TODO Check if next_state is within state space bounds
             if not self.observation_space.contains(next_state):
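
The no-noise branch now yields a zero vector of the state dimension instead of a scalar 0, so the stored self.noise_in_transition always has the state's shape and tests can subtract it out elementwise. A standalone sketch of that pattern follows; the names and numbers are illustrative, not the library's API.

import numpy as np

state_space_dim = 4
rng = np.random.default_rng(0)

def sample_transition_noise(enabled):
    # Zero *vector* in the disabled branch, mirroring the hunk above, so the
    # result always has shape (state_space_dim,).
    return rng.normal(scale=0.1, size=state_space_dim) if enabled else np.zeros(state_space_dim)

noise = sample_transition_noise(enabled=False)
next_state = np.ones(state_space_dim) + noise
noiseless_next_state = next_state - noise  # a test can recover the noiseless transition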
@@ -1660,7 +1666,7 @@ def transition_function(self, state, action):
             # Need to check that dtype is int because Gym doesn't
             if (
                 self.action_space.contains(action)
-                and np.array(action).dtype == np.int64
+                and np.array(action).dtype == self.grid_np_data_type
             ):
                 if self.transition_noise:
                     # self.np_random.choice only works for 1-D arrays
@@ -1675,6 +1681,7 @@ def transition_function(self, state, action):
                     )
                     # print(str(action) + str(new_action))
                     self.total_noisy_transitions_episode += 1
+                    # print("action, new_action", action, new_action)
                     action = new_action
                     break

@@ -1698,7 +1705,11 @@ def transition_function(self, state, action):
                 )

             if self.config["reward_function"] == "move_to_a_point":
-                if self.target_point == next_state:
+                if "irrelevant_features" in self.config and self.config["irrelevant_features"]:
+                    next_state_rel = next_state[:len(self.grid_shape) // 2]
+                else:
+                    next_state_rel = next_state
+                if self.target_point == next_state_rel:
                     self.reached_terminal = True

             next_state = np.array(next_state)
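
With irrelevant_features enabled, the grid state concatenates relevant and irrelevant coordinates, and only the first half of the dimensions is compared against target_point. A small standalone illustration of that slicing follows; the values are made up, and the comparison is written with np.all here for clarity.

import numpy as np

grid_shape = (8, 8, 8, 8)            # hypothetically: 2 relevant + 2 irrelevant dims
next_state = [3, 4, 7, 1]
target_point = np.array([3, 4])

next_state_rel = next_state[: len(grid_shape) // 2]                # -> [3, 4]
reached = bool(np.all(target_point == np.array(next_state_rel)))   # -> True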
@@ -1769,7 +1780,6 @@ def reward_function(self, state, action):
                 sub_seq = tuple(
                     state_considered[1 + delay : self.augmented_state_length]
                 )
-                # print(state_considered, "with delay", self.delay, "rewarded with:", 1)
                 if sub_seq in self.rewardable_sequences:
                     reward = self.rewardable_sequences[sub_seq]
                     # print(state_considered, "with delay", self.delay, "rewarded with:", reward)
@@ -1803,7 +1813,13 @@ def reward_function(self, state, action):
         else:
             if self.config["reward_function"] == "move_along_a_line":
                 # print("######reward test", self.total_transitions_episode, np.array(self.augmented_state), np.array(self.augmented_state).shape)
-                # #test: 1. for checking 0 distance for same action being always applied; 2. similar to 1. but for different dynamics orders; 3. similar to 1 but for different action_space_dims; 4. for a known applied action case, check manually the results of the formulae and see that programmatic results match: should also have a unit version of 4. for dist_of_pt_from_line() and an integration version here for total_deviation calc.?.
+                # #test: 1. for checking 0 distance for same action being always applied;
+                # 2. similar to 1. but for different dynamics orders;
+                # 3. similar to 1 but for different action_space_dims;
+                # 4. for a known applied action case, check manually the results
+                # of the formulae and see that programmatic results match: should
+                # also have a unit version of 4. for dist_of_pt_from_line() and
+                # an integration version here for total_deviation calc.?.
                 data_ = np.array(state_considered, dtype=self.dtype)[
                     1 + delay : self.augmented_state_length,
                     self.config["relevant_indices"],
@@ -1818,9 +1834,9 @@ def reward_function(self, state, action):
                 )
                 line_end_pts = (
                     vv[0] * np.linspace(-1, 1, 2)[:, np.newaxis]
-                ) # vv[0] = 1st
-                # eigenvector, corres. to Principal Component #hardcoded -100
-                # to 100 to get a "long" line which should make calculations more
+                )
+                # vv[0] = 1st eigenvector, corres. to Principal Component #hardcoded -100
+                # to 100 initially to get a "long" line which should make calculations more
                 # robust(?: didn't seem to be the case for 1st few trials, so changed it
                 # to -1, 1; even tried up to 10000 - seems to get less precise for larger
                 # numbers) to numerical issues in dist_of_pt_from_line() below; newaxis
@@ -1911,6 +1927,7 @@ def reward_function(self, state, action):
         # #random ###TODO Would be better to parameterise this in terms of state, action and time_step as well. Would need to change implementation to have a queue for the rewards achieved and then pick the reward that was generated delay timesteps ago.
         self.total_abs_noise_in_reward_episode += np.abs(noise_in_reward)
         self.total_reward_episode += reward
+        self.logger.info("Reward: " + str(reward) + " Noise in reward: " + str(noise_in_reward))
         reward += noise_in_reward
         reward *= self.reward_scale
         reward += self.reward_shift
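
For reference, the unchanged lines around the new log call compose the final reward as (raw reward + noise) scaled and then shifted. A tiny numeric sketch of that ordering, with illustrative values only:

# Mirrors the order of operations in the context lines above (illustrative numbers).
reward, noise_in_reward, reward_scale, reward_shift = 1.0, 0.05, 2.0, -0.5
reward += noise_in_reward
reward *= reward_scale
reward += reward_shift    # (1.0 + 0.05) * 2.0 - 0.5 == 1.6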
@@ -2266,7 +2283,8 @@ def seed(self, seed=None):


 def dist_of_pt_from_line(pt, ptA, ptB):
-    """Returns shortest distance of a point from a line defined by 2 points - ptA and ptB. Based on: https://softwareengineering.stackexchange.com/questions/168572/distance-from-point-to-n-dimensional-line"""
+    """Returns shortest distance of a point from a line defined by 2 points - ptA and ptB.
+    Based on: https://softwareengineering.stackexchange.com/questions/168572/distance-from-point-to-n-dimensional-line"""

     tolerance = 1e-13
     lineAB = ptA - ptB
@@ -2278,10 +2296,13 @@ def dist_of_pt_from_line(pt, ptA, ptB):
         proj = dot_product / np.linalg.norm(
             lineAB
         ) # #### TODO could lead to division by zero if line is a null vector!
+        # Assuming the above happens when action was nearly 0, we return 0 in the
+        # if block above, which is the max reward when one stays in a line in the
+        # move_along_a_line case.
         sq_dist = np.linalg.norm(lineApt) ** 2 - proj ** 2

         if sq_dist < 0:
-            if sq_dist < tolerance:
+            if sq_dist < -tolerance:
                 logging.warning(
                     "The squared distance calculated in dist_of_pt_from_line()"
                     " using Pythagoras' theorem was less than the tolerance allowed."
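
Both hunks in dist_of_pt_from_line() concern numerical edge cases: a null direction vector and round-off pushing the squared distance slightly below zero. Below is a self-contained sketch of the computation reconstructed from the visible fragments; the early return of 0 for a degenerate line is an assumption based on the added comment, not verbatim library code.

import numpy as np

def dist_of_pt_from_line(pt, ptA, ptB, tolerance=1e-13):
    # Reconstructed sketch, not a copy of the library function.
    lineAB = ptA - ptB
    lineApt = pt - ptA
    norm_AB = np.linalg.norm(lineAB)
    if norm_AB < tolerance:
        # Degenerate line (ptA ~= ptB); per the added comment above, treated as
        # the zero-distance case (an assumption here).
        return 0.0
    proj = np.dot(lineApt, lineAB) / norm_AB              # signed length of the projection onto the line
    sq_dist = np.linalg.norm(lineApt) ** 2 - proj ** 2    # Pythagoras
    if sq_dist < 0:
        # Round-off can make sq_dist marginally negative; the hunk above warns
        # only when it is below -tolerance, i.e. a genuine numerical problem.
        sq_dist = 0.0
    return np.sqrt(sq_dist)

# Distance of (1, 1) from the x-axis through (0, 0) and (2, 0) is 1.0:
print(dist_of_pt_from_line(np.array([1., 1.]), np.array([0., 0.]), np.array([2., 0.])))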
