import numpy as np

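# GridWorld: a simple grid-world environment with a stochastic transition
# model, used in the "dissecting reinforcement learning" series.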
class GridWorld:

    def __init__(self, tot_row, tot_col):
        self.action_space_size = 4
        self.world_row = tot_row
        self.world_col = tot_col
        #The world is described by two matrices of size row x col:
        #the state matrix contains the obstacles and terminal states,
        #the reward matrix contains the rewards.
        #self.world_matrix = np.zeros((tot_row, tot_col, 2))
        self.transition_matrix = np.ones((self.action_space_size, self.action_space_size)) / self.action_space_size
        #self.transition_array = np.ones(self.action_space_size) / self.action_space_size
        self.reward_matrix = np.zeros((tot_row, tot_col))
        self.state_matrix = np.zeros((tot_row, tot_col))
        self.position = [np.random.randint(tot_row), np.random.randint(tot_col)]

    #def setTransitionArray(self, transition_array):
        #if(transition_array.shape != self.transition_array):
            #raise ValueError('The shape of the two matrices must be the same.')
        #self.transition_array = transition_array

    def setTransitionMatrix(self, transition_matrix):
        '''Set the transition matrix.

        The transition matrix here is intended as a matrix which has a row
        for each action; the elements of the row are the probabilities of
        executing each action when a command is given. For example:
        [[0.55, 0.25, 0.10, 0.10]
         [0.25, 0.25, 0.25, 0.25]
         [0.30, 0.20, 0.40, 0.10]
         [0.10, 0.20, 0.10, 0.60]]

        This matrix defines the transition rules for all the 4 possible actions.
        The first row corresponds to the probabilities of executing each one of
        the 4 actions when the policy orders the robot to go UP. In this case
        the transition model says that with a probability of 0.55 the robot will
        go UP, with a probability of 0.25 RIGHT, 0.10 DOWN and 0.10 LEFT.
        '''
        if(transition_matrix.shape != self.transition_matrix.shape):
            raise ValueError('The shape of the two matrices must be the same.')
        self.transition_matrix = transition_matrix

    def setRewardMatrix(self, reward_matrix):
        '''Set the reward matrix.

        The reward matrix must have the same shape as the world (row x col);
        each element is the reward associated with the corresponding state.
        '''
        if(reward_matrix.shape != self.reward_matrix.shape):
            raise ValueError('The shape of the matrix does not match with the shape of the world.')
        self.reward_matrix = reward_matrix

    def setStateMatrix(self, state_matrix):
        '''Set the obstacles in the world.

        The input to the function is a matrix with the
        same size as the world:
        -1 for states which are not walkable.
        +1 for terminal states
         0 for all the walkable states (non terminal)
        The following matrix represents the 4x3 world
        used in the series "dissecting reinforcement learning":
        [[0,  0, 0, +1]
         [0, -1, 0, +1]
         [0,  0, 0,  0]]
        '''
        if(state_matrix.shape != self.state_matrix.shape):
            raise ValueError('The shape of the matrix does not match with the shape of the world.')
        self.state_matrix = state_matrix

    def setPosition(self, index_row=None, index_col=None):
        ''' Set the position of the robot in a specific state.

        If no indices are given the position is chosen randomly.
        '''
        if(index_row is None or index_col is None):
            self.position = [np.random.randint(self.world_row), np.random.randint(self.world_col)]
        else:
            self.position = [index_row, index_col]

    def render(self):
        ''' Print the current world in the terminal.

        O represents the robot position
        - represents empty states.
        # represents obstacles
        * represents terminal states
        '''
        graph = ""
        for row in range(self.world_row):
            row_string = ""
            for col in range(self.world_col):
                if(self.position == [row, col]): row_string += u" \u25CB " # u" \u25CC "
                else:
                    if(self.state_matrix[row, col] == 0): row_string += ' - '
                    elif(self.state_matrix[row, col] == -1): row_string += ' # '
                    elif(self.state_matrix[row, col] == +1): row_string += ' * '
            row_string += '\n'
            graph += row_string
        print(graph)

    def reset(self, exploring_starts=False):
        ''' Set the position of the robot in the bottom left corner.

        If exploring_starts is True the robot is instead placed in a
        random walkable (non-obstacle, non-terminal) state.
        It returns the first observation.
        '''
        if exploring_starts:
            while(True):
                row = np.random.randint(0, self.world_row)
                col = np.random.randint(0, self.world_col)
                if(self.state_matrix[row, col] == 0): break
            self.position = [row, col]
        else:
            self.position = [self.world_row-1, 0]
        #reward = self.reward_matrix[self.position[0], self.position[1]]
        return self.position

    def step(self, action):
        ''' One step in the world.

        [observation, reward, done = env.step(action)]
        The robot moves one step in the world based on the action given.
        The action can be 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT
        @return observation the position of the robot after the step
        @return reward the reward associated with the next state
        @return done True if the state is terminal
        '''
        if(action >= self.action_space_size):
            raise ValueError('The action is not included in the action space.')

        #Based on the current action and the probability derived
        #from the transition model it chooses a new action to perform
        action = int(np.random.choice(self.action_space_size, p=self.transition_matrix[int(action), :]))
        #action = self.transition_model(action)

        #Generating a new position based on the current position and action
        if(action == 0): new_position = [self.position[0]-1, self.position[1]]   #UP
        elif(action == 1): new_position = [self.position[0], self.position[1]+1] #RIGHT
        elif(action == 2): new_position = [self.position[0]+1, self.position[1]] #DOWN
        elif(action == 3): new_position = [self.position[0], self.position[1]-1] #LEFT
        else: raise ValueError('The action is not included in the action space.')

        #Check if the new position is a valid position
        #print(self.state_matrix)
        if (new_position[0]>=0 and new_position[0]<self.world_row):
            if(new_position[1]>=0 and new_position[1]<self.world_col):
                if(self.state_matrix[new_position[0], new_position[1]] != -1):
                    self.position = new_position

        reward = self.reward_matrix[self.position[0], self.position[1]]
        #Done is True if the state is a terminal state
        done = bool(self.state_matrix[self.position[0], self.position[1]])
        return self.position, reward, done

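# ---------------------------------------------------------------------------
# Minimal usage sketch. It builds the 4x3 world described in the
# setStateMatrix docstring; the reward values (-0.04 per step, +1/-1 at the
# terminal states) and the noisy transition model below are illustrative
# assumptions and can be changed freely.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    env = GridWorld(3, 4)

    #Obstacle at (1,1), terminal states in the last column of the first two rows
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = +1
    state_matrix[1, 3] = +1
    state_matrix[1, 1] = -1

    #Reward: -0.04 for every non-terminal state, +1 and -1 at the terminals
    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = +1.0
    reward_matrix[1, 3] = -1.0

    #Noisy transition model: the commanded action is executed with
    #probability 0.8, the two perpendicular actions with probability 0.1 each
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    #Run one episode with a random policy and print the world at each step
    observation = env.reset()
    env.render()
    for _ in range(20):
        action = np.random.randint(0, env.action_space_size)
        observation, reward, done = env.step(action)
        print("Action:", action, "Position:", observation, "Reward:", reward)
        env.render()
        if done: break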