Commit 2ad8b9f

Author: chenyingyinglalala
Commit message: add
0 parents  commit 2ad8b9f


60 files changed: +5559 -0 lines changed
@@ -0,0 +1,98 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

# Tabular Q-learning on a 1-D corridor: the agent starts at the left end,
# pays -0.5 per move and earns +1 for reaching the terminal cell 'T'.
ALPHA = 0.1          # learning rate
GAMMA = 0.95         # discount factor
EPSILION = 0.9       # probability of acting greedily
N_STATE = 20         # length of the corridor
ACTIONS = ['left', 'right']
MAX_EPISODES = 200
FRESH_TIME = 0.1     # delay between rendered frames


def build_q_table(n_state, actions):
    q_table = pd.DataFrame(
        np.zeros((n_state, len(actions))),
        index=np.arange(n_state),
        columns=actions
    )
    return q_table


def choose_action(state, q_table):
    # epsilon-greedy policy: explore when the draw exceeds EPSILION
    # or when the state has no learned values yet
    state_action = q_table.loc[state, :]
    if np.random.uniform() > EPSILION or (state_action == 0).all():
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_action.idxmax()
    return action_name


def get_env_feedback(state, action):
    if action == 'right':
        if state == N_STATE - 2:
            next_state = 'terminal'
            reward = 1
        else:
            next_state = state + 1
            reward = -0.5
    else:
        if state == 0:
            next_state = 0          # bump against the left wall
        else:
            next_state = state - 1
        reward = -0.5
    return next_state, reward


def update_env(state, episode, step_counter):
    env = ['-'] * (N_STATE - 1) + ['T']
    if state == 'terminal':
        print("Episode {}, the total step is {}".format(episode + 1, step_counter))
        return True, step_counter
    else:
        env[state] = '*'
        print(''.join(env))
        time.sleep(FRESH_TIME)
        return False, step_counter


def q_learning():
    q_table = build_q_table(N_STATE, ACTIONS)
    step_counter_times = []
    for episode in range(MAX_EPISODES):
        state = 0
        is_terminal = False
        step_counter = 0
        update_env(state, episode, step_counter)
        while not is_terminal:
            action = choose_action(state, q_table)
            next_state, reward = get_env_feedback(state, action)
            q_predict = q_table.loc[state, action]
            if next_state == 'terminal':
                q_target = reward    # no bootstrapping from the terminal state
            else:
                q_target = reward + GAMMA * q_table.loc[next_state, :].max()
            q_table.loc[state, action] += ALPHA * (q_target - q_predict)
            state = next_state
            is_terminal, steps = update_env(state, episode, step_counter + 1)
            step_counter += 1
            if is_terminal:
                step_counter_times.append(steps)

    return q_table, step_counter_times


def main():
    q_table, step_counter_times = q_learning()
    print("Q table\n{}\n".format(q_table))
    print('end')

    plt.plot(step_counter_times, 'g-')
    plt.ylabel("steps")
    plt.show()
    print("The step_counter_times is {}".format(step_counter_times))


main()
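
The file above is a self-contained tabular Q-learning demo on a one-dimensional corridor: the agent starts at the left end of a 20-cell track, pays -0.5 per move, and earns +1 when it reaches the terminal cell on the right. As a minimal sketch of the update rule that the q_learning() loop applies on each step (the helper name and the dict-based table below are illustrative, not part of the commit):

# Sketch of the Q-learning backup used in q_learning() above; names are illustrative.
ALPHA, GAMMA = 0.1, 0.95

def q_update(q, state, action, reward, next_state, terminal):
    # The target bootstraps from the best next-state action (the max),
    # regardless of which action the behaviour policy will actually take next,
    # which is what makes the method off-policy.
    if terminal:
        target = reward
    else:
        target = reward + GAMMA * max(q[next_state].values())
    q[state][action] += ALPHA * (target - q[state][action])

# Tiny usage example on a 3-state corridor with a dict-of-dicts table.
q = {s: {'left': 0.0, 'right': 0.0} for s in range(3)}
q_update(q, state=1, action='right', reward=1.0, next_state='terminal', terminal=True)
print(q[1]['right'])   # 0.1 after one update
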
+104
@@ -0,0 +1,104 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

# Tabular SARSA on the same 1-D corridor, here with a shorter track of 6 cells.
ALPHA = 0.1          # learning rate
GAMMA = 0.95         # discount factor
EPSILION = 0.9       # probability of acting greedily
N_STATE = 6          # length of the corridor
ACTIONS = ['left', 'right']
MAX_EPISODES = 200
FRESH_TIME = 0.1     # delay between rendered frames


def build_q_table(n_state, actions):
    q_table = pd.DataFrame(
        np.zeros((n_state, len(actions))),
        index=np.arange(n_state),
        columns=actions
    )
    return q_table


def choose_action(state, q_table):
    # epsilon-greedy policy: explore when the draw exceeds EPSILION
    # or when the state has no learned values yet
    state_action = q_table.loc[state, :]
    if np.random.uniform() > EPSILION or (state_action == 0).all():
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_action.idxmax()
    return action_name


def get_env_feedback(state, action):
    if action == 'right':
        if state == N_STATE - 2:
            next_state = 'terminal'
            reward = 1
        else:
            next_state = state + 1
            reward = -0.5
    else:
        if state == 0:
            next_state = 0          # bump against the left wall
        else:
            next_state = state - 1
        reward = -0.5
    return next_state, reward


def update_env(state, episode, step_counter):
    env = ['-'] * (N_STATE - 1) + ['T']
    if state == 'terminal':
        print("Episode {}, the total step is {}".format(episode + 1, step_counter))
        return True, step_counter
    else:
        env[state] = '*'
        print(''.join(env))
        time.sleep(FRESH_TIME)
        return False, step_counter


def sarsa_learning():
    q_table = build_q_table(N_STATE, ACTIONS)
    step_counter_times = []
    for episode in range(MAX_EPISODES):
        state = 0
        is_terminal = False
        step_counter = 0
        update_env(state, episode, step_counter)
        while not is_terminal:
            action = choose_action(state, q_table)
            next_state, reward = get_env_feedback(state, action)
            if next_state != 'terminal':
                next_action = choose_action(next_state, q_table)   # SARSA: sample the next action first
            else:
                next_action = action
            q_predict = q_table.loc[state, action]
            if next_state == 'terminal':
                q_target = reward    # no bootstrapping from the terminal state
            else:
                q_target = reward + GAMMA * q_table.loc[next_state, next_action]
            q_table.loc[state, action] += ALPHA * (q_target - q_predict)
            state = next_state
            is_terminal, steps = update_env(state, episode, step_counter + 1)
            step_counter += 1
            if is_terminal:
                step_counter_times.append(steps)

    return q_table, step_counter_times


def main():
    q_table, step_counter_times = sarsa_learning()
    print("Q table\n{}\n".format(q_table))
    print('end')

    plt.plot(step_counter_times, 'g-')
    plt.ylabel("steps")
    plt.show()
    print("The step_counter_times is {}".format(step_counter_times))


main()
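
The second file is the same corridor demo with a 6-cell track and a SARSA learner. The only substantive change is the TD target: SARSA is on-policy, so it bootstraps from the value of the next action the epsilon-greedy policy actually samples instead of the greedy maximum. A minimal sketch of that update, with illustrative names that are not part of the commit:

# Sketch of the SARSA backup used in sarsa_learning() above; names are illustrative.
ALPHA, GAMMA = 0.1, 0.95

def sarsa_update(q, state, action, reward, next_state, next_action, terminal):
    # Unlike the Q-learning sketch earlier, the target uses q[next_state][next_action],
    # i.e. the value of the action the behaviour policy actually chose.
    if terminal:
        target = reward
    else:
        target = reward + GAMMA * q[next_state][next_action]
    q[state][action] += ALPHA * (target - q[state][action])
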
+152
@@ -0,0 +1,152 @@
import numpy as np


class GridWorld:

    def __init__(self, tot_row, tot_col):
        self.action_space_size = 4
        self.world_row = tot_row
        self.world_col = tot_col
        # The world is a matrix of size row x col x 2
        # The first layer contains the obstacles
        # The second layer contains the rewards
        # self.world_matrix = np.zeros((tot_row, tot_col, 2))
        self.transition_matrix = np.ones((self.action_space_size, self.action_space_size)) / self.action_space_size
        # self.transition_array = np.ones(self.action_space_size) / self.action_space_size
        self.reward_matrix = np.zeros((tot_row, tot_col))
        self.state_matrix = np.zeros((tot_row, tot_col))
        self.position = [np.random.randint(tot_row), np.random.randint(tot_col)]

    # def setTransitionArray(self, transition_array):
    #     if(transition_array.shape != self.transition_array):
    #         raise ValueError('The shape of the two matrices must be the same.')
    #     self.transition_array = transition_array

    def setTransitionMatrix(self, transition_matrix):
        '''Set the transition matrix.

        The transition matrix is intended as a matrix which has a row
        for each action; the elements of the row are the probabilities of
        executing each action when that command is given. For example:
        [[0.55, 0.25, 0.10, 0.10]
         [0.25, 0.25, 0.25, 0.25]
         [0.30, 0.20, 0.40, 0.10]
         [0.10, 0.20, 0.10, 0.60]]

        This matrix defines the transition rules for all the 4 possible actions.
        The first row corresponds to the probabilities of executing each one of
        the 4 actions when the policy orders the robot to go UP. In this case
        the transition model says that with a probability of 0.55 the robot will
        go UP, with a probability of 0.25 RIGHT, 0.10 DOWN and 0.10 LEFT.
        '''
        if(transition_matrix.shape != self.transition_matrix.shape):
            raise ValueError('The shape of the two matrices must be the same.')
        self.transition_matrix = transition_matrix

    def setRewardMatrix(self, reward_matrix):
        '''Set the reward matrix.

        '''
        if(reward_matrix.shape != self.reward_matrix.shape):
            raise ValueError('The shape of the matrix does not match with the shape of the world.')
        self.reward_matrix = reward_matrix

    def setStateMatrix(self, state_matrix):
        '''Set the obstacles in the world.

        The input to the function is a matrix with the
        same size of the world:
        -1 for states which are not walkable.
        +1 for terminal states
         0 for all the walkable states (non terminal)
        The following matrix represents the 4x3 world
        used in the series "dissecting reinforcement learning":
        [[0,  0, 0, +1]
         [0, -1, 0, +1]
         [0,  0, 0,  0]]
        '''
        if(state_matrix.shape != self.state_matrix.shape):
            raise ValueError('The shape of the matrix does not match with the shape of the world.')
        self.state_matrix = state_matrix

    def setPosition(self, index_row=None, index_col=None):
        '''Set the position of the robot in a specific state.

        '''
        if(index_row is None or index_col is None):
            self.position = [np.random.randint(self.world_row), np.random.randint(self.world_col)]
        else:
            self.position = [index_row, index_col]

    def render(self):
        '''Print the current world in the terminal.

        O represents the robot position
        - represents empty states.
        # represents obstacles
        * represents terminal states
        '''
        graph = ""
        for row in range(self.world_row):
            row_string = ""
            for col in range(self.world_col):
                if(self.position == [row, col]):
                    row_string += u" \u25CB "  # the robot
                else:
                    if(self.state_matrix[row, col] == 0): row_string += ' - '
                    elif(self.state_matrix[row, col] == -1): row_string += ' # '
                    elif(self.state_matrix[row, col] == +1): row_string += ' * '
            row_string += '\n'
            graph += row_string
        print(graph)

    def reset(self, exploring_starts=False):
        '''Set the position of the robot in the bottom left corner.

        It returns the first observation.
        '''
        if exploring_starts:
            # Exploring starts: pick a random walkable (non-obstacle, non-terminal) cell.
            while(True):
                row = np.random.randint(0, self.world_row)
                col = np.random.randint(0, self.world_col)
                if(self.state_matrix[row, col] == 0): break
            self.position = [row, col]
        else:
            self.position = [self.world_row-1, 0]
        # reward = self.reward_matrix[self.position[0], self.position[1]]
        return self.position

    def step(self, action):
        '''One step in the world.

        [observation, reward, done = env.step(action)]
        The robot moves one step in the world based on the action given.
        The action can be 0=UP, 1=RIGHT, 2=DOWN, 3=LEFT
        @return observation the position of the robot after the step
        @return reward the reward associated with the next state
        @return done True if the state is terminal
        '''
        if(action >= self.action_space_size):
            raise ValueError('The action is not included in the action space.')

        # Based on the commanded action and the probabilities in the
        # transition model, sample the action actually performed.
        action = np.random.choice(self.action_space_size, p=self.transition_matrix[int(action), :])

        # Generate a new position based on the current position and action
        if(action == 0): new_position = [self.position[0]-1, self.position[1]]    #UP
        elif(action == 1): new_position = [self.position[0], self.position[1]+1]  #RIGHT
        elif(action == 2): new_position = [self.position[0]+1, self.position[1]]  #DOWN
        elif(action == 3): new_position = [self.position[0], self.position[1]-1]  #LEFT
        else: raise ValueError('The action is not included in the action space.')

        # Check if the new position is a valid (in-bounds, non-obstacle) position
        if (new_position[0] >= 0 and new_position[0] < self.world_row):
            if(new_position[1] >= 0 and new_position[1] < self.world_col):
                if(self.state_matrix[new_position[0], new_position[1]] != -1):
                    self.position = new_position

        reward = self.reward_matrix[self.position[0], self.position[1]]
        # Done is True if the state is a terminal state
        done = bool(self.state_matrix[self.position[0], self.position[1]])
        return self.position, reward, done
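
A short usage sketch for the GridWorld class above, wiring up the 4x3 world referenced in the setStateMatrix docstring. The class itself is taken as defined above; the specific reward values (-0.04 per step, +1 and -1 at the two terminal cells) and the 0.8/0.1/0.1 noisy transition model are assumptions in the spirit of the "dissecting reinforcement learning" series the comments cite, not values from this commit:

# Usage sketch for the GridWorld class above (assumed to be defined or imported in scope).
# Reward values and the 0.8/0.1/0.1 transition model are assumptions, not from this commit.
import numpy as np

env = GridWorld(3, 4)      # 3 rows x 4 columns: the 4x3 world from the docstring

# Obstacle/terminal layout (see the setStateMatrix docstring).
state_matrix = np.zeros((3, 4))
state_matrix[0, 3] = 1     # terminal state (goal)
state_matrix[1, 3] = 1     # terminal state (pit)
state_matrix[1, 1] = -1    # obstacle
env.setStateMatrix(state_matrix)

# Assumed rewards: -0.04 per step, +1 at the goal, -1 at the pit.
reward_matrix = np.full((3, 4), -0.04)
reward_matrix[0, 3] = 1.0
reward_matrix[1, 3] = -1.0
env.setRewardMatrix(reward_matrix)

# Assumed noisy transition model: intended action with probability 0.8,
# the two lateral actions with probability 0.1 each (rows: UP, RIGHT, DOWN, LEFT).
transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                              [0.1, 0.8, 0.1, 0.0],
                              [0.0, 0.1, 0.8, 0.1],
                              [0.1, 0.0, 0.1, 0.8]])
env.setTransitionMatrix(transition_matrix)

observation = env.reset()    # robot starts in the bottom-left corner
env.render()
observation, reward, done = env.step(np.random.randint(0, 4))   # one random command
print(observation, reward, done)
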