
Commit c2da672

Merge branch 'master' into patch-1
2 parents 44ef189 + d5847e8

6 files changed: +26 -19 lines changed
README.md

Lines changed: 3 additions & 3 deletions

@@ -7,7 +7,7 @@

 ![RL](utilities/RL_image.jpeg) ![PyTorch](utilities/PyTorch-logo-2.jpg)

-This repository contains PyTorch implementations of deep reinforcement learning algorithms and environments.
+This repository contains PyTorch implementations of deep reinforcement learning algorithms and environments. (To help you remember things you learn about RL in general write them in [Save All](https://www.saveall.ai/))

 ## **Algorithms Implemented**

@@ -110,7 +110,7 @@ conda activate myenvname

 pip3 install -r requirements.txt

-python Results/Cart_Pole.py
+python results/Cart_Pole.py
 ```

 For other games change the last line to one of the other files in the Results folder.
@@ -120,4 +120,4 @@ For other games change the last line to one of the other files in the Results folder.
 Most Open AI gym environments should work. All you would need to do is change the config.environment field (look at `Results/Cart_Pole.py` for an example of this).

 You can also play with your own custom game if you create a separate class that inherits from gym.Env. See `Environments/Four_Rooms_Environment.py`
-for an example of a custom environment and then see the script `Results/Four_Rooms.py` to see how to have agents play the environment.
+for an example of a custom environment and then see the script `Results/Four_Rooms.py` to see how to have agents play the environment.
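To make the custom-environment instructions above concrete, here is a minimal sketch of a class inheriting from gym.Env, assuming the classic gym API (reset() returns an observation, step() returns a 4-tuple). The class name, spaces and reward values are invented for illustration and are not taken from Environments/Four_Rooms_Environment.py.

import gym
import numpy as np
from gym import spaces

class ToyMoveRightEnvironment(gym.Env):
    """Hypothetical one-dimensional toy environment: take action 1 to reach the goal."""
    def __init__(self):
        self.action_space = spaces.Discrete(2)                                  # 0 = stay, 1 = move to the goal
        self.observation_space = spaces.Box(0.0, 1.0, shape=(1,), dtype=np.float32)
        self.state = 0.0

    def reset(self):
        self.state = 0.0
        return np.array([self.state], dtype=np.float32)

    def step(self, action):
        if action == 1: self.state = 1.0
        reward = 1.0 if self.state == 1.0 else -0.1                             # small penalty for staying put
        done = self.state == 1.0
        return np.array([self.state], dtype=np.float32), reward, done, {}

A script modelled on Results/Four_Rooms.py could then point config.environment at an instance of this class.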

agents/DQN_agents/DQN.py

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@ class DQN(Base_Agent):
     agent_name = "DQN"
     def __init__(self, config):
         Base_Agent.__init__(self, config)
-        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed)
+        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed, self.device)
         self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
         self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                               lr=self.hyperparameters["learning_rate"], eps=1e-4)
@@ -112,4 +112,4 @@ def sample_experiences(self):
         """Draws a random sample of experience from the memory buffer"""
         experiences = self.memory.sample()
         states, actions, rewards, next_states, dones = experiences
-        return states, actions, rewards, next_states, dones
+        return states, actions, rewards, next_states, dones
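What forwarding self.device buys, as a sketch with made-up shapes assuming a CartPole-style 4-dimensional state: the batch the buffer samples already sits on the same device as the local Q-network, so no extra transfer is needed before the forward pass.

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
q_network_local = torch.nn.Linear(4, 2).to(device)     # stand-in for the agent's Q-network
states = torch.rand(256, 4, device=device)             # what memory.sample() can now return directly
q_values = q_network_local(states)                     # no .to(device) call needed here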

agents/actor_critic_agents/SAC.py

Lines changed: 10 additions & 5 deletions

@@ -144,10 +144,12 @@ def learn(self):
         """Runs a learning iteration for the actor, both critics and (if specified) the temperature parameter"""
         state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.sample_experiences()
         qf1_loss, qf2_loss = self.calculate_critic_losses(state_batch, action_batch, reward_batch, next_state_batch, mask_batch)
+        self.update_critic_parameters(qf1_loss, qf2_loss)
+
         policy_loss, log_pi = self.calculate_actor_loss(state_batch)
         if self.automatic_entropy_tuning: alpha_loss = self.calculate_entropy_tuning_loss(log_pi)
         else: alpha_loss = None
-        self.update_all_parameters(qf1_loss, qf2_loss, policy_loss, alpha_loss)
+        self.update_actor_parameters(policy_loss, alpha_loss)

     def sample_experiences(self):
         return self.memory.sample()
@@ -182,18 +184,21 @@ def calculate_entropy_tuning_loss(self, log_pi):
         alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
         return alpha_loss

-    def update_all_parameters(self, critic_loss_1, critic_loss_2, actor_loss, alpha_loss):
-        """Updates the parameters for the actor, both critics and (if specified) the temperature parameter"""
+    def update_critic_parameters(self, critic_loss_1, critic_loss_2):
+        """Updates the parameters for both critics"""
         self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1,
                                     self.hyperparameters["Critic"]["gradient_clipping_norm"])
         self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
                                     self.hyperparameters["Critic"]["gradient_clipping_norm"])
-        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
-                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
         self.soft_update_of_target_network(self.critic_local, self.critic_target,
                                            self.hyperparameters["Critic"]["tau"])
         self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2,
                                            self.hyperparameters["Critic"]["tau"])
+
+    def update_actor_parameters(self, actor_loss, alpha_loss):
+        """Updates the parameters for the actor and (if specified) the temperature parameter"""
+        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
+                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
         if alpha_loss is not None:
             self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None)
             self.alpha = self.log_alpha.exp()
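For context on the entropy-tuning branch that update_actor_parameters keeps, here is a self-contained sketch of that temperature update with made-up batch values; the target entropy of -2.0 is an assumed heuristic, not a value taken from this repository.

import torch

log_alpha = torch.zeros(1, requires_grad=True)      # alpha = exp(log_alpha) starts at 1.0
alpha_optim = torch.optim.Adam([log_alpha], lr=3e-4)
target_entropy = -2.0                               # assumed target, e.g. the negative action dimensionality
log_pi = torch.tensor([[-1.2], [-0.7], [-2.3]])     # fake log-probabilities of sampled actions

alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp()                             # mirrors self.alpha = self.log_alpha.exp() above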

agents/actor_critic_agents/SAC_Discrete.py

Lines changed: 4 additions & 5 deletions

@@ -30,7 +30,7 @@ def __init__(self, config):
         Base_Agent.copy_model_over(self.critic_local, self.critic_target)
         Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
         self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
-                                    self.config.seed)
+                                    self.config.seed, device=self.device)

         self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
         self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
@@ -52,7 +52,7 @@ def produce_action_and_action_info(self, state):
         """Given the state, produces an action, the probability of the action, the log probability of the action, and
         the argmax action"""
         action_probabilities = self.actor_local(state)
-        max_probability_action = torch.argmax(action_probabilities, dim=-1).unsqueeze(0)
+        max_probability_action = torch.argmax(action_probabilities, dim=-1)
         action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size)
         action = action_distribution.sample().cpu()
         # Have to deal with situation of 0.0 probabilities because we can't do log 0
@@ -69,7 +69,7 @@ def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch):
         qf1_next_target = self.critic_target(next_state_batch)
         qf2_next_target = self.critic_target_2(next_state_batch)
         min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities)
-        min_qf_next_target = min_qf_next_target.mean(dim=1).unsqueeze(-1)
+        min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)
         next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)

         qf1 = self.critic_local(state_batch).gather(1, action_batch.long())
@@ -85,7 +85,6 @@ def calculate_actor_loss(self, state_batch):
         qf2_pi = self.critic_local_2(state_batch)
         min_qf_pi = torch.min(qf1_pi, qf2_pi)
         inside_term = self.alpha * log_action_probabilities - min_qf_pi
-        policy_loss = action_probabilities * inside_term
-        policy_loss = policy_loss.mean()
+        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
         log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1)
         return policy_loss, log_action_probabilities
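The switch from .mean(dim=1) to .sum(dim=1) matters because, with explicit action probabilities, the expectation over actions is the sum over a of pi(a|s) * f(s, a); taking the mean divides that expectation by the number of actions. A toy check with made-up numbers:

import torch

probs = torch.tensor([[0.7, 0.2, 0.1]])       # pi(.|s) for a single 3-action state
q_min = torch.tensor([[1.0, 0.5, -0.2]])      # min of the two target critics (made up)
log_probs = probs.log()
alpha = 0.2

expectation = (probs * (q_min - alpha * log_probs)).sum(dim=1)   # per-state soft value, as in the new code
shrunk = (probs * (q_min - alpha * log_probs)).mean(dim=1)       # old code: the same expectation divided by 3
print(expectation, shrunk)

The actor-loss change is the same fix on the policy side: sum over the action dimension to form the per-state expectation, then average over the batch.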

utilities/Parallel_Experience_Generator.py

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ class Parallel_Experience_Generator(object):
     def __init__(self, environment, policy, seed, hyperparameters, action_size, use_GPU=False, action_choice_output_columns=None):
         self.use_GPU = use_GPU
         self.environment = environment
-        self.action_types = "DISCRETE" if self.environment.action_space.dtype == int else "CONTINUOUS"
+        self.action_types = "DISCRETE" if self.environment.action_space.dtype in [int, 'int64'] else "CONTINUOUS"
         self.action_size = action_size
         self.policy = policy
         self.action_choice_output_columns = action_choice_output_columns
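Background on the dtype test, as a sketch assuming a classic gym install: Discrete action spaces typically report a numpy int64 dtype, and comparing that dtype with Python's int can depend on the platform's default integer width, so the patched check also accepts the 'int64' string, which numpy matches against the dtype's name.

import numpy as np
from gym import spaces

action_space = spaces.Discrete(2)                     # e.g. CartPole's action space
print(action_space.dtype)                             # usually int64
print(action_space.dtype == int)                      # may be False depending on numpy's default int
print(action_space.dtype in [int, 'int64'])           # the patched check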

utilities/data_structures/Replay_Buffer.py

Lines changed: 6 additions & 3 deletions

@@ -6,13 +6,16 @@
 class Replay_Buffer(object):
     """Replay buffer to store past experiences that the agent can then use for training data"""

-    def __init__(self, buffer_size, batch_size, seed):
+    def __init__(self, buffer_size, batch_size, seed, device=None):

         self.memory = deque(maxlen=buffer_size)
         self.batch_size = batch_size
         self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
         self.seed = random.seed(seed)
-        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        if device:
+            self.device = torch.device(device)
+        else:
+            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

     def add_experience(self, states, actions, rewards, next_states, dones):
         """Adds experience(s) into the replay buffer"""
@@ -51,4 +54,4 @@ def pick_experiences(self, num_experiences=None):
         return random.sample(self.memory, k=batch_size)

     def __len__(self):
-        return len(self.memory)
+        return len(self.memory)
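A usage sketch of the new optional argument (the import path is assumed from the file's location; the hyperparameter values are made up):

from utilities.data_structures.Replay_Buffer import Replay_Buffer

auto_buffer = Replay_Buffer(buffer_size=100000, batch_size=256, seed=1)                # picks cuda:0 or cpu, as before
cpu_buffer = Replay_Buffer(buffer_size=100000, batch_size=256, seed=1, device="cpu")   # sampled tensors stay on the CPU

Passing device=self.device from the agents, as DQN and SAC_Discrete now do, keeps sampled tensors on the same device as the networks they feed.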
