
Commit c2da672

Merge branch 'master' into patch-1
2 parents 44ef189 + d5847e8

6 files changed: +26 -19 lines changed
README.md

Lines changed: 3 additions & 3 deletions

@@ -7,7 +7,7 @@

 ![RL](utilities/RL_image.jpeg) ![PyTorch](utilities/PyTorch-logo-2.jpg)

-This repository contains PyTorch implementations of deep reinforcement learning algorithms and environments.
+This repository contains PyTorch implementations of deep reinforcement learning algorithms and environments. (To help you remember things you learn about RL in general write them in [Save All](https://www.saveall.ai/))

 ## **Algorithms Implemented**

@@ -110,7 +110,7 @@ conda activate myenvname

 pip3 install -r requirements.txt

-python Results/Cart_Pole.py
+python results/Cart_Pole.py
 ```

 For other games change the last line to one of the other files in the Results folder.
@@ -120,4 +120,4 @@ For other games change the last line to one of the other files in the Results folder.
 Most Open AI gym environments should work. All you would need to do is change the config.environment field (look at `Results/Cart_Pole.py` for an example of this).

 You can also play with your own custom game if you create a separate class that inherits from gym.Env. See `Environments/Four_Rooms_Environment.py`
-for an example of a custom environment and then see the script `Results/Four_Rooms.py` to see how to have agents play the environment.
+for an example of a custom environment and then see the script `Results/Four_Rooms.py` to see how to have agents play the environment.
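To make the custom-environment instructions above concrete, here is a minimal sketch of a class inheriting from gym.Env, assuming the classic gym API (reset() returns an observation, step() returns a 4-tuple). The class name, spaces and reward values are invented for illustration and are not taken from Environments/Four_Rooms_Environment.py.

import gym
import numpy as np
from gym import spaces

class ToyMoveRightEnvironment(gym.Env):
    """Hypothetical one-dimensional toy environment: take action 1 to reach the goal."""
    def __init__(self):
        self.action_space = spaces.Discrete(2)                                  # 0 = stay, 1 = move to the goal
        self.observation_space = spaces.Box(0.0, 1.0, shape=(1,), dtype=np.float32)
        self.state = 0.0

    def reset(self):
        self.state = 0.0
        return np.array([self.state], dtype=np.float32)

    def step(self, action):
        if action == 1: self.state = 1.0
        reward = 1.0 if self.state == 1.0 else -0.1                             # small penalty for staying put
        done = self.state == 1.0
        return np.array([self.state], dtype=np.float32), reward, done, {}

A script modelled on Results/Four_Rooms.py could then point config.environment at an instance of this class.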

agents/DQN_agents/DQN.py

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@ class DQN(Base_Agent):
     agent_name = "DQN"
     def __init__(self, config):
         Base_Agent.__init__(self, config)
-        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed)
+        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed, self.device)
         self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
         self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
                                               lr=self.hyperparameters["learning_rate"], eps=1e-4)
@@ -112,4 +112,4 @@ def sample_experiences(self):
         """Draws a random sample of experience from the memory buffer"""
         experiences = self.memory.sample()
         states, actions, rewards, next_states, dones = experiences
-        return states, actions, rewards, next_states, dones
+        return states, actions, rewards, next_states, dones
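What forwarding self.device buys, as a sketch with made-up shapes assuming a CartPole-style 4-dimensional state: the batch the buffer samples already sits on the same device as the local Q-network, so no extra transfer is needed before the forward pass.

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
q_network_local = torch.nn.Linear(4, 2).to(device)     # stand-in for the agent's Q-network
states = torch.rand(256, 4, device=device)             # what memory.sample() can now return directly
q_values = q_network_local(states)                     # no .to(device) call needed here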

agents/actor_critic_agents/SAC.py

Lines changed: 10 additions & 5 deletions

@@ -144,10 +144,12 @@ def learn(self):
         """Runs a learning iteration for the actor, both critics and (if specified) the temperature parameter"""
         state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.sample_experiences()
         qf1_loss, qf2_loss = self.calculate_critic_losses(state_batch, action_batch, reward_batch, next_state_batch, mask_batch)
+        self.update_critic_parameters(qf1_loss, qf2_loss)
+
         policy_loss, log_pi = self.calculate_actor_loss(state_batch)
         if self.automatic_entropy_tuning: alpha_loss = self.calculate_entropy_tuning_loss(log_pi)
         else: alpha_loss = None
-        self.update_all_parameters(qf1_loss, qf2_loss, policy_loss, alpha_loss)
+        self.update_actor_parameters(policy_loss, alpha_loss)

     def sample_experiences(self):
         return self.memory.sample()
@@ -182,18 +184,21 @@ def calculate_entropy_tuning_loss(self, log_pi):
         alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
         return alpha_loss

-    def update_all_parameters(self, critic_loss_1, critic_loss_2, actor_loss, alpha_loss):
-        """Updates the parameters for the actor, both critics and (if specified) the temperature parameter"""
+    def update_critic_parameters(self, critic_loss_1, critic_loss_2):
+        """Updates the parameters for both critics"""
         self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1,
                                     self.hyperparameters["Critic"]["gradient_clipping_norm"])
         self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
                                     self.hyperparameters["Critic"]["gradient_clipping_norm"])
-        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
-                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
         self.soft_update_of_target_network(self.critic_local, self.critic_target,
                                            self.hyperparameters["Critic"]["tau"])
         self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2,
                                            self.hyperparameters["Critic"]["tau"])
+
+    def update_actor_parameters(self, actor_loss, alpha_loss):
+        """Updates the parameters for the actor and (if specified) the temperature parameter"""
+        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
+                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
         if alpha_loss is not None:
             self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None)
             self.alpha = self.log_alpha.exp()
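For context on the entropy-tuning branch that update_actor_parameters keeps, here is a self-contained sketch of that temperature update with made-up batch values; the target entropy of -2.0 is an assumed heuristic, not a value taken from this repository.

import torch

log_alpha = torch.zeros(1, requires_grad=True)      # alpha = exp(log_alpha) starts at 1.0
alpha_optim = torch.optim.Adam([log_alpha], lr=3e-4)
target_entropy = -2.0                               # assumed target, e.g. the negative action dimensionality
log_pi = torch.tensor([[-1.2], [-0.7], [-2.3]])     # fake log-probabilities of sampled actions

alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()
alpha = log_alpha.exp()                             # mirrors self.alpha = self.log_alpha.exp() above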

agents/actor_critic_agents/SAC_Discrete.py

Lines changed: 4 additions & 5 deletions

@@ -30,7 +30,7 @@ def __init__(self, config):
         Base_Agent.copy_model_over(self.critic_local, self.critic_target)
         Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
         self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
-                                    self.config.seed)
+                                    self.config.seed, device=self.device)

         self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
         self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
@@ -52,7 +52,7 @@ def produce_action_and_action_info(self, state):
         """Given the state, produces an action, the probability of the action, the log probability of the action, and
         the argmax action"""
         action_probabilities = self.actor_local(state)
-        max_probability_action = torch.argmax(action_probabilities, dim=-1).unsqueeze(0)
+        max_probability_action = torch.argmax(action_probabilities, dim=-1)
         action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size)
         action = action_distribution.sample().cpu()
         # Have to deal with situation of 0.0 probabilities because we can't do log 0
@@ -69,7 +69,7 @@ def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch):
         qf1_next_target = self.critic_target(next_state_batch)
         qf2_next_target = self.critic_target_2(next_state_batch)
         min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities)
-        min_qf_next_target = min_qf_next_target.mean(dim=1).unsqueeze(-1)
+        min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)
         next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)

         qf1 = self.critic_local(state_batch).gather(1, action_batch.long())
@@ -85,7 +85,6 @@ def calculate_actor_loss(self, state_batch):
         qf2_pi = self.critic_local_2(state_batch)
         min_qf_pi = torch.min(qf1_pi, qf2_pi)
         inside_term = self.alpha * log_action_probabilities - min_qf_pi
-        policy_loss = action_probabilities * inside_term
-        policy_loss = policy_loss.mean()
+        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
         log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1)
         return policy_loss, log_action_probabilities
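The switch from .mean(dim=1) to .sum(dim=1) matters because, with explicit action probabilities, the expectation over actions is the sum over a of pi(a|s) * f(s, a); taking the mean divides that expectation by the number of actions. A toy check with made-up numbers:

import torch

probs = torch.tensor([[0.7, 0.2, 0.1]])       # pi(.|s) for a single 3-action state
q_min = torch.tensor([[1.0, 0.5, -0.2]])      # min of the two target critics (made up)
log_probs = probs.log()
alpha = 0.2

expectation = (probs * (q_min - alpha * log_probs)).sum(dim=1)   # per-state soft value, as in the new code
shrunk = (probs * (q_min - alpha * log_probs)).mean(dim=1)       # old code: the same expectation divided by 3
print(expectation, shrunk)

The actor-loss change is the same fix on the policy side: sum over the action dimension to form the per-state expectation, then average over the batch.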

utilities/Parallel_Experience_Generator.py

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ class Parallel_Experience_Generator(object):
     def __init__(self, environment, policy, seed, hyperparameters, action_size, use_GPU=False, action_choice_output_columns=None):
         self.use_GPU = use_GPU
         self.environment = environment
-        self.action_types = "DISCRETE" if self.environment.action_space.dtype == int else "CONTINUOUS"
+        self.action_types = "DISCRETE" if self.environment.action_space.dtype in [int, 'int64'] else "CONTINUOUS"
         self.action_size = action_size
         self.policy = policy
         self.action_choice_output_columns = action_choice_output_columns
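Background on the dtype test, as a sketch assuming a classic gym install: Discrete action spaces typically report a numpy int64 dtype, and comparing that dtype with Python's int can depend on the platform's default integer width, so the patched check also accepts the 'int64' string, which numpy matches against the dtype's name.

import numpy as np
from gym import spaces

action_space = spaces.Discrete(2)                     # e.g. CartPole's action space
print(action_space.dtype)                             # usually int64
print(action_space.dtype == int)                      # may be False depending on numpy's default int
print(action_space.dtype in [int, 'int64'])           # the patched check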

utilities/data_structures/Replay_Buffer.py

Lines changed: 6 additions & 3 deletions

@@ -6,13 +6,16 @@
 class Replay_Buffer(object):
     """Replay buffer to store past experiences that the agent can then use for training data"""

-    def __init__(self, buffer_size, batch_size, seed):
+    def __init__(self, buffer_size, batch_size, seed, device=None):

         self.memory = deque(maxlen=buffer_size)
         self.batch_size = batch_size
         self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
         self.seed = random.seed(seed)
-        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+        if device:
+            self.device = torch.device(device)
+        else:
+            self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

     def add_experience(self, states, actions, rewards, next_states, dones):
         """Adds experience(s) into the replay buffer"""
@@ -51,4 +54,4 @@ def pick_experiences(self, num_experiences=None):
         return random.sample(self.memory, k=batch_size)

     def __len__(self):
-        return len(self.memory)
+        return len(self.memory)
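A usage sketch of the new optional argument (the import path is assumed from the file's location; the hyperparameter values are made up):

from utilities.data_structures.Replay_Buffer import Replay_Buffer

auto_buffer = Replay_Buffer(buffer_size=100000, batch_size=256, seed=1)                # picks cuda:0 or cpu, as before
cpu_buffer = Replay_Buffer(buffer_size=100000, batch_size=256, seed=1, device="cpu")   # sampled tensors stay on the CPU

Passing device=self.device from the agents, as DQN and SAC_Discrete now do, keeps sampled tensors on the same device as the networks they feed.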
