Commit bc8e8a8

SAC discrete updated
1 parent 039ea22 commit bc8e8a8

11 files changed (+28 -28 lines changed)

agents/Base_Agent.py

Lines changed: 2 additions & 2 deletions

@@ -6,7 +6,7 @@
 import numpy as np
 import torch
 import time
-import tensorflow as tf
+# import tensorflow as tf
 from nn_builder.pytorch.NN import NN
 # from tensorboardX import SummaryWriter
 from torch.optim import optimizer
@@ -139,7 +139,7 @@ def set_random_seeds(self, random_seed):
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
         torch.manual_seed(random_seed)
-        tf.set_random_seed(random_seed)
+        # tf.set_random_seed(random_seed)
         random.seed(random_seed)
         np.random.seed(random_seed)
         if torch.cuda.is_available():

agents/DQN_agents/DQN.py

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@ def __init__(self, config):
         self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed)
         self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
         self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
-                                              lr=self.hyperparameters["learning_rate"])
+                                              lr=self.hyperparameters["learning_rate"], eps=1e-4)
         self.exploration_strategy = Epsilon_Greedy_Exploration(config)

     def reset_game(self):
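
A recurring change across this commit: every Adam optimizer is now constructed with an explicit eps=1e-4 instead of PyTorch's default of 1e-8. The eps argument is the constant added to the denominator of Adam's update; a larger value bounds the effective step size when the second-moment estimate is near zero, a common stabilisation tweak in RL training loops. A minimal standalone sketch of the difference (the toy model and learning rate below are placeholders, not the repository's values):

import torch
from torch import nn, optim

# Toy model purely to illustrate the optimizer construction.
model = nn.Linear(4, 2)

# PyTorch default: eps=1e-8 is added to sqrt(v_hat) in the denominator.
opt_default = optim.Adam(model.parameters(), lr=5e-4)

# This commit's variant: a larger eps caps the update when the
# second-moment estimate v_hat is tiny, since roughly
#   step = lr * m_hat / (sqrt(v_hat) + eps)
opt_stable = optim.Adam(model.parameters(), lr=5e-4, eps=1e-4)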

agents/DQN_agents/Dueling_DDQN.py

Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 import torch
 from torch import optim
 from agents.Base_Agent import Base_Agent
-from .DDQN import DDQN
+from agents.DQN_agents.DDQN import DDQN

 class Dueling_DDQN(DDQN):
     """A dueling double DQN agent as described in the paper http://proceedings.mlr.press/v48/wangf16.pdf"""
@@ -10,7 +10,7 @@ class Dueling_DDQN(DDQN):
     def __init__(self, config):
         DDQN.__init__(self, config)
         self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
-        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"])
+        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
         self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
         Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)

agents/actor_critic_agents/A2C.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-from .A3C import A3C
+from agents.actor_critic_agents.A3C import A3C

 class A2C(A3C):
     """Synchronous version of A2C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf. The only

agents/actor_critic_agents/A3C.py

Lines changed: 2 additions & 2 deletions

@@ -17,7 +17,7 @@ def __init__(self, config):
         self.num_processes = multiprocessing.cpu_count()
         self.worker_processes = max(1, self.num_processes - 2)
         self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
-        self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(), lr=self.hyperparameters["learning_rate"])
+        self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)

     def run_n_episodes(self):
         """Runs game to completion n times and then summarises results and saves model (if asked to)"""
@@ -89,7 +89,7 @@ def __init__(self, worker_num, environment, shared_model, counter, optimizer_loc
         self.set_seeds(self.worker_num)
         self.shared_model = shared_model
         self.local_model = local_model
-        self.local_optimizer = Adam(self.local_model.parameters(), lr=0.0)
+        self.local_optimizer = Adam(self.local_model.parameters(), lr=0.0, eps=1e-4)
         self.counter = counter
         self.optimizer_lock = optimizer_lock
         self.shared_optimizer = shared_optimizer

agents/actor_critic_agents/DDPG.py

Lines changed: 2 additions & 2 deletions

@@ -17,15 +17,15 @@ def __init__(self, config):
         Base_Agent.copy_model_over(self.critic_local, self.critic_target)

         self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
-                                           lr=self.hyperparameters["Critic"]["learning_rate"])
+                                           lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
         self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                     self.config.seed)
         self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
         self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
         Base_Agent.copy_model_over(self.actor_local, self.actor_target)

         self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
-                                          lr=self.hyperparameters["Actor"]["learning_rate"])
+                                          lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
         self.exploration_strategy = OU_Noise_Exploration(self.config)

     def step(self):

agents/actor_critic_agents/SAC.py

Lines changed: 4 additions & 4 deletions

@@ -26,9 +26,9 @@ def __init__(self, config):
         self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                              key_to_use="Critic", override_seed=self.config.seed + 1)
         self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
-                                                 lr=self.hyperparameters["Critic"]["learning_rate"])
+                                                 lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
         self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
-                                                   lr=self.hyperparameters["Critic"]["learning_rate"])
+                                                   lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
         self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                             key_to_use="Critic")
         self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
@@ -39,13 +39,13 @@ def __init__(self, config):
                                    self.config.seed)
         self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2, key_to_use="Actor")
         self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
-                                                lr=self.hyperparameters["Actor"]["learning_rate"])
+                                                lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
         self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
         if self.automatic_entropy_tuning:
             self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item()  # heuristic value from the paper
             self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
             self.alpha = self.log_alpha.exp()
-            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"])
+            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
         else:
             self.alpha = self.hyperparameters["entropy_term_weight"]
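
For context on the alpha_optim line above: when automatic_entropy_tuning is enabled, SAC learns the entropy weight alpha through log_alpha, adjusting it so that the policy's entropy tracks target_entropy. Below is a hedged sketch of the standard temperature loss from the SAC papers; it is not necessarily this repository's exact implementation, and the helper name and commented usage lines are hypothetical.

import torch

def temperature_loss(log_alpha, log_pi, target_entropy):
    # Standard SAC temperature objective:
    #   J(alpha) = E[ -alpha * (log_pi(a|s) + target_entropy) ]
    # Only log_alpha receives gradients; the policy term is detached.
    # (Implementations differ on whether alpha or log_alpha multiplies the bracket.)
    return -(log_alpha.exp() * (log_pi + target_entropy).detach()).mean()

# Hypothetical usage with the optimizer created in the diff:
#   loss = temperature_loss(self.log_alpha, log_pi, self.target_entropy)
#   self.alpha_optim.zero_grad(); loss.backward(); self.alpha_optim.step()
#   self.alpha = self.log_alpha.exp()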

agents/actor_critic_agents/SAC_Discrete.py

Lines changed: 10 additions & 10 deletions

@@ -4,7 +4,7 @@
 import numpy as np
 from agents.Base_Agent import Base_Agent
 from utilities.data_structures.Replay_Buffer import Replay_Buffer
-from .SAC import SAC
+from agents.actor_critic_agents.SAC import SAC
 from utilities.Utility_Functions import create_actor_distribution

 class SAC_Discrete(SAC):
@@ -20,9 +20,9 @@ def __init__(self, config):
         self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                              key_to_use="Critic", override_seed=self.config.seed + 1)
         self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
-                                                 lr=self.hyperparameters["Critic"]["learning_rate"])
+                                                 lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
         self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
-                                                   lr=self.hyperparameters["Critic"]["learning_rate"])
+                                                   lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
         self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                             key_to_use="Critic")
         self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
@@ -34,14 +34,14 @@ def __init__(self, config):

         self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
         self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
-                                                lr=self.hyperparameters["Actor"]["learning_rate"])
+                                                lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
         self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
         if self.automatic_entropy_tuning:
             # we set the max possible entropy as the target entropy
             self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
             self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
             self.alpha = self.log_alpha.exp()
-            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"])
+            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
         else:
             self.alpha = self.hyperparameters["entropy_term_weight"]
         assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment"
@@ -65,11 +65,11 @@ def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_
         """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
         term is taken into account"""
         with torch.no_grad():
-            next_state_action, (_, log_action_probabilities), _ = self.produce_action_and_action_info(next_state_batch)
-            next_state_log_pi = log_action_probabilities.gather(1, next_state_action.unsqueeze(-1).long())
-            qf1_next_target = self.critic_target(next_state_batch).gather(1, next_state_action.unsqueeze(-1).long())
-            qf2_next_target = self.critic_target_2(next_state_batch).gather(1, next_state_action.unsqueeze(-1).long())
-            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
+            next_state_action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(next_state_batch)
+            qf1_next_target = self.critic_target(next_state_batch)
+            qf2_next_target = self.critic_target_2(next_state_batch)
+            min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities)
+            min_qf_next_target = min_qf_next_target.mean(dim=1).unsqueeze(-1)
             next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)
             self.critic_target(next_state_batch).gather(1, next_state_action.unsqueeze(-1).long())
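
The substantive change behind the commit message is the last hunk above, in calculate_critic_losses: instead of bootstrapping from the Q-value of a single sampled next action, the critic target now uses the policy-weighted value over all discrete actions, as in the discrete-action SAC formulation (arXiv:1910.07207). In expectation form the soft next-state value is V(s') = sum_a pi(a|s') * (min(Q1, Q2)(s', a) - alpha * log pi(a|s')); note that the diff aggregates with .mean(dim=1), which differs from this sum by a constant factor of 1/num_actions. A hedged standalone sketch of the expectation form (tensor shapes assumed to be [batch, num_actions] for the policy and critic outputs and [batch, 1] for rewards and dones; the helper name is hypothetical):

import torch

def soft_bellman_target(action_probs, log_action_probs, q1_next, q2_next,
                        rewards, dones, alpha, gamma):
    # Per-action term: pi(a|s') * (min(Q1, Q2)(s', a) - alpha * log pi(a|s'))
    per_action = action_probs * (torch.min(q1_next, q2_next) - alpha * log_action_probs)
    # Expected soft value of s' under the current policy. The diff uses
    # .mean(dim=1) here instead, which rescales this value by 1/num_actions.
    v_next = per_action.sum(dim=1, keepdim=True)
    # Soft Q-learning target shared by both critics.
    return rewards + (1.0 - dones) * gamma * v_next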

agents/actor_critic_agents/TD3.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ def __init__(self, config):
                                               key_to_use="Critic")
         Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
         self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(),
-                                             lr=self.hyperparameters["Critic"]["learning_rate"])
+                                             lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
         self.exploration_strategy_critic = Gaussian_Exploration(self.config)

     def compute_critic_values_for_next_states(self, next_states):

agents/policy_gradient_agents/PPO.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ def __init__(self, config):
         self.policy_new = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size)
         self.policy_old = self.create_NN(input_dim=self.state_size, output_dim=self.policy_output_size)
         self.policy_old.load_state_dict(copy.deepcopy(self.policy_new.state_dict()))
-        self.policy_new_optimizer = optim.Adam(self.policy_new.parameters(), lr=self.hyperparameters["learning_rate"])
+        self.policy_new_optimizer = optim.Adam(self.policy_new.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
         self.episode_number = 0
         self.many_episode_states = []
         self.many_episode_actions = []
