
Commit 8a89152

updates to pcl_rl according to most recent version of Trust-PCL paper

1 parent 1887a5f · commit 8a89152
7 files changed: +182 -96 lines

research/pcl_rl/README.md (+18 -11)

@@ -67,20 +67,27 @@ python trainer.py --logtostderr --batch_size=25 --env=HalfCheetah-v1 \
   --max_divergence=0.05 --value_opt=best_fit --critic_weight=0.0 \
 ```
 
-Run Mujoco task with Trust-PCL:
+To run a Mujoco task using Trust-PCL (off-policy), use the command below.
+It should work well across all environments, provided that you
+search sufficiently over
+
+(1) max_divergence (0.001, 0.0005, and 0.002 are good values),
+
+(2) rollout (1, 5, and 10 are good values),
+
+(3) tf_seed (average over enough random seeds).
 
 ```
 python trainer.py --logtostderr --batch_size=1 --env=HalfCheetah-v1 \
-  --validation_frequency=50 --rollout=10 --critic_weight=0.0 \
-  --gamma=0.995 --clip_norm=40 --learning_rate=0.002 \
-  --replay_buffer_freq=1 --replay_buffer_size=20000 \
-  --replay_buffer_alpha=0.1 --norecurrent --objective=pcl \
-  --max_step=100 --tau=0.0 --eviction=fifo --max_divergence=0.001 \
-  --internal_dim=64 --cutoff_agent=1000 \
-  --replay_batch_size=25 --nouse_online_batch --batch_by_steps \
-  --sample_from=target --value_opt=grad --value_hidden_layers=2 \
-  --update_eps_lambda --unify_episodes --clip_adv=1.0 \
-  --target_network_lag=0.99 --prioritize_by=step
+  --validation_frequency=250 --rollout=1 --critic_weight=1.0 --gamma=0.995 \
+  --clip_norm=40 --learning_rate=0.0001 --replay_buffer_freq=1 \
+  --replay_buffer_size=5000 --replay_buffer_alpha=0.001 --norecurrent \
+  --objective=pcl --max_step=10 --cutoff_agent=1000 --tau=0.0 --eviction=fifo \
+  --max_divergence=0.001 --internal_dim=256 --replay_batch_size=64 \
+  --nouse_online_batch --batch_by_steps --value_hidden_layers=2 \
+  --update_eps_lambda --nounify_episodes --target_network_lag=0.99 \
+  --sample_from=online --clip_adv=1 --prioritize_by=step --num_steps=1000000 \
+  --noinput_prev_actions --use_target_values --tf_seed=57
 ```
 
 Run Mujoco task with PCL constraint trust region:
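
The hyperparameter guidance added to the README above (search over max_divergence, rollout, and tf_seed) can be automated with a small driver script. The following is a minimal, purely illustrative sketch, not part of this commit; it assumes trainer.py is in the working directory and accepts the flags shown in the command above, and the sweep values simply mirror the README's suggestions.

```
# Illustrative sweep over the three flags the README suggests tuning.
# Assumes trainer.py is in the working directory and accepts the flags
# from the command above; append the remaining flags to BASE_CMD as needed.
import itertools
import subprocess

BASE_CMD = [
    'python', 'trainer.py', '--logtostderr', '--batch_size=1',
    '--env=HalfCheetah-v1', '--objective=pcl', '--sample_from=online',
    # ... remaining Trust-PCL flags from the README command ...
]

MAX_DIVERGENCES = [0.0005, 0.001, 0.002]  # values the README calls good
ROLLOUTS = [1, 5, 10]
TF_SEEDS = range(5)  # average results over several random seeds

for max_div, rollout, seed in itertools.product(
    MAX_DIVERGENCES, ROLLOUTS, TF_SEEDS):
  cmd = BASE_CMD + [
      '--max_divergence=%g' % max_div,
      '--rollout=%d' % rollout,
      '--tf_seed=%d' % seed,
  ]
  print('Running: %s' % ' '.join(cmd))
  subprocess.check_call(cmd)
```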

research/pcl_rl/controller.py (+19 -10)

@@ -109,13 +109,14 @@ def __init__(self, env, env_spec, internal_dim,
     self.episode_running_rewards = np.zeros(len(self.env))
     self.episode_running_lengths = np.zeros(len(self.env))
     self.episode_rewards = []
+    self.greedy_episode_rewards = []
     self.episode_lengths = []
     self.total_rewards = []
 
     self.best_batch_rewards = None
 
-  def setup(self):
-    self.model.setup()
+  def setup(self, train=True):
+    self.model.setup(train=train)
 
   def initial_internal_state(self):
     return np.zeros(self.model.policy.rnn_state_dim)

@@ -187,7 +188,7 @@ def _sample_episodes(self, sess, greedy=False):
 
     return initial_state, all_obs, all_act, rewards, all_pad
 
-  def sample_episodes(self, sess):
+  def sample_episodes(self, sess, greedy=False):
     """Sample steps from the environment until we have enough for a batch."""
 
     # check if last batch ended with episode that was not terminated

@@ -200,7 +201,7 @@ def sample_episodes(self, sess):
     while total_steps < self.max_step * len(self.env):
       (initial_state,
        observations, actions, rewards,
-       pads) = self._sample_episodes(sess)
+       pads) = self._sample_episodes(sess, greedy=greedy)
 
       observations = zip(*observations)
       actions = zip(*actions)

@@ -249,19 +250,26 @@ def _train(self, sess,
             observations, initial_state, actions,
             rewards, terminated, pads):
     """Train model using batch."""
+    avg_episode_reward = np.mean(self.episode_rewards)
+    greedy_episode_reward = (np.mean(self.greedy_episode_rewards)
+                             if self.greedy_episode_rewards else
+                             avg_episode_reward)
+    loss, summary = None, None
     if self.use_trust_region:
       # use trust region to optimize policy
       loss, _, summary = self.model.trust_region_step(
           sess,
           observations, initial_state, actions,
           rewards, terminated, pads,
-          avg_episode_reward=np.mean(self.episode_rewards))
+          avg_episode_reward=avg_episode_reward,
+          greedy_episode_reward=greedy_episode_reward)
     else:  # otherwise use simple gradient descent on policy
       loss, _, summary = self.model.train_step(
           sess,
           observations, initial_state, actions,
           rewards, terminated, pads,
-          avg_episode_reward=np.mean(self.episode_rewards))
+          avg_episode_reward=avg_episode_reward,
+          greedy_episode_reward=greedy_episode_reward)
 
     if self.use_value_opt:  # optionally perform specific value optimization
       self.model.fit_values(

@@ -305,7 +313,8 @@ def train(self, sess):
     if self.update_eps_lambda:
       episode_rewards = np.array(self.episode_rewards)
       episode_lengths = np.array(self.episode_lengths)
-      eps_lambda = find_best_eps_lambda(episode_rewards, episode_lengths)
+      eps_lambda = find_best_eps_lambda(
+          episode_rewards[-20:], episode_lengths[-20:])
       sess.run(self.model.objective.assign_eps_lambda,
                feed_dict={self.model.objective.new_eps_lambda: eps_lambda})
 

@@ -328,10 +337,10 @@ def eval(self, sess):
     """Use greedy sampling."""
     (initial_state,
      observations, actions, rewards,
-     pads) = self._sample_episodes(sess, greedy=True)
+     pads, terminated) = self.sample_episodes(sess, greedy=True)
 
     total_rewards = np.sum(np.array(rewards) * (1 - np.array(pads)), axis=0)
-    return np.mean(total_rewards)
+    return total_rewards, self.episode_rewards
 
   def convert_from_batched_episodes(
       self, initial_state, observations, actions, rewards,

@@ -351,7 +360,7 @@ def convert_from_batched_episodes(
     for i in xrange(num_episodes):
      length = total_length[i]
      ep_initial = initial_state[i]
-      ep_obs = [obs[:length, i, ...] for obs in observations]
+      ep_obs = [obs[:length + 1, i, ...] for obs in observations]
      ep_act = [act[:length + 1, i, ...] for act in actions]
      ep_rewards = rewards[:length, i]
 
research/pcl_rl/full_episode_objective.py (+2 -1)

@@ -42,7 +42,8 @@ def get_bonus(self, total_rewards, total_log_probs):
 
   def get(self, rewards, pads, values, final_values,
           log_probs, prev_log_probs, target_log_probs,
-          entropies, logits):
+          entropies, logits,
+          target_values, final_target_values):
     seq_length = tf.shape(rewards)[0]
 
     not_pad = tf.reshape(1 - pads, [seq_length, -1, self.num_samples])

research/pcl_rl/model.py (+58 -44)

@@ -57,6 +57,8 @@ def setup_placeholders(self):
     # summary placeholder
     self.avg_episode_reward = tf.placeholder(
         tf.float32, [], 'avg_episode_reward')
+    self.greedy_episode_reward = tf.placeholder(
+        tf.float32, [], 'greedy_episode_reward')
 
     # sampling placeholders
     self.internal_state = tf.placeholder(tf.float32,

@@ -118,12 +120,13 @@ def setup_placeholders(self):
     self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
                                          'prev_log_probs')
 
-  def setup(self):
+  def setup(self, train=True):
     """Setup Tensorflow Graph."""
 
     self.setup_placeholders()
 
     tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
+    tf.summary.scalar('greedy_episode_reward', self.greedy_episode_reward)
 
     with tf.variable_scope('model', reuse=None):
       # policy network

@@ -174,45 +177,46 @@ def setup(self):
           target_p.assign(aa * target_p + (1 - aa) * online_p)
           for online_p, target_p in zip(online_vars, target_vars)])
 
-    # evaluate objective
-    (self.loss, self.raw_loss, self.regression_target,
-     self.gradient_ops, self.summary) = self.objective.get(
-        self.rewards, self.pads,
-        self.values[:-1, :],
-        self.values[-1, :] * (1 - self.terminated),
-        self.log_probs, self.prev_log_probs, self.target_log_probs,
-        self.entropies,
-        self.logits)
-
-    self.regression_target = tf.reshape(self.regression_target, [-1])
-
-    self.policy_vars = [
-        v for v in tf.trainable_variables()
-        if '/policy_net' in v.name]
-    self.value_vars = [
-        v for v in tf.trainable_variables()
-        if '/value_net' in v.name]
-
-    # trust region optimizer
-    if self.trust_region_policy_opt is not None:
-      with tf.variable_scope('trust_region_policy', reuse=None):
-        avg_self_kl = (
-            tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
-            tf.reduce_sum(1 - self.pads))
-
-        self.trust_region_policy_opt.setup(
-            self.policy_vars, self.raw_loss, avg_self_kl,
-            self.avg_kl)
-
-    # value optimizer
-    if self.value_opt is not None:
-      with tf.variable_scope('trust_region_value', reuse=None):
-        self.value_opt.setup(
-            self.value_vars,
-            tf.reshape(self.values[:-1, :], [-1]),
-            self.regression_target,
-            tf.reshape(self.pads, [-1]),
-            self.regression_input, self.regression_weight)
+    if train:
+      # evaluate objective
+      (self.loss, self.raw_loss, self.regression_target,
+       self.gradient_ops, self.summary) = self.objective.get(
+          self.rewards, self.pads,
+          self.values[:-1, :],
+          self.values[-1, :] * (1 - self.terminated),
+          self.log_probs, self.prev_log_probs, self.target_log_probs,
+          self.entropies, self.logits, self.target_values[:-1, :],
+          self.target_values[-1, :] * (1 - self.terminated))
+
+      self.regression_target = tf.reshape(self.regression_target, [-1])
+
+      self.policy_vars = [
+          v for v in tf.trainable_variables()
+          if '/policy_net' in v.name]
+      self.value_vars = [
+          v for v in tf.trainable_variables()
+          if '/value_net' in v.name]
+
+      # trust region optimizer
+      if self.trust_region_policy_opt is not None:
+        with tf.variable_scope('trust_region_policy', reuse=None):
+          avg_self_kl = (
+              tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
+              tf.reduce_sum(1 - self.pads))
+
+          self.trust_region_policy_opt.setup(
+              self.policy_vars, self.raw_loss, avg_self_kl,
+              self.avg_kl)
+
+      # value optimizer
+      if self.value_opt is not None:
+        with tf.variable_scope('trust_region_value', reuse=None):
+          self.value_opt.setup(
+              self.value_vars,
+              tf.reshape(self.values[:-1, :], [-1]),
+              self.regression_target,
+              tf.reshape(self.pads, [-1]),
+              self.regression_input, self.regression_weight)
 
     # we re-use variables for the sampling operations
     with tf.variable_scope('model', reuse=True):

@@ -249,32 +253,42 @@ def sample_step(self, sess,
   def train_step(self, sess,
                  observations, internal_state, actions,
                  rewards, terminated, pads,
-                 avg_episode_reward=0):
+                 avg_episode_reward=0, greedy_episode_reward=0):
     """Train network using standard gradient descent."""
     outputs = [self.raw_loss, self.gradient_ops, self.summary]
     feed_dict = {self.internal_state: internal_state,
                  self.rewards: rewards,
                  self.terminated: terminated,
                  self.pads: pads,
-                 self.avg_episode_reward: avg_episode_reward}
+                 self.avg_episode_reward: avg_episode_reward,
+                 self.greedy_episode_reward: greedy_episode_reward}
+    time_len = None
     for action_place, action in zip(self.actions, actions):
+      if time_len is None:
+        time_len = len(action)
+      assert time_len == len(action)
       feed_dict[action_place] = action
     for obs_place, obs in zip(self.observations, observations):
+      assert time_len == len(obs)
       feed_dict[obs_place] = obs
 
+    assert len(rewards) == time_len - 1
+
     return sess.run(outputs, feed_dict=feed_dict)
 
 
   def trust_region_step(self, sess,
                         observations, internal_state, actions,
                         rewards, terminated, pads,
-                        avg_episode_reward=0):
+                        avg_episode_reward=0,
+                        greedy_episode_reward=0):
     """Train policy using trust region step."""
     feed_dict = {self.internal_state: internal_state,
                  self.rewards: rewards,
                  self.terminated: terminated,
                  self.pads: pads,
-                 self.avg_episode_reward: avg_episode_reward}
+                 self.avg_episode_reward: avg_episode_reward,
+                 self.greedy_episode_reward: greedy_episode_reward}
     for action_place, action in zip(self.actions, actions):
       feed_dict[action_place] = action
     for obs_place, obs in zip(self.observations, observations):
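
The new assertions in train_step make the batch layout explicit, matching the controller change that now slices observations with length + 1: each observation and action stream spans one more time step than the rewards. The following is a standalone check in the same spirit, written for this note only (it uses made-up array shapes rather than the repo's actual batching code):

```
# Illustrative only: mirrors the time-length convention asserted in
# train_step, where observation/action streams carry T + 1 steps and
# rewards carry T steps for an episode of T transitions.
import numpy as np

def check_batch_shapes(observations, actions, rewards):
  """Raise if the time dimensions break the T+1 / T convention."""
  time_len = len(actions[0])
  assert all(len(act) == time_len for act in actions)
  assert all(len(obs) == time_len for obs in observations)
  assert len(rewards) == time_len - 1

# Example: a batch of 4 padded episodes, each with T = 10 transitions.
T, batch = 10, 4
obs = [np.zeros((T + 1, batch, 17))]  # one observation stream
act = [np.zeros((T + 1, batch, 6))]   # one action stream
rew = np.zeros((T, batch))
check_batch_shapes(obs, act, rew)
```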
