@@ -57,6 +57,8 @@ def setup_placeholders(self):
     # summary placeholder
     self.avg_episode_reward = tf.placeholder(
         tf.float32, [], 'avg_episode_reward')
+    self.greedy_episode_reward = tf.placeholder(
+        tf.float32, [], 'greedy_episode_reward')
 
     # sampling placeholders
     self.internal_state = tf.placeholder(tf.float32,
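The hunk above adds a `greedy_episode_reward` scalar placeholder alongside the existing `avg_episode_reward` one. As a minimal sketch of that pattern, assuming the TensorFlow 1.x API the file already uses (the log directory and reward value below are illustrative only):

import tensorflow as tf  # TF 1.x style API, as used in the diff

# Scalar placeholder fed once per training step, logged as a summary.
greedy_episode_reward = tf.placeholder(tf.float32, [], 'greedy_episode_reward')
summary_op = tf.summary.scalar('greedy_episode_reward', greedy_episode_reward)

with tf.Session() as sess:
  writer = tf.summary.FileWriter('/tmp/tb_logs')  # hypothetical log dir
  summary = sess.run(summary_op, feed_dict={greedy_episode_reward: 12.5})
  writer.add_summary(summary, global_step=0)
  writer.close()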
@@ -118,12 +120,13 @@ def setup_placeholders(self):
     self.prev_log_probs = tf.placeholder(tf.float32, [None, None],
                                          'prev_log_probs')
 
-  def setup(self):
+  def setup(self, train=True):
     """Setup Tensorflow Graph."""
 
     self.setup_placeholders()
 
     tf.summary.scalar('avg_episode_reward', self.avg_episode_reward)
+    tf.summary.scalar('greedy_episode_reward', self.greedy_episode_reward)
 
     with tf.variable_scope('model', reuse=None):
       # policy network
@@ -174,45 +177,46 @@ def setup(self):
           target_p.assign(aa * target_p + (1 - aa) * online_p)
           for online_p, target_p in zip(online_vars, target_vars)])
 
-    # evaluate objective
-    (self.loss, self.raw_loss, self.regression_target,
-     self.gradient_ops, self.summary) = self.objective.get(
-         self.rewards, self.pads,
-         self.values[:-1, :],
-         self.values[-1, :] * (1 - self.terminated),
-         self.log_probs, self.prev_log_probs, self.target_log_probs,
-         self.entropies,
-         self.logits)
-
-    self.regression_target = tf.reshape(self.regression_target, [-1])
-
-    self.policy_vars = [
-        v for v in tf.trainable_variables()
-        if '/policy_net' in v.name]
-    self.value_vars = [
-        v for v in tf.trainable_variables()
-        if '/value_net' in v.name]
-
-    # trust region optimizer
-    if self.trust_region_policy_opt is not None:
-      with tf.variable_scope('trust_region_policy', reuse=None):
-        avg_self_kl = (
-            tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
-            tf.reduce_sum(1 - self.pads))
-
-        self.trust_region_policy_opt.setup(
-            self.policy_vars, self.raw_loss, avg_self_kl,
-            self.avg_kl)
-
-    # value optimizer
-    if self.value_opt is not None:
-      with tf.variable_scope('trust_region_value', reuse=None):
-        self.value_opt.setup(
-            self.value_vars,
-            tf.reshape(self.values[:-1, :], [-1]),
-            self.regression_target,
-            tf.reshape(self.pads, [-1]),
-            self.regression_input, self.regression_weight)
+    if train:
+      # evaluate objective
+      (self.loss, self.raw_loss, self.regression_target,
+       self.gradient_ops, self.summary) = self.objective.get(
+           self.rewards, self.pads,
+           self.values[:-1, :],
+           self.values[-1, :] * (1 - self.terminated),
+           self.log_probs, self.prev_log_probs, self.target_log_probs,
+           self.entropies, self.logits, self.target_values[:-1, :],
+           self.target_values[-1, :] * (1 - self.terminated))
+
+      self.regression_target = tf.reshape(self.regression_target, [-1])
+
+      self.policy_vars = [
+          v for v in tf.trainable_variables()
+          if '/policy_net' in v.name]
+      self.value_vars = [
+          v for v in tf.trainable_variables()
+          if '/value_net' in v.name]
+
+      # trust region optimizer
+      if self.trust_region_policy_opt is not None:
+        with tf.variable_scope('trust_region_policy', reuse=None):
+          avg_self_kl = (
+              tf.reduce_sum(sum(self.self_kls) * (1 - self.pads)) /
+              tf.reduce_sum(1 - self.pads))
+
+          self.trust_region_policy_opt.setup(
+              self.policy_vars, self.raw_loss, avg_self_kl,
+              self.avg_kl)
+
+      # value optimizer
+      if self.value_opt is not None:
+        with tf.variable_scope('trust_region_value', reuse=None):
+          self.value_opt.setup(
+              self.value_vars,
+              tf.reshape(self.values[:-1, :], [-1]),
+              self.regression_target,
+              tf.reshape(self.pads, [-1]),
+              self.regression_input, self.regression_weight)
 
     # we re-use variables for the sampling operations
     with tf.variable_scope('model', reuse=True):
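The new `if train:` guard above builds the objective, gradient, and optimizer ops only when the graph is set up for training, so a sampling-only graph can skip them. A standalone sketch of that pattern, again assuming TF 1.x; the toy variable and loss below stand in for the policy/value networks and are not from the repo:

import tensorflow as tf  # TF 1.x style API

def setup(train=True):
  """Builds an inference op, and adds training ops only when train=True."""
  x = tf.placeholder(tf.float32, [None], 'x')
  w = tf.get_variable('w', [], initializer=tf.ones_initializer())
  prediction = w * x  # always built: needed for sampling/inference

  train_op = None
  if train:  # training-only ops, mirroring the guard in setup()
    target = tf.placeholder(tf.float32, [None], 'target')
    loss = tf.reduce_mean(tf.square(prediction - target))
    train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
  return prediction, train_op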
@@ -249,32 +253,42 @@ def sample_step(self, sess,
   def train_step(self, sess,
                  observations, internal_state, actions,
                  rewards, terminated, pads,
-                 avg_episode_reward=0):
+                 avg_episode_reward=0, greedy_episode_reward=0):
     """Train network using standard gradient descent."""
     outputs = [self.raw_loss, self.gradient_ops, self.summary]
     feed_dict = {self.internal_state: internal_state,
                  self.rewards: rewards,
                  self.terminated: terminated,
                  self.pads: pads,
-                 self.avg_episode_reward: avg_episode_reward}
+                 self.avg_episode_reward: avg_episode_reward,
+                 self.greedy_episode_reward: greedy_episode_reward}
+    time_len = None
     for action_place, action in zip(self.actions, actions):
+      if time_len is None:
+        time_len = len(action)
+      assert time_len == len(action)
       feed_dict[action_place] = action
     for obs_place, obs in zip(self.observations, observations):
+      assert time_len == len(obs)
       feed_dict[obs_place] = obs
 
+    assert len(rewards) == time_len - 1
+
     return sess.run(outputs, feed_dict=feed_dict)
 
 
   def trust_region_step(self, sess,
                         observations, internal_state, actions,
                         rewards, terminated, pads,
-                        avg_episode_reward=0):
+                        avg_episode_reward=0,
+                        greedy_episode_reward=0):
     """Train policy using trust region step."""
     feed_dict = {self.internal_state: internal_state,
                  self.rewards: rewards,
                  self.terminated: terminated,
                  self.pads: pads,
-                 self.avg_episode_reward: avg_episode_reward}
+                 self.avg_episode_reward: avg_episode_reward,
+                 self.greedy_episode_reward: greedy_episode_reward}
     for action_place, action in zip(self.actions, actions):
       feed_dict[action_place] = action
     for obs_place, obs in zip(self.observations, observations):
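The assertions added to `train_step` above encode a shape invariant: every action and observation sequence in the batch shares one time length, and `rewards` is one step shorter. A small self-contained check of the same invariant on plain Python lists (the function name and sample data are illustrative only):

def check_batch_shapes(actions, observations, rewards):
  """Checks the time-length invariant that train_step now asserts."""
  time_len = None
  for action in actions:
    if time_len is None:
      time_len = len(action)
    assert time_len == len(action), 'action sequences must share one length'
  for obs in observations:
    assert time_len == len(obs), 'observations must match the action length'
  assert len(rewards) == time_len - 1, 'rewards are one step shorter'
  return time_len

# Example: 3 time steps of actions/observations, 2 reward steps.
check_batch_shapes(actions=[[0, 1, 0]],
                   observations=[[0.1, 0.2, 0.3]],
                   rewards=[1.0, -1.0])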