Skip to content

Commit c617f98

Browse files
authored
Clean up staging tmp checkpoint directory (#28848)
Clean up the remaining tmp checkpoint dir.
Signed-off-by: woshiyyya <xiaoyunxuan1998@gmail.com>
1 parent 136cd89 commit c617f98

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

src/transformers/trainer.py

+4
Original file line number | Diff line number | Diff line change
@@ -2468,6 +2468,10 @@ def _save_checkpoint(self, model, trial, metrics=None):
24682468
# Solely rely on numerical checkpoint id for rotation.
24692469
# mtime is not reliable especially on some fuse fs in cloud environments.
24702470
self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)
2471+
elif self.is_local_process_zero():
2472+
# Clean up the remaining staging checkpoint folders on other nodes
2473+
if staging_output_dir != output_dir and os.path.exists(staging_output_dir):
2474+
shutil.rmtree(staging_output_dir)
24712475

24722476
self.args.distributed_state.wait_for_everyone()
24732477

0 commit comments

Comments
 (0)