Skip to content

Commit 35e35f1

Browse files
committed
Add PyTorch Profiler in Tensorboard logging
1 parent 4822798 commit 35e35f1

File tree

6 files changed

+46
-21
lines changed

6 files changed

+46
-21
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ See our [paper](https://arxiv.org/pdf/2010.04642.pdf) at 3DV for an overview of
1616

1717
- CUDA 10 or higher (if you want GPU version)
1818
- Python 3.7 or higher + headers (python-dev)
19-
- PyTorch 1.7 or higher (PyTorch >= 1.9 is recommended)
19+
- PyTorch 1.8.1 or higher (PyTorch >= 1.9 is recommended)
2020
- A Sparse convolution backend (optional) see [here](https://github.com/nicolas-chaulet/torch-points3d#3d-sparse-convolution-support) for installation instructions
2121

2222
Install with

conf/training/default.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,4 @@ wandb:
4141
# parameters for TensorBoard Visualization
4242
tensorboard:
4343
log: True
44+
pytorch_profiler: True

docker/install_python.sh

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@ python3 -m pip install -U pip
1010
pip3 install setuptools>=41.0.0
1111
if [ $1 == "gpu" ]; then
1212
echo "Install GPU"
13-
pip3 install torch==1.7.0 torchvision==0.8.1
13+
pip3 install torch==1.8.1 torchvision==0.8.1
1414
pip3 install MinkowskiEngine --install-option="--force_cuda" --install-option="--cuda_home=/usr/local/cuda"
1515
pip3 install git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0 -v
1616
pip3 install pycuda
1717
else
1818
echo "Install CPU"
19-
pip3 install torch==1.7.0+cpu torchvision==0.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
19+
pip3 install torch==1.8.1+cpu torchvision==0.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
2020
pip3 install MinkowskiEngine
2121
pip3 install git+https://github.com/mit-han-lab/torchsparse.git@v1.4.0
2222
fi

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ tqdm = "^4.40"
1818
open3d = "0.12.0"
1919
torchnet = "^0.0.4"
2020
tensorboard = "^2.1"
21-
torch = "^1.7.0"
21+
torch = "^1.8.1"
2222
torch-scatter = "^2.0.0"
2323
torch-sparse = "^0.6.10"
2424
torch-cluster = "^1.5.6"

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ tensorboard-plugin-wit==1.8.0
122122
terminado==0.10.0
123123
testpath==0.5.0
124124
threadpoolctl==2.1.0
125-
torch==1.7.0
125+
torch==1.8.1
126126
torch-cluster==1.5.9
127127
torch-geometric==1.7.1
128128
torch-points-kernels==0.7.0

torch_points3d/trainer.py

+40-16
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
from torch_points3d.utils.wandb_utils import Wandb
2727
from torch_points3d.visualization import Visualizer
2828

29+
# PyTorch Profiler import
30+
import torch.profiler
31+
from contextlib import nullcontext
32+
2933
log = logging.getLogger(__name__)
3034

3135

@@ -136,27 +140,37 @@ def _initialize_trainer(self):
136140
def train(self):
137141
self._is_training = True
138142

139-
for epoch in range(self._checkpoint.start_epoch, self._cfg.training.epochs):
140-
log.info("EPOCH %i / %i", epoch, self._cfg.training.epochs)
143+
with (torch.profiler.profile(
144+
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
145+
schedule=torch.profiler.schedule(skip_first=5, wait=0, warmup=1, active=3),
146+
on_trace_ready=torch.profiler.tensorboard_trace_handler(self._tracker._tensorboard_dir),
147+
record_shapes=True,
148+
profile_memory=True,
149+
with_stack=True
150+
) if self.pytorch_profiler_log else nullcontext()) as prof:
141151

142-
self._train_epoch(epoch)
152+
for epoch in range(self._checkpoint.start_epoch, self._cfg.training.epochs):
153+
log.info("EPOCH %i / %i", epoch, self._cfg.training.epochs)
143154

144-
if self.profiling:
145-
return 0
155+
with (torch.profiler.record_function('train_epoch') if self.pytorch_profiler_log else nullcontext()):
156+
self._train_epoch(epoch, prof)
146157

147-
if epoch % self.eval_frequency != 0:
148-
continue
158+
if self.profiling:
159+
return 0
149160

150-
if self._dataset.has_val_loader:
151-
self._test_epoch(epoch, "val")
161+
if epoch % self.eval_frequency != 0:
162+
continue
152163

153-
if self._dataset.has_test_loaders:
154-
self._test_epoch(epoch, "test")
164+
if self._dataset.has_val_loader:
165+
self._test_epoch(epoch, "val")
155166

156-
# Single test evaluation in resume case
157-
if self._checkpoint.start_epoch > self._cfg.training.epochs:
158-
if self._dataset.has_test_loaders:
159-
self._test_epoch(epoch, "test")
167+
if self._dataset.has_test_loaders:
168+
self._test_epoch(epoch, "test")
169+
170+
# Single test evaluation in resume case
171+
if self._checkpoint.start_epoch > self._cfg.training.epochs:
172+
if self._dataset.has_test_loaders:
173+
self._test_epoch(epoch, "test")
160174

161175
def eval(self, stage_name=""):
162176
self._is_training = False
@@ -180,7 +194,7 @@ def _finalize_epoch(self, epoch):
180194
if self._tracker._stage == "train":
181195
log.info("Learning rate = %f" % self._model.learning_rate)
182196

183-
def _train_epoch(self, epoch: int):
197+
def _train_epoch(self, epoch: int, prof: torch.profiler.profile):
184198

185199
self._model.train()
186200
self._tracker.reset("train")
@@ -210,6 +224,9 @@ def _train_epoch(self, epoch: int):
210224

211225
iter_data_time = time.time()
212226

227+
if self.pytorch_profiler_log:
228+
prof.step()
229+
213230
if self.early_break:
214231
break
215232

@@ -314,6 +331,13 @@ def tensorboard_log(self):
314331
else:
315332
return False
316333

334+
@property
335+
def pytorch_profiler_log(self):
336+
if self.tensorboard_log:
337+
return getattr(self._cfg.training.tensorboard, "pytorch_profiler", False)
338+
else:
339+
return False
340+
317341
@property
318342
def tracker_options(self):
319343
return self._cfg.get("tracker_options", {})

0 commit comments

Comments
 (0)