Introduction
------------

PyTorch 1.8 includes an updated profiler API capable of
-recording the CPU side operations as well as the CUDA kernel launches on the GPU side (ROCm AMD GPUs are not supported).
+recording the CPU side operations as well as the CUDA kernel launches on the GPU side (``AMD ROCm™`` GPUs are not supported).
The profiler can visualize this information
in TensorBoard Plugin and provide analysis of the performance bottlenecks.

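As a quick orientation before the code below, here is a minimal sketch of what this API looks like when recording both CPU and CUDA activity (the ``activities`` argument and the toy matmul are illustrative only; the tutorial's actual arguments appear in the hunks that follow):

    import torch
    from torch.profiler import profile, ProfilerActivity

    # Record CPU-side operator calls and GPU-side CUDA kernels for a small workload.
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
        x = torch.randn(1024, 1024, device="cuda")
        y = x @ x

    # Print an aggregated summary; the TensorBoard plugin presents the same data visually.
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=5))
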
# Transform it to the desired format and use ``DataLoader`` to load each batch.

transform = T.Compose(
-    [T.Resize(224),
-     T.ToTensor(),
-     T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
-train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
+    [T.Resize(224), T.ToTensor(), T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
+)
+train_set = torchvision.datasets.CIFAR10(
+    root="./data", train=True, download=True, transform=transform
+)
train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)

######################################################################
# Next, create the Resnet model, loss function, and optimizer objects.
# To run on GPU, move the model and loss to the GPU device.

device = torch.device("cuda:0")
-model = torchvision.models.resnet18(weights='IMAGENET1K_V1').cuda(device)
+model = torchvision.models.resnet18(weights="IMAGENET1K_V1").cuda(device)
criterion = torch.nn.CrossEntropyLoss().cuda(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
model.train()


######################################################################
# Define the training step for each batch of input data.

+
def train(data):
    inputs, labels = data[0].to(device=device), data[1].to(device=device)
    outputs = model(inputs)
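The hunk stops partway through ``train``; for readers following along, the function presumably continues with the usual forward/backward/update steps, roughly like this sketch (not necessarily the file's exact lines; ``criterion`` and ``optimizer`` are the objects created above):

    def train(data):
        inputs, labels = data[0].to(device=device), data[1].to(device=device)
        outputs = model(inputs)
        loss = criterion(outputs, labels)  # compute the classification loss
        optimizer.zero_grad()              # clear gradients from the previous step
        loss.backward()                    # backpropagate
        optimizer.step()                   # update the model parameters
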
@@ -120,11 +122,11 @@ def train(data):
# clicking a stack frame will navigate to the specific code line.

with torch.profiler.profile(
-        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'),
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True
+    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/resnet18"),
+    record_shapes=True,
+    profile_memory=True,
+    with_stack=True,
) as prof:
    for step, batch_data in enumerate(train_loader):
        prof.step()  # Need to call this at each step to notify profiler of steps' boundary.
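For reference, ``schedule(wait=1, warmup=1, active=3, repeat=1)`` means the profiler idles on step 0, warms up on step 1, records steps 2 through 4, and then invokes the trace handler once, so the training loop only needs a handful of iterations. A sketch of bounding the loop accordingly inside the ``with`` block (the exact loop body in the file may differ):

    for step, batch_data in enumerate(train_loader):
        if step >= 1 + 1 + 3:  # wait + warmup + active steps: one full profiling cycle
            break
        train(batch_data)
        prof.step()  # mark the step boundary so the schedule can advance
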
@@ -135,10 +137,11 @@ def train(data):
######################################################################
# Alternatively, the following non-context manager start/stop is supported as well.
prof = torch.profiler.profile(
-        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'),
-        record_shapes=True,
-        with_stack=True)
+    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/resnet18"),
+    record_shapes=True,
+    with_stack=True,
+)
prof.start()
for step, batch_data in enumerate(train_loader):
    prof.step()
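One thing to note with the start/stop form (the hunk is cut off before this point): without a ``with`` block there is nothing to close the profiler automatically, so a matching ``prof.stop()`` is needed once the loop finishes, along these lines:

    prof.start()
    for step, batch_data in enumerate(train_loader):
        if step >= 1 + 1 + 3:
            break
        train(batch_data)
        prof.step()
    prof.stop()  # end the session and finalize the collected events
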
@@ -356,7 +359,7 @@ def train(data):
# ``aten::empty`` to allocate memory. For example, ``aten::ones`` is implemented as ``aten::empty`` followed by an
# ``aten::fill_``. Displaying the operator name solely as ``aten::empty`` would be of little help, so it is shown as
# ``aten::ones (aten::empty)`` in this special case. The "Allocation Time", "Release Time" and "Duration"
-# columns' data might be missing if the event occurs outside of the time range.
+# columns' data might be missing if the event occurs outside of the time range.
#
# In the memory statistics table, the "Size Increase" column sums up all allocation sizes and subtracts all the
# memory release sizes, that is, the net increase of memory usage after this operator. The "Self Size Increase" column is
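The same accounting can also be checked outside of TensorBoard: with ``profile_memory=True`` set as above, the profiler's aggregated table can be printed directly, for example (a sketch; the sort key assumes a CUDA run):

    # Show which operators allocated the most device memory themselves.
    print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))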