From fe9814e72c3364b8c785e60d6d0ef9d717313fc2 Mon Sep 17 00:00:00 2001
From: Aryan Gupta
Date: Sun, 12 Nov 2023 19:38:16 +0530
Subject: [PATCH 1/3] Add fix for ROCm GPU in pytorch profiler

---
 intermediate_source/tensorboard_profiler_tutorial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py
index e27c123f8e9..21ea6120707 100644
--- a/intermediate_source/tensorboard_profiler_tutorial.py
+++ b/intermediate_source/tensorboard_profiler_tutorial.py
@@ -7,7 +7,7 @@
 Introduction
 ------------
 PyTorch 1.8 includes an updated profiler API capable of
-recording the CPU side operations as well as the CUDA kernel launches on the GPU side.
+recording the CPU side operations as well as the CUDA kernel launches on the GPU side (ROCm AMD GPUs are not supported).
 The profiler can visualize this information
 in TensorBoard Plugin and provide analysis of the performance bottlenecks.
 

From 4c3dd3e91d8c400c2d714208be79b7ecc7f03753 Mon Sep 17 00:00:00 2001
From: Aryan Gupta
Date: Tue, 14 Nov 2023 18:20:36 +0530
Subject: [PATCH 2/3] Fix CI spelling error

---
 .../tensorboard_profiler_tutorial.py | 35 ++++++++++---------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py
index 21ea6120707..f04e18e00a8 100644
--- a/intermediate_source/tensorboard_profiler_tutorial.py
+++ b/intermediate_source/tensorboard_profiler_tutorial.py
@@ -7,7 +7,7 @@
 Introduction
 ------------
 PyTorch 1.8 includes an updated profiler API capable of
-recording the CPU side operations as well as the CUDA kernel launches on the GPU side (ROCm AMD GPUs are not supported).
+recording the CPU side operations as well as the CUDA kernel launches on the GPU side (``AMD ROCm™`` GPUs are not supported).
 The profiler can visualize this information
 in TensorBoard Plugin and provide analysis of the performance bottlenecks.
 
@@ -57,10 +57,11 @@
 # Transform it to the desired format and use ``DataLoader`` to load each batch.
 
 transform = T.Compose(
-    [T.Resize(224),
-     T.ToTensor(),
-     T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
-train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
+    [T.Resize(224), T.ToTensor(), T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
+)
+train_set = torchvision.datasets.CIFAR10(
+    root="./data", train=True, download=True, transform=transform
+)
 train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
 
 ######################################################################
@@ -68,7 +69,7 @@
 # To run on GPU, move model and loss to GPU device.
 
 device = torch.device("cuda:0")
-model = torchvision.models.resnet18(weights='IMAGENET1K_V1').cuda(device)
+model = torchvision.models.resnet18(weights="IMAGENET1K_V1").cuda(device)
 criterion = torch.nn.CrossEntropyLoss().cuda(device)
 optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
 model.train()
@@ -77,6 +78,7 @@
 ######################################################################
 # Define the training step for each batch of input data.
 
+
 def train(data):
     inputs, labels = data[0].to(device=device), data[1].to(device=device)
     outputs = model(inputs)
@@ -120,11 +122,11 @@ def train(data):
 # clicking a stack frame will navigate to the specific code line.
 
 with torch.profiler.profile(
-        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'),
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True
+    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/resnet18"),
+    record_shapes=True,
+    profile_memory=True,
+    with_stack=True,
 ) as prof:
     for step, batch_data in enumerate(train_loader):
         prof.step()  # Need to call this at each step to notify profiler of steps' boundary.
@@ -135,10 +137,11 @@ def train(data):
 ######################################################################
 # Alternatively, the following non-context manager start/stop is supported as well.
 prof = torch.profiler.profile(
-        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'),
-        record_shapes=True,
-        with_stack=True)
+    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/resnet18"),
+    record_shapes=True,
+    with_stack=True,
+)
 prof.start()
 for step, batch_data in enumerate(train_loader):
     prof.step()
@@ -356,7 +359,7 @@ def train(data):
 # ``aten::empty`` to allocate memory. For example, ``aten::ones`` is implemented as ``aten::empty`` followed by an
 # ``aten::fill_``. Solely display the operator name as ``aten::empty`` is of little help. It will be shown as
 # ``aten::ones (aten::empty)`` in this special case. The "Allocation Time", "Release Time" and "Duration"
-# columns' data might be missing if the event occurs outside of the time range. 
+# columns' data might be missing if the event occurs outside of the time range.
 #
 # In the memory statistics table, the "Size Increase" column sums up all allocation size and minus all the memory
 # release size, that is, the net increase of memory usage after this operator. The "Self Size Increase" column is

From 3aa5d06da2af1171e9147e023b92e995e76cc481 Mon Sep 17 00:00:00 2001
From: Aryan Gupta
Date: Tue, 14 Nov 2023 23:32:25 +0530
Subject: [PATCH 3/3] Remove unnecessary changes

---
 .../tensorboard_profiler_tutorial.py | 33 +++++++++----------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/intermediate_source/tensorboard_profiler_tutorial.py b/intermediate_source/tensorboard_profiler_tutorial.py
index f04e18e00a8..0923904c5a3 100644
--- a/intermediate_source/tensorboard_profiler_tutorial.py
+++ b/intermediate_source/tensorboard_profiler_tutorial.py
@@ -57,11 +57,10 @@
 # Transform it to the desired format and use ``DataLoader`` to load each batch.
 
 transform = T.Compose(
-    [T.Resize(224), T.ToTensor(), T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
-)
-train_set = torchvision.datasets.CIFAR10(
-    root="./data", train=True, download=True, transform=transform
-)
+    [T.Resize(224),
+     T.ToTensor(),
+     T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
 train_loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
 
 ######################################################################
@@ -69,7 +68,7 @@
 # To run on GPU, move model and loss to GPU device.
 
 device = torch.device("cuda:0")
-model = torchvision.models.resnet18(weights="IMAGENET1K_V1").cuda(device)
+model = torchvision.models.resnet18(weights='IMAGENET1K_V1').cuda(device)
 criterion = torch.nn.CrossEntropyLoss().cuda(device)
 optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
 model.train()
@@ -78,7 +77,6 @@
 ######################################################################
 # Define the training step for each batch of input data.
 
-
 def train(data):
     inputs, labels = data[0].to(device=device), data[1].to(device=device)
     outputs = model(inputs)
@@ -122,11 +120,11 @@ def train(data):
 # clicking a stack frame will navigate to the specific code line.
 
 with torch.profiler.profile(
-    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/resnet18"),
-    record_shapes=True,
-    profile_memory=True,
-    with_stack=True,
+        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'),
+        record_shapes=True,
+        profile_memory=True,
+        with_stack=True
 ) as prof:
     for step, batch_data in enumerate(train_loader):
         prof.step()  # Need to call this at each step to notify profiler of steps' boundary.
@@ -137,11 +135,10 @@ def train(data):
 ######################################################################
 # Alternatively, the following non-context manager start/stop is supported as well.
 prof = torch.profiler.profile(
-    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/resnet18"),
-    record_shapes=True,
-    with_stack=True,
-)
+        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+        on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/resnet18'),
+        record_shapes=True,
+        with_stack=True)
 prof.start()
 for step, batch_data in enumerate(train_loader):
     prof.step()
@@ -359,7 +356,7 @@ def train(data):
 # ``aten::empty`` to allocate memory. For example, ``aten::ones`` is implemented as ``aten::empty`` followed by an
 # ``aten::fill_``. Solely display the operator name as ``aten::empty`` is of little help. It will be shown as
 # ``aten::ones (aten::empty)`` in this special case. The "Allocation Time", "Release Time" and "Duration"
-# columns' data might be missing if the event occurs outside of the time range.
+# columns' data might be missing if the event occurs outside of the time range. 
 #
 # In the memory statistics table, the "Size Increase" column sums up all allocation size and minus all the memory
 # release size, that is, the net increase of memory usage after this operator. The "Self Size Increase" column is
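
For reference, the profiler usage these patches annotate boils down to the minimal sketch below. It mirrors the tutorial code shown in the hunks above; the random input batch, the fixed step count, and the CPU fallback are illustrative assumptions added here so the snippet runs standalone, and the wording added in PATCH 1/3 and 2/3 still applies (``AMD ROCm™`` GPUs are not supported).

    import torch
    import torchvision

    # Illustrative stand-ins for the tutorial's CIFAR10 training loop.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = torchvision.models.resnet18().to(device)
    inputs = torch.randn(4, 3, 224, 224, device=device)

    # Same schedule and TensorBoard trace handler as in the tutorial:
    # skip 1 step, warm up for 1, then record 3 active steps.
    with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/resnet18"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    ) as prof:
        for _ in range(7):
            prof.step()  # mark the step boundary so the schedule advances
            model(inputs)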