From 783a2305651014c77e2931552674a4a3eb074217 Mon Sep 17 00:00:00 2001
From: Isaac David <61389980+orion160@users.noreply.github.com>
Date: Sat, 8 Jun 2024 21:13:37 +0000
Subject: [PATCH 1/6] [Docs] Update performance tuning guide

Added cuda graph explanation

Added core pinning section

Added tensor core usage section
---
 recipes_source/recipes/tuning_guide.py | 37 ++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
index 9f2c70a8921..95d30caf735 100644
--- a/recipes_source/recipes/tuning_guide.py
+++ b/recipes_source/recipes/tuning_guide.py
@@ -213,6 +213,7 @@ def gelu(x):
 
 ###############################################################################
 # Typically, the following environment variables are used to set for CPU affinity with GNU OpenMP implementation. ``OMP_PROC_BIND`` specifies whether threads may be moved between processors. Setting it to CLOSE keeps OpenMP threads close to the primary thread in contiguous place partitions. ``OMP_SCHEDULE`` determines how OpenMP threads are scheduled. ``GOMP_CPU_AFFINITY`` binds threads to specific CPUs.
+# An important tuning parameter is core pinning, which prevents threads from migrating between multiple CPUs, improving data locality and minimizing inter-core communication.
 #
 # .. code-block:: sh
 #
@@ -318,6 +319,42 @@ def gelu(x):
 # GPU specific optimizations
 # --------------------------
 
+###############################################################################
+# Enable Tensor cores
+# ~~~~~~~~~~~~~~~~~~~~~~~
+# Tensor cores are specialized hardware to compute matrix-matrix multiplication
+# operations which neural network operation can take advantage of.
+#
+# Hardware tensor core operations tend to use a different floating point format
+# which sacrifices precision at expense of speed gains.
+# Prior to pytorch 1.12 this was enabled by default but since this version
+# it must be explicitly set as it can conflict with some operations which do not
+# benefit from Tensor core computations.
+
+## Tensor computation can be enabled "manually" modifying the matrix multiplication precision
+## The default precision is "highest" which will perform the operation according to the dtype
+
+# precision "high" and "medium" can be hardware accelerated via tensor cores
+
+# Carefully consider the tradeoff between speed and precision at the moment of evaluating your models!
+torch.set_float32_matmul_precision("high")
+
+###############################################################################
+# Use CUDA Graphs
+# ~~~~~~~~~~~~~~~~~~~~~~~
+# At the time of using a GPU, work first must be launched from the CPU and
+# on some cases the context switch between CPU and GPU can lead to bad resource
+# utilization. CUDA graphs are a way to keep computation within the GPU without
+# paying the extra cost of kernel launches and host synchronization.
+
+# It can be enabled using 
+torch.compile(m, "reduce-overhead")
+# or
+torch.compile(m, "max-autotune")
+
+###############################################################################
+# Special care must be present when using cuda graphs as it can lead to increased memory consumption and some models might not compile.
+
 ###############################################################################
 # Enable cuDNN auto-tuner
 # ~~~~~~~~~~~~~~~~~~~~~~~

From 09d930d75efe8cd3ac1b3732aedb9b8dab749521 Mon Sep 17 00:00:00 2001
From: Isaac David <61389980+orion160@users.noreply.github.com>
Date: Mon, 10 Jun 2024 12:16:06 -0500
Subject: [PATCH 2/6] Update recipes_source/recipes/tuning_guide.py

Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
---
 recipes_source/recipes/tuning_guide.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
index 95d30caf735..fa9fcc202c7 100644
--- a/recipes_source/recipes/tuning_guide.py
+++ b/recipes_source/recipes/tuning_guide.py
@@ -322,7 +322,7 @@ def gelu(x):
 ###############################################################################
 # Enable Tensor cores
 # ~~~~~~~~~~~~~~~~~~~~~~~
-# Tensor cores are specialized hardware to compute matrix-matrix multiplication
+# Tensor cores are specialized hardware designed to compute matrix-matrix multiplication
 # operations which neural network operation can take advantage of.
 #
 # Hardware tensor core operations tend to use a different floating point format

From f2b023031ff265bbe8a25fb547117d41390195f6 Mon Sep 17 00:00:00 2001
From: Isaac David <61389980+orion160@users.noreply.github.com>
Date: Mon, 10 Jun 2024 12:16:32 -0500
Subject: [PATCH 3/6] Update recipes_source/recipes/tuning_guide.py

Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
---
 recipes_source/recipes/tuning_guide.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
index fa9fcc202c7..af64d691860 100644
--- a/recipes_source/recipes/tuning_guide.py
+++ b/recipes_source/recipes/tuning_guide.py
@@ -323,7 +323,7 @@ def gelu(x):
 # Enable Tensor cores
 # ~~~~~~~~~~~~~~~~~~~~~~~
 # Tensor cores are specialized hardware designed to compute matrix-matrix multiplication
-# operations which neural network operation can take advantage of.
+# operations, which neural network operations can take advantage of.
 #
 # Hardware tensor core operations tend to use a different floating point format
 # which sacrifices precision at expense of speed gains.

From 6984b23aab1cc3c6b65f9a0fa64279768fb5010d Mon Sep 17 00:00:00 2001
From: Isaac David <61389980+orion160@users.noreply.github.com>
Date: Mon, 10 Jun 2024 12:16:44 -0500
Subject: [PATCH 4/6] Update recipes_source/recipes/tuning_guide.py

Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
---
 recipes_source/recipes/tuning_guide.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
index af64d691860..06b7fb880ee 100644
--- a/recipes_source/recipes/tuning_guide.py
+++ b/recipes_source/recipes/tuning_guide.py
@@ -343,7 +343,7 @@ def gelu(x):
 # Use CUDA Graphs
 # ~~~~~~~~~~~~~~~~~~~~~~~
 # At the time of using a GPU, work first must be launched from the CPU and
-# on some cases the context switch between CPU and GPU can lead to bad resource
+# in some cases the context switch between CPU and GPU can lead to bad resource
 # utilization. CUDA graphs are a way to keep computation within the GPU without
 # paying the extra cost of kernel launches and host synchronization.
 

From 84107d90d21aa88ae5d1709d5ad525e2b4dd8510 Mon Sep 17 00:00:00 2001
From: Isaac David <61389980+orion160@users.noreply.github.com>
Date: Mon, 10 Jun 2024 12:19:50 -0500
Subject: [PATCH 5/6] Update recipes_source/recipes/tuning_guide.py

Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
---
 recipes_source/recipes/tuning_guide.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
index 06b7fb880ee..3cf781e3225 100644
--- a/recipes_source/recipes/tuning_guide.py
+++ b/recipes_source/recipes/tuning_guide.py
@@ -327,7 +327,7 @@ def gelu(x):
 #
 # Hardware tensor core operations tend to use a different floating point format
 # which sacrifices precision at expense of speed gains.
-# Prior to pytorch 1.12 this was enabled by default but since this version
+# Prior to PyTorch 1.12 this functionality was enabled by default but since this version
 # it must be explicitly set as it can conflict with some operations which do not
 # benefit from Tensor core computations.
 

From 6f3ff6ff7edd6c18cfcf25d881e5ce0fddff127f Mon Sep 17 00:00:00 2001
From: Isaac David <61389980+orion160@users.noreply.github.com>
Date: Mon, 10 Jun 2024 21:43:14 +0000
Subject: [PATCH 6/6] Update Tensor & CUDA graph section

---
 recipes_source/recipes/tuning_guide.py | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
index 3cf781e3225..dc1daae2584 100644
--- a/recipes_source/recipes/tuning_guide.py
+++ b/recipes_source/recipes/tuning_guide.py
@@ -323,21 +323,15 @@ def gelu(x):
 # Enable Tensor cores
 # ~~~~~~~~~~~~~~~~~~~~~~~
 # Tensor cores are specialized hardware designed to compute matrix-matrix multiplication
-# operations, which neural network operations can take advantage of.
+# operations, primarily utilized in deep learning and AI workloads. Tensor cores have
+# specific precision requirements which can be adjusted manually or via the Automatic
+# Mixed Precision API.
 #
-# Hardware tensor core operations tend to use a different floating point format
-# which sacrifices precision at expense of speed gains.
-# Prior to PyTorch 1.12 this functionality was enabled by default but since this version
-# it must be explicitly set as it can conflict with some operations which do not
-# benefit from Tensor core computations.
-
-## Tensor computation can be enabled "manually" modifying the matrix multiplication precision
-## The default precision is "highest" which will perform the operation according to the dtype
-
-# precision "high" and "medium" can be hardware accelerated via tensor cores
-
-# Carefully consider the tradeoff between speed and precision at the moment of evaluating your models!
-torch.set_float32_matmul_precision("high")
+# In particular, tensor operations take advantage of lower precision workloads,
+# which can be controlled via ``torch.set_float32_matmul_precision``.
+# The default format is set to 'highest', which utilizes the tensor data type.
+# However, PyTorch offers alternative precision settings: 'high' and 'medium'.
+# These options prioritize computational speed over numerical precision.
 
 ###############################################################################
 # Use CUDA Graphs
@@ -353,7 +347,8 @@ def gelu(x):
 torch.compile(m, "max-autotune")
 
 ###############################################################################
-# Special care must be present when using cuda graphs as it can lead to increased memory consumption and some models might not compile.
+# Support for CUDA graphs is in development, and their usage can incur increased
+# device memory consumption, and some models might not compile.
 
 ###############################################################################
 # Enable cuDNN auto-tuner