From 783a2305651014c77e2931552674a4a3eb074217 Mon Sep 17 00:00:00 2001 From: Isaac David <61389980+orion160@users.noreply.github.com> Date: Sat, 8 Jun 2024 21:13:37 +0000 Subject: [PATCH 1/6] [Docs] Update performance tuning guide Added cuda graph explanation Added core pinning section Added tensor core usage section --- recipes_source/recipes/tuning_guide.py | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index 9f2c70a8921..95d30caf735 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -213,6 +213,7 @@ def gelu(x): ############################################################################### # Typically, the following environment variables are used to set for CPU affinity with GNU OpenMP implementation. ``OMP_PROC_BIND`` specifies whether threads may be moved between processors. Setting it to CLOSE keeps OpenMP threads close to the primary thread in contiguous place partitions. ``OMP_SCHEDULE`` determines how OpenMP threads are scheduled. ``GOMP_CPU_AFFINITY`` binds threads to specific CPUs. +# An important tuning parameter is core pinning, which prevents threads from migrating between multiple CPUs, enhancing data locality and minimizing inter-core communication. # # .. code-block:: sh # @@ -318,6 +319,42 @@ def gelu(x): # GPU specific optimizations # -------------------------- +############################################################################### +# Enable Tensor cores +# ~~~~~~~~~~~~~~~~~~~~~~~ +# Tensor cores are specialized hardware to compute matrix-matrix multiplication +# operations which neural network operation can take advantage of. +# +# Hardware tensor core operations tend to use a different floating point format +# which sacrifices precision at expense of speed gains. 
+# Prior to pytorch 1.12 this was enabled by default but since this version +# it must be explicitly set as it can conflict with some operations which do not +# benefit from Tensor core computations. + +## Tensor computation can be enabled "manually" modifying the matrix multiplication precision +## The default precision is "highest" which will perform the operation according to the dtype + +# precision "high" and "medium" can be hardware accelerated via tensor cores + +# Carefully consider the tradeoff between speed and precision at the moment of evaluating your models! +torch.set_float32_matmul_precision("high") + +############################################################################### +# Use CUDA Graphs +# ~~~~~~~~~~~~~~~~~~~~~~~ +# At the time of using a GPU, work first must be launched from the CPU and +# on some cases the context switch between CPU and GPU can lead to bad resource +# utilization. CUDA graphs are a way to keep computation within the GPU without +# paying the extra cost of kernel launches and host synchronization. + +# It can be enabled using +torch.compile(m, "reduce-overhead") +# or +torch.compile(m, "max-autotune") + +############################################################################### +# Special care must be present when using cuda graphs as it can lead to increased memory consumption and some models might not compile. 
+ ############################################################################### # Enable cuDNN auto-tuner # ~~~~~~~~~~~~~~~~~~~~~~~ From 09d930d75efe8cd3ac1b3732aedb9b8dab749521 Mon Sep 17 00:00:00 2001 From: Isaac David <61389980+orion160@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:16:06 -0500 Subject: [PATCH 2/6] Update recipes_source/recipes/tuning_guide.py Co-authored-by: Svetlana Karslioglu --- recipes_source/recipes/tuning_guide.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index 95d30caf735..fa9fcc202c7 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -322,7 +322,7 @@ def gelu(x): ############################################################################### # Enable Tensor cores # ~~~~~~~~~~~~~~~~~~~~~~~ -# Tensor cores are specialized hardware to compute matrix-matrix multiplication +# Tensor cores are specialized hardware designed to compute matrix-matrix multiplication # operations which neural network operation can take advantage of. 
# # Hardware tensor core operations tend to use a different floating point format From f2b023031ff265bbe8a25fb547117d41390195f6 Mon Sep 17 00:00:00 2001 From: Isaac David <61389980+orion160@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:16:32 -0500 Subject: [PATCH 3/6] Update recipes_source/recipes/tuning_guide.py Co-authored-by: Svetlana Karslioglu --- recipes_source/recipes/tuning_guide.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index fa9fcc202c7..af64d691860 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -323,7 +323,7 @@ def gelu(x): # Enable Tensor cores # ~~~~~~~~~~~~~~~~~~~~~~~ # Tensor cores are specialized hardware designed to compute matrix-matrix multiplication -# operations which neural network operation can take advantage of. +# operations, which neural network operations can take advantage of. # # Hardware tensor core operations tend to use a different floating point format # which sacrifices precision at expense of speed gains. 
From 6984b23aab1cc3c6b65f9a0fa64279768fb5010d Mon Sep 17 00:00:00 2001 From: Isaac David <61389980+orion160@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:16:44 -0500 Subject: [PATCH 4/6] Update recipes_source/recipes/tuning_guide.py Co-authored-by: Svetlana Karslioglu --- recipes_source/recipes/tuning_guide.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index af64d691860..06b7fb880ee 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -343,7 +343,7 @@ def gelu(x): # Use CUDA Graphs # ~~~~~~~~~~~~~~~~~~~~~~~ # At the time of using a GPU, work first must be launched from the CPU and -# on some cases the context switch between CPU and GPU can lead to bad resource +# in some cases the context switch between CPU and GPU can lead to bad resource # utilization. CUDA graphs are a way to keep computation within the GPU without # paying the extra cost of kernel launches and host synchronization. From 84107d90d21aa88ae5d1709d5ad525e2b4dd8510 Mon Sep 17 00:00:00 2001 From: Isaac David <61389980+orion160@users.noreply.github.com> Date: Mon, 10 Jun 2024 12:19:50 -0500 Subject: [PATCH 5/6] Update recipes_source/recipes/tuning_guide.py Co-authored-by: Svetlana Karslioglu --- recipes_source/recipes/tuning_guide.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index 06b7fb880ee..3cf781e3225 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -327,7 +327,7 @@ def gelu(x): # # Hardware tensor core operations tend to use a different floating point format # which sacrifices precision at expense of speed gains. 
-# Prior to pytorch 1.12 this was enabled by default but since this version +# Prior to PyTorch 1.12 this functionality was enabled by default but since this version # it must be explicitly set as it can conflict with some operations which do not # benefit from Tensor core computations. From 6f3ff6ff7edd6c18cfcf25d881e5ce0fddff127f Mon Sep 17 00:00:00 2001 From: Isaac David <61389980+orion160@users.noreply.github.com> Date: Mon, 10 Jun 2024 21:43:14 +0000 Subject: [PATCH 6/6] Update Tensor & CUDA graph section --- recipes_source/recipes/tuning_guide.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py index 3cf781e3225..dc1daae2584 100644 --- a/recipes_source/recipes/tuning_guide.py +++ b/recipes_source/recipes/tuning_guide.py @@ -323,21 +323,15 @@ def gelu(x): # Enable Tensor cores # ~~~~~~~~~~~~~~~~~~~~~~~ # Tensor cores are specialized hardware designed to compute matrix-matrix multiplication -# operations, which neural network operations can take advantage of. +# operations, primarily utilized in deep learning and AI workloads. Tensor cores have +# specific precision requirements which can be adjusted manually or via the Automatic +# Mixed Precision API. # -# Hardware tensor core operations tend to use a different floating point format -# which sacrifices precision at expense of speed gains. -# Prior to PyTorch 1.12 this functionality was enabled by default but since this version -# it must be explicitly set as it can conflict with some operations which do not -# benefit from Tensor core computations. 
- -## Tensor computation can be enabled "manually" modifying the matrix multiplication precision -## The default precision is "highest" which will perform the operation according to the dtype - -# precision "high" and "medium" can be hardware accelerated via tensor cores - -# Carefully consider the tradeoff between speed and precision at the moment of evaluating your models! -torch.set_float32_matmul_precision("high") +# In particular, tensor operations take advantage of lower precision workloads, +# which can be controlled via ``torch.set_float32_matmul_precision``. +# The default format is set to 'highest', which utilizes the tensor data type. +# However, PyTorch offers alternative precision settings: 'high' and 'medium'. +# These options prioritize computational speed over numerical precision. ############################################################################### # Use CUDA Graphs @@ -353,7 +347,8 @@ def gelu(x): torch.compile(m, "max-autotune") ############################################################################### -# Special care must be present when using cuda graphs as it can lead to increased memory consumption and some models might not compile. +# Support for CUDA graphs is in development, and their usage can incur increased +# device memory consumption; some models might not compile. ############################################################################### # Enable cuDNN auto-tuner