# Grokking PyTorch Intel CPU performance from first principles (Part 2)
Authors: [Min Jean Cho](https://github.com/min-jean-cho), [Jing Xu](https://github.com/jingxu10), [Mark Saroufim](https://github.com/msaroufim)
In the [Grokking PyTorch Intel CPU Performance From First Principles](https://tutorials.pytorch.kr/intermediate/torchserve_with_ipex.html) tutorial, we introduced how to tune CPU runtime configurations, how to profile them, and how to integrate them into [TorchServe](https://github.com/pytorch/serve) for optimized CPU performance.

In this tutorial, we will demonstrate how to boost performance with a memory allocator via the [Intel® Extension for PyTorch* launcher](https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md) and with optimized kernels on CPU via [Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch), and apply them to TorchServe, showing a 7.71x throughput speedup for ResNet50 and a 2.20x throughput speedup for BERT.

![](../_images/1.png)
## Prerequisites

Throughout this tutorial, we will use [Top-down Microarchitecture Analysis (TMA)](https://www.intel.com/content/www/us/en/develop/documentation/vtune-cookbook/top/methodologies/top-down-microarchitecture-analysis-method.html) to profile and show that the Back End Bound (Memory Bound, Core Bound) is often the primary bottleneck for under-optimized or under-tuned deep learning workloads, and demonstrate optimization techniques via Intel® Extension for PyTorch* for improving Back End Bound. We will use [toplev](https://github.com/andikleen/pmu-tools/wiki/toplev-manual), a tool that is part of [pmu-tools](https://github.com/andikleen/pmu-tools) built on top of [Linux perf](https://man7.org/linux/man-pages/man1/perf.1.html), for TMA.

We will also use [Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT)](https://github.com/pytorch/pytorch/issues/41001) to profile at finer granularity.
### Top-down Microarchitecture Analysis Method (TMA)

When tuning a CPU for optimal performance, it's useful to know where the bottleneck is. Most CPU cores have on-chip Performance Monitoring Units (PMUs). PMUs are dedicated pieces of logic within a CPU core that count specific hardware events as they occur on the system. Examples of these events are cache misses or branch mispredictions. PMUs are used for Top-down Microarchitecture Analysis (TMA) to identify the bottlenecks. TMA consists of hierarchical levels, as shown:

![](../_images/26.png)

The top-level (level-1) metrics are *Retiring*, *Bad Speculation*, *Front End Bound*, and *Back End Bound*. The CPU pipeline can conceptually be simplified and divided into two parts: the frontend and the backend. The *frontend* is responsible for fetching the program code and decoding it into low-level hardware operations called micro-ops (uOps). The uOps are then fed to the *backend* in a process called allocation. Once allocated, the backend is responsible for executing each uOp in an available execution unit. Completion of a uOp's execution is called *retirement*. In contrast, *bad speculation* occurs when speculatively fetched uOps are canceled before retiring, for example in the case of mispredicted branches. Each of these metrics can be broken down further in the subsequent levels to pinpoint the bottleneck.
#### Tune for the Back End Bound

The majority of untuned deep learning workloads will be Back End Bound. Resolving Back End Bound often means resolving the sources of latency that cause retirement to take longer than necessary. As shown above, Back End Bound has two sub-metrics: Core Bound and Memory Bound.

Memory Bound stalls have causes related to the memory subsystem, for example a last-level cache (LLC or L3 cache) miss causing an access to DRAM. Scaling deep learning models often requires significant compute, and high compute utilization requires that data is available when the execution units need it to execute the uOps. This requires prefetching data and reusing it in cache instead of fetching the same data multiple times from main memory, which starves the execution units while data is being returned. Throughout this tutorial, we will show that a more efficient memory allocator, operator fusion, and memory layout optimization reduce overhead on Memory Bound through better cache locality.
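One concrete example of memory layout optimization is PyTorch's `channels_last` memory format, which stores NCHW tensors in NHWC order so convolution kernels can traverse data with better locality. Below is a minimal sketch; the model and input shapes simply mirror the ResNet50 example used later in this tutorial and are not tied to any specific benchmark result here.

```python
import torch
import torchvision.models as models

# convert a convolutional model and its input to the channels_last memory format
model = models.resnet50(pretrained=False).eval()
data = torch.rand(32, 3, 224, 224)

model = model.to(memory_format=torch.channels_last)
data = data.to(memory_format=torch.channels_last)

with torch.no_grad():
    output = model(data)  # convolutions now see NHWC-contiguous data
```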
Core Bound stalls indicate sub-optimal use of the available execution units while there are no uncompleted memory accesses. For example, several general matrix-matrix multiplication (GEMM) instructions in a row competing for fused multiply-add (FMA) or dot-product (DP) execution units could cause Core Bound stalls. Key deep learning kernels, including the DP kernels, have been well optimized by the [oneDNN library](https://github.com/oneapi-src/oneDNN) (oneAPI Deep Neural Network Library), reducing overhead on Core Bound.

Operations like GEMM, convolution, and deconvolution are compute-intensive, while operations like pooling, batch normalization, and activation functions such as ReLU are memory-bound.
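As a rough, hedged illustration of this difference, the sketch below times a convolution against an element-wise ReLU on the same activation. Wall-clock time alone does not classify an op as Core Bound or Memory Bound, but it gives a feel for how much more arithmetic per byte the convolution performs.

```python
import time
import torch

x = torch.rand(32, 64, 56, 56)
conv = torch.nn.Conv2d(64, 64, kernel_size=3, padding=1)  # compute-intensive
relu = torch.nn.ReLU()                                    # memory-bound

def bench(fn, arg, iters=50):
    # simple wall-clock benchmark with a short warm-up
    for _ in range(5):
        fn(arg)
    start = time.time()
    for _ in range(iters):
        fn(arg)
    return (time.time() - start) / iters * 1000

with torch.no_grad():
    print('conv2d: {:.2f} ms/iter'.format(bench(conv, x)))
    print('relu  : {:.2f} ms/iter'.format(bench(relu, x)))
```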
### Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT)

The ITT APIs of Intel® VTune™ Profiler are a useful tool for annotating a region of your workload for tracing, so that you can profile and visualize it at the finer granularity of your annotation (OP/function/sub-function granularity). By annotating at the granularity of your PyTorch model's OPs, Intel® VTune™ Profiler's ITT enables op-level profiling. Intel® VTune™ Profiler's ITT has been integrated into the [PyTorch Autograd Profiler](https://tutorials.pytorch.kr/beginner/introyt/autogradyt_tutorial.html#autograd-profiler). [1]

1. The feature has to be explicitly enabled with `with torch.autograd.profiler.emit_itt()`.
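A minimal sketch of that usage pattern is shown below (the full exercise later in this tutorial uses the same APIs); note that the ITT annotations only show up in a trace when the script is run under Intel® VTune™ Profiler's collection.

```python
import torch
import torchvision.models as models

model = models.resnet50(pretrained=False).eval()
data = torch.rand(1, 3, 224, 224)

# emit ITT annotations for profiled ops, and mark a custom region
with torch.autograd.profiler.emit_itt():
    torch.profiler.itt.range_push('my_inference_region')
    with torch.no_grad():
        model(data)
    torch.profiler.itt.range_pop()
```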
## TorchServe with Intel® Extension for PyTorch*

[Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch) is a Python package that extends PyTorch with optimizations for an extra performance boost on Intel hardware.

Intel® Extension for PyTorch* has already been integrated into TorchServe to improve performance out of the box. [2] For custom handler scripts, we recommend adding the `intel_extension_for_pytorch` package in.

2. The feature has to be explicitly enabled by setting `ipex_enable=true` in `config.properties`.
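As a minimal, hypothetical sketch of what adding the package to a custom handler could look like (the handler class and method names below are illustrative, not TorchServe's actual base handler), the key change is calling `ipex.optimize` on the model:

```python
import torch
import intel_extension_for_pytorch as ipex  # add the package in custom handler scripts

class MyHandler:                      # hypothetical, simplified handler
    def initialize(self, model):
        model.eval()
        # let Intel(R) Extension for PyTorch* apply its operator optimizations
        self.model = ipex.optimize(model)

    def inference(self, data):
        with torch.no_grad():
            return self.model(data)
```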
Throughout this section, we will show that Back End Bound is often the primary bottleneck for under-optimized or under-tuned deep learning workloads, and demonstrate optimization techniques via Intel® Extension for PyTorch* for improving Back End Bound, which has two sub-metrics: Memory Bound and Core Bound. A more efficient memory allocator, operator fusion, and memory layout optimization improve Memory Bound. Ideally, Memory Bound can be improved to Core Bound through optimized operators and better cache locality. And key deep learning primitives, such as convolution, matrix multiplication, and dot-product, have been well optimized by Intel® Extension for PyTorch* and the oneDNN library, improving Core Bound.
### Leveraging Advanced Launcher Configuration: Memory Allocator

The memory allocator plays an important role from a performance perspective. More efficient memory usage reduces the overhead of unnecessary memory allocations and deallocations, and thus leads to faster execution. For deep learning workloads in practice, especially those running on large multi-core systems or servers like TorchServe, TCMalloc or JeMalloc can generally achieve better memory usage than the default PyTorch memory allocator, PTMalloc.
#### TCMalloc, JeMalloc, PTMalloc

Both TCMalloc and JeMalloc use thread-local caches to reduce the overhead of thread synchronization, and they reduce lock contention by using spinlocks and per-thread arenas, respectively. TCMalloc and JeMalloc reduce overhead from unnecessary memory allocation and deallocation. Both allocators also categorize memory allocations by size to reduce memory fragmentation overhead.

With the launcher, users can easily experiment with different memory allocators by choosing one of the three launcher knobs: `--enable_tcmalloc` (TCMalloc), `--enable_jemalloc` (JeMalloc), or `--use_default_allocator` (PTMalloc).
##### Exercise

Let's profile PTMalloc vs. JeMalloc.

We will use the launcher to designate the memory allocator and to bind the workload to the physical cores of the first socket, avoiding any NUMA complication so that we profile the effect of the memory allocator only.

The following example measures the average inference time of ResNet50:
```python
import torch
import torchvision.models as models
import time

model = models.resnet50(pretrained=False)
model.eval()
batch_size = 32
data = torch.rand(batch_size, 3, 224, 224)

# warm up
for _ in range(100):
    model(data)

# measure
# Intel® VTune Profiler's ITT context manager
with torch.autograd.profiler.emit_itt():
    start = time.time()
    for i in range(100):
        # Intel® VTune Profiler's ITT to annotate each step
        torch.profiler.itt.range_push('step_{}'.format(i))
        model(data)
        torch.profiler.itt.range_pop()
    end = time.time()

print('Inference took {:.2f} ms in average'.format((end-start)/100*1000))
```
Let's collect level-1 TMA metrics.

![](../_images/32.png)
Level-1 TMA shows that both PTMalloc and JeMalloc are bound by the backend: more than half of the execution time was stalled in the backend. Let's go one level deeper.

![](../_images/41.png)
Level-2 TMA shows that the Back End Bound was caused by Memory Bound. Let's go one level deeper.

![](../_images/51.png)
Most of the metrics under Memory Bound identify which level of the memory hierarchy, from the L1 cache to main memory, is the bottleneck. A hotspot bounded at a given level indicates that most of the data was being retrieved from that cache or memory level, so optimizations should focus on moving data closer to the core. Level-3 TMA shows that PTMalloc was bottlenecked by DRAM Bound. JeMalloc, on the other hand, was bottlenecked by L1 Bound: JeMalloc moved data closer to the core, resulting in faster execution.

Let's look at the Intel® VTune Profiler ITT trace. In the example script, we have annotated each `step_x` of the inference loop.

![](../_images/61.png)

Each step is traced in the timeline graph. The duration of model inference on the last step (step_99) decreased from 304.308 ms to 261.843 ms.
##### Exercise with TorchServe

Let's profile PTMalloc vs. JeMalloc with TorchServe.

We will use [TorchServe apache-bench benchmarking](https://github.com/pytorch/serve/tree/master/benchmarks#benchmarking-with-apache-bench) with ResNet50 FP32, batch size 32, concurrency 32, and 8960 requests. All other parameters are the same as the [default parameters](https://github.com/pytorch/serve/tree/master/benchmarks#benchmark-parameters).

As in the previous exercise, we will use the launcher to designate the memory allocator and to bind the workload to the physical cores of the first socket. To do so, users simply need to add a few lines in [config.properties](https://pytorch.org/serve/configuration.html#config-properties-file):
PTMalloc

```
cpu_launcher_enable=true
cpu_launcher_args=--node_id 0 --use_default_allocator
```

JeMalloc

```
cpu_launcher_enable=true
cpu_launcher_args=--node_id 0 --enable_jemalloc
```
Let's collect level-1 TMA metrics.

![](../_images/71.png)

Let's go one level deeper.

![](../_images/81.png)
Let's use Intel® VTune Profiler ITT to annotate the [TorchServe inference scope](https://github.com/pytorch/serve/blob/master/ts/torch_handler/base_handler.py#L188) so we can profile at inference-level granularity. Because the [TorchServe architecture](https://github.com/pytorch/serve/blob/master/docs/internals.md#torchserve-architecture) consists of several sub-components, including the Java frontend that handles requests/responses and the Python backend that runs the actual inference on the models, it is helpful to use Intel® VTune Profiler ITT to limit the collection of trace data to the inference level.

![](../_images/9.png)
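A simplified sketch of the idea is shown below; the `handle` function is a hypothetical stand-in for a handler's inference path, not TorchServe's actual `base_handler` code, and only the ITT range push/pop around the forward call reflects the annotation described above.

```python
import torch

def handle(model, input_batch, request_id):
    # hypothetical handler entry point; only the inference call is annotated
    torch.profiler.itt.range_push('inference_{}'.format(request_id))
    with torch.no_grad():
        output = model(input_batch)
    torch.profiler.itt.range_pop()
    return output
```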
Each inference call is traced in the timeline graph. The duration of the last model inference decreased from 561.688 ms to 251.287 ms, a 2.2x speedup.

![](../_images/101.png)

The timeline graph can be expanded to see op-level profiling results. The duration of `aten::conv2d` decreased from 16.401 ms to 6.392 ms, a 2.6x speedup.

In this section, we have demonstrated that JeMalloc can give better performance than the default PyTorch memory allocator, PTMalloc, because its efficient thread-local caches improve the Back End Bound.
### Intel® Extension for PyTorch*

The three major [Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch) optimization techniques (Operator, Graph, Runtime) are as shown:
**Intel® Extension for PyTorch* Optimization Techniques**

| Operator | Graph | Runtime |
|----------|-------|---------|
| Vectorization and multi-threading | Constant folding to reduce compute | Thread affinitization |
| Low-precision BF16/INT8 compute | Op fusion for better cache locality | Memory buffer pooling |
| Data layout optimization for better cache locality | | GPU runtime |
| | | Launcher |
#### Operator Optimization

Optimized operators and kernels are registered through the PyTorch dispatching mechanism. These operators and kernels are accelerated by the native vectorization and matrix-computation features of Intel hardware. During execution, Intel® Extension for PyTorch* intercepts invocations of ATen operators and replaces the original ones with the optimized ones. Popular operators like Convolution and Linear have been optimized in Intel® Extension for PyTorch*.
##### Exercise

Let's profile the optimized operators with Intel® Extension for PyTorch*. We will compare running with and without the code changes marked below.

As in the previous exercises, we will bind the workload to the physical cores of the first socket.
```python
import torch

class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv = torch.nn.Conv2d(16, 33, 3, stride=2)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        return x

model = Model()
model.eval()
data = torch.rand(20, 16, 50, 100)

#################### code changes ####################
import intel_extension_for_pytorch as ipex
model = ipex.optimize(model)
######################################################
print(model)
```
<p>The model consists of two operations, Conv2d and ReLU. By printing the model object, we get the following output.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/11.png"><img alt="../_images/11.png" src="../_images/11.png" style="width: 60%;" /></a>
</div>
<p>Let's collect level-1 TMA metrics.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/121.png"><img alt="../_images/121.png" src="../_images/121.png" style="width: 100%;" /></a>
</div>
<p>Notice that the Back End Bound is reduced from 68.9% to 38.5%, a 1.8x improvement.</p>
<p>Additionally, let's profile with PyTorch Profiler.</p>
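<p>For reference, the PyTorch Profiler results shown below can be collected with a short profiling run. The following is a minimal sketch, not the tutorial's exact script; it assumes the <em>model</em> and <em>data</em> objects defined in the code block above.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Minimal sketch: profile one forward pass on CPU and print per-op totals.
import torch
from torch.profiler import profile, ProfilerActivity

with torch.no_grad():
    with profile(activities=[ProfilerActivity.CPU]) as prof:
        model(data)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
</pre></div>
</div>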
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/131.png"><img alt="../_images/131.png" src="../_images/131.png" style="width: 100%;" /></a>
</div>
<p>Notice that the CPU time is reduced from 851 us to 310 us, a 2.7x speedup.</p>
</div>
</div>
<div class="section" id="graph-optimization">
<h4>Graph Optimization<a class="headerlink" href="#graph-optimization" title="Permalink to this heading">¶</a></h4>
<p>It is highly recommended that users take advantage of Intel® Extension for PyTorch* with <a class="reference external" href="https://pytorch.org/docs/stable/jit.html">TorchScript</a> for further graph optimizations. To optimize performance further with TorchScript, Intel® Extension for PyTorch* supports oneDNN fusion of frequently used FP32/BF16 operator patterns, such as Conv2D+ReLU and Linear+ReLU, to reduce operator/kernel invocation overhead and improve cache locality. Some operator fusions keep temporary calculations, data type conversions, and data layouts in cache for better locality. For INT8, Intel® Extension for PyTorch* has built-in quantization recipes that deliver good statistical accuracy for popular DL workloads including CNN, NLP, and recommendation models. The quantized model is then optimized with oneDNN fusion support.</p>
<div class="section" id="id3">
<h5>Exercise<a class="headerlink" href="#id3" title="Permalink to this heading">¶</a></h5>
<p>Let's profile FP32 graph optimization with TorchScript.</p>
<p>As in the previous exercises, we will bind the workload to physical cores of the first socket.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="k">class</span> <span class="nc">Model</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Model</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">conv</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Conv2d</span><span class="p">(</span><span class="mi">16</span><span class="p">,</span> <span class="mi">33</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="n">stride</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">relu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">ReLU</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">conv</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="k">return</span> <span class="n">x</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">20</span><span class="p">,</span> <span class="mi">16</span><span class="p">,</span> <span class="mi">50</span><span class="p">,</span> <span class="mi">100</span><span class="p">)</span>
<span class="c1">#################### code changes ####################</span>
<span class="kn">import</span> <span class="nn">intel_extension_for_pytorch</span> <span class="k">as</span> <span class="nn">ipex</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">ipex</span><span class="o">.</span><span class="n">optimize</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="c1">######################################################</span>
<span class="c1"># torchscript</span>
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">():</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">trace</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">data</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">freeze</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
</pre></div>
</div>
<p>Let's collect level-1 TMA metrics.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/141.png"><img alt="../_images/141.png" src="../_images/141.png" style="width: 100%;" /></a>
</div>
<p>Notice that the Back End Bound is reduced from 67.1% to 37.5%, a 1.8x improvement.</p>
<p>Additionally, let's profile with PyTorch Profiler.</p>
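<p>A minimal profiling sketch is shown below, assuming the traced and frozen <em>model</em> and the <em>data</em> tensor from the code block above. A few warm-up iterations are included so that the TorchScript profiling executor and the oneDNN fusion passes have already run before measuring.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Minimal sketch: warm up the frozen TorchScript model, then profile one forward pass.
import torch
from torch.profiler import profile, ProfilerActivity

with torch.no_grad():
    for _ in range(3):          # warm-up iterations let the JIT apply graph optimizations
        model(data)
    with profile(activities=[ProfilerActivity.CPU]) as prof:
        model(data)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
</pre></div>
</div>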
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/151.png"><img alt="../_images/151.png" src="../_images/151.png" style="width: 100%;" /></a>
</div>
<p>Notice that with Intel® Extension for PyTorch* the Conv + ReLU operators are fused, and the CPU time is reduced from 803 us to 248 us, a 3.2x speedup. The oneDNN eltwise post-op enables fusing a primitive with an elementwise primitive. This is one of the most popular kinds of fusion: an eltwise operation (typically an activation function such as ReLU) with a preceding convolution or inner product. Have a look at the oneDNN verbose log shown in the next section.</p>
</div>
</div>
<div class="section" id="channels-last-memory-format">
<h4>Channels Last Memory Format<a class="headerlink" href="#channels-last-memory-format" title="Permalink to this heading">¶</a></h4>
<p>When invoking <em>ipex.optimize</em> on the model, Intel® Extension for PyTorch* automatically converts the model to the optimized memory format, channels last. Channels last is a memory format that is more friendly to Intel Architecture. Compared to the PyTorch default channels first NCHW (batch, channels, height, width) memory format, the channels last NHWC (batch, height, width, channels) memory format generally accelerates convolutional neural networks with better cache locality.</p>
<p>One thing to note is that converting the memory format is expensive, so it is better to convert the memory format once prior to deployment and to keep memory format conversions to a minimum during deployment. As the data propagates through the model's layers, the channels last memory format is preserved through consecutive channels-last-supported layers (for example, Conv2d -> ReLU -> Conv2d), and conversions are only made between layers that do not support channels last. See <a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/onednn-developer-guide-and-reference/top/programming-model/memory-format-propagation.html">Memory Format Propagation</a> for more details.</p>
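<p>For illustration, the one-time conversion described above can be done with standard PyTorch APIs. The following is a minimal sketch with a single convolution layer; by default, <em>ipex.optimize</em> performs an equivalent conversion automatically.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import torch

# One-time conversion of the weights to channels last, done prior to deployment.
conv = torch.nn.Conv2d(16, 33, 3, stride=2).eval()
conv = conv.to(memory_format=torch.channels_last)

# Convert the input once; the format is then preserved through channels-last-supported layers.
data = torch.rand(20, 16, 50, 100).to(memory_format=torch.channels_last)

with torch.no_grad():
    out = conv(data)
print(out.is_contiguous(memory_format=torch.channels_last))  # expected: True
</pre></div>
</div>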
<div class="section" id="id4">
<h5>Exercise<a class="headerlink" href="#id4" title="Permalink to this heading">¶</a></h5>
<p>Let's demonstrate channels last optimization.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="k">class</span> <span class="nc">Model</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Model</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">conv</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Conv2d</span><span class="p">(</span><span class="mi">16</span><span class="p">,</span> <span class="mi">33</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="n">stride</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">relu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">ReLU</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">conv</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="k">return</span> <span class="n">x</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">20</span><span class="p">,</span> <span class="mi">16</span><span class="p">,</span> <span class="mi">50</span><span class="p">,</span> <span class="mi">100</span><span class="p">)</span>
<span class="kn">import</span> <span class="nn">intel_extension_for_pytorch</span> <span class="k">as</span> <span class="nn">ipex</span>
<span class="c1">############################### code changes ###############################</span>
<span class="n">ipex</span><span class="o">.</span><span class="n">disable_auto_channels_last</span><span class="p">()</span> <span class="c1"># omit this line for channels_last (default)</span>
<span class="c1">############################################################################</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">ipex</span><span class="o">.</span><span class="n">optimize</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">():</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">trace</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">data</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">freeze</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
</pre></div>
</div>
<p>We will use <a class="reference external" href="https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html">oneDNN verbose mode</a>, a tool that helps collect information at the oneDNN graph level, such as operator fusions and the kernel execution time spent executing oneDNN primitives. For more information, refer to the <a class="reference external" href="https://oneapi-src.github.io/oneDNN/index.html">oneDNN Documentation</a>.</p>
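<p>The verbose output below can be reproduced with a sketch like the following. It assumes the <em>DNNL_VERBOSE</em> environment variable (documented by oneDNN) is set before the first oneDNN primitive executes; it can equally be set in the shell when launching the script.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import os
os.environ["DNNL_VERBOSE"] = "1"   # enable oneDNN verbose logging before any oneDNN call

import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Sequential(torch.nn.Conv2d(16, 33, 3, stride=2), torch.nn.ReLU()).eval()
data = torch.rand(20, 16, 50, 100)

model = ipex.optimize(model)
with torch.no_grad():
    model = torch.jit.trace(model, data)
    model = torch.jit.freeze(model)
    model(data)   # each executed oneDNN primitive is logged to stdout
</pre></div>
</div>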
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/161.png"><img alt="../_images/161.png" src="../_images/161.png" style="width: 15%;" /></a>
</div>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/171.png"><img alt="../_images/171.png" src="../_images/171.png" style="width: 100%;" /></a>
</div>
<p>Above is the oneDNN verbose output with channels first. We can verify that the weights and data are reordered, the computation is performed, and finally the output is reordered back.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/181.png"><img alt="../_images/181.png" src="../_images/181.png" style="width: 80%;" /></a>
</div>
<p>Above is the oneDNN verbose output with channels last. We can verify that the channels last memory format avoids unnecessary reorders.</p>
</div>
</div>
</div>
<div class="section" id="performance-boost-with-intel-extension-for-pytorch">
<h3>Performance Boost with Intel® Extension for PyTorch*<a class="headerlink" href="#performance-boost-with-intel-extension-for-pytorch" title="Permalink to this heading">¶</a></h3>
<p>The chart below summarizes the performance boost of TorchServe with Intel® Extension for PyTorch* for ResNet50 and BERT-base-uncased.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/191.png"><img alt="../_images/191.png" src="../_images/191.png" style="width: 100%;" /></a>
</div>
</div>
<div class="section" id="id5">
<h3>Exercise with TorchServe<a class="headerlink" href="#id5" title="Permalink to this heading">¶</a></h3>
<p>Let's profile Intel® Extension for PyTorch* optimizations with TorchServe.</p>
<p>We will use <a class="reference external" href="https://github.com/pytorch/serve/tree/master/benchmarks#benchmarking-with-apache-bench">TorchServe apache-bench benchmarking</a> with ResNet50 FP32 TorchScript, batch size 32, concurrency 32, and 8960 requests. All other parameters are the same as the <a class="reference external" href="https://github.com/pytorch/serve/tree/master/benchmarks#benchmark-parameters">default parameters</a>.</p>
<p>As in the previous exercise, we will use the launcher to bind the workload to the physical cores of the first socket. To do so, the user simply needs to add a few lines to <a class="reference external" href="https://github.com/pytorch/serve/tree/master/benchmarks#benchmark-parameters">config.properties</a>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">cpu_launcher_enable</span><span class="o">=</span><span class="n">true</span>
<span class="n">cpu_launcher_args</span><span class="o">=--</span><span class="n">node_id</span> <span class="mi">0</span>
</pre></div>
</div>
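<p>For the Intel® Extension for PyTorch* runs, the extension also needs to be enabled in TorchServe. A minimal <em>config.properties</em> sketch is shown below; it assumes the <em>ipex_enable</em> property documented in the TorchServe repository and that <em>intel_extension_for_pytorch</em> is installed in the serving environment.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>ipex_enable=true
cpu_launcher_enable=true
cpu_launcher_args=--node_id 0
</pre></div>
</div>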
<p>Let's collect level-1 TMA metrics.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/20.png"><img alt="../_images/20.png" src="../_images/20.png" style="width: 100%;" /></a>
</div>
<p>Level-1 TMA shows that both runs are bound by the back end. As discussed earlier, the majority of untuned deep learning workloads will be Back End Bound. Notice that the Back End Bound is reduced from 70.0% to 54.1%. Let's go one level deeper.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/211.png"><img alt="../_images/211.png" src="../_images/211.png" style="width: 100%;" /></a>
</div>
<p>As discussed earlier, Back End Bound has two submetrics: Memory Bound and Core Bound. Memory Bound indicates that the workload is under-optimized or under-utilized; ideally, memory-bound operations can be improved to core-bound by optimizing the ops and improving cache locality. Level-2 TMA shows that the Back End Bound shifted from Memory Bound to Core Bound. Let's go one level deeper.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/221.png"><img alt="../_images/221.png" src="../_images/221.png" style="width: 100%;" /></a>
</div>
<p>Scaling deep learning models for production on a model serving framework like TorchServe requires high compute utilization. This requires that data be available through prefetching and reused in cache when the execution units need it to execute the uOps. Level-3 TMA shows that the Back End Memory Bound shifted from DRAM Bound to Core Bound.</p>
<p>As in the previous exercise with TorchServe, let's use Intel® VTune™ Profiler's ITT to annotate the <a class="reference external" href="https://github.com/pytorch/serve/blob/master/ts/torch_handler/base_handler.py#L188">TorchServe inference scope</a> to profile at inference-level granularity.</p>
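<p>For reference, the sketch below shows how an inference scope can be annotated with ITT in plain PyTorch; TorchServe's handler does this internally. The <em>model_inference</em> range name is illustrative, and the <em>torch.profiler.itt</em> helpers are assumed to be available in the installed PyTorch build.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import torch

def annotated_inference(model, data):
    # Emit ITT events for ATen ops so VTune can show op-level detail in the timeline.
    with torch.autograd.profiler.emit_itt():
        if torch.profiler.itt.is_available():
            torch.profiler.itt.range_push("model_inference")  # illustrative range name
        try:
            with torch.no_grad():
                return model(data)
        finally:
            if torch.profiler.itt.is_available():
                torch.profiler.itt.range_pop()
</pre></div>
</div>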
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/231.png"><img alt="../_images/231.png" src="../_images/231.png" style="width: 100%;" /></a>
</div>
<p>Each inference call is traced in the timeline graph. The duration of the last inference call decreased from 215.731 ms to 95.634 ms, a 2.3x speedup.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/241.png"><img alt="../_images/241.png" src="../_images/241.png" style="width: 100%;" /></a>
</div>
<p>The timeline graph can be expanded to see op-level profiling results. Notice that Conv + ReLU has been fused, and the duration decreased from 6.393 ms + 1.731 ms to 3.408 ms, a 2.4x speedup.</p>
</div>
</div>
<div class="section" id="conclusion">
<h2>Conclusion<a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
<p>In this tutorial, we have used the Top-down Microarchitecture Analysis (TMA) method and Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT) to demonstrate that:</p>
<ul class="simple">
<li><p>The primary bottleneck of under-optimized or under-tuned deep learning workloads is often Back End Bound, which has two submetrics, Memory Bound and Core Bound.</p></li>
<li><p>A more efficient memory allocator, operator fusion, and memory layout optimization by Intel® Extension for PyTorch* improve Memory Bound.</p></li>
<li><p>Key deep learning primitives, such as convolution, matrix multiplication, and dot-product, have been well optimized by Intel® Extension for PyTorch* and the oneDNN library, improving Core Bound.</p></li>
<li><p>Intel® Extension for PyTorch* has been integrated into TorchServe with an ease-of-use API.</p></li>
<li><p>TorchServe with Intel® Extension for PyTorch* shows 7.71x throughput speedup for ResNet50, and 2.20x throughput speedup for BERT.</p></li>
</ul>
</div>
<div class="section" id="related-readings">
<h2>Related Readings<a class="headerlink" href="#related-readings" title="Permalink to this heading">¶</a></h2>
<p><a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/vtune-cookbook/top/methodologies/top-down-microarchitecture-analysis-method.html">Top-down Microarchitecture Analysis Method</a></p>
<p><a class="reference external" href="https://easyperf.net/blog/2019/02/09/Top-Down-performance-analysis-methodology">Top-Down performance analysis methodology</a></p>
<p><a class="reference external" href="https://medium.com/pytorch/accelerating-pytorch-with-intel-extension-for-pytorch-3aef51ea3722">Accelerating PyTorch with Intelยฎ Extension for PyTorch*</a></p>
</div>
<div class="section" id="acknowledgement">
<h2>Acknowledgement<a class="headerlink" href="#acknowledgement" title="Permalink to this heading">¶</a></h2>
<p>We would like to thank Ashok Emani (Intel) and Jiong Gong (Intel) for their immense guidance and support, and thorough feedback and reviews throughout many steps of this tutorial. We would also like to thank Hamid Shojanazeri (Meta) and Li Ning (AWS) for their helpful feedback in code review and the tutorial.</p>
</div>
</div>
</article>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="nvfuser_intro_tutorial.html" class="btn btn-neutral float-right" title="Getting Started - Accelerate Your Scripts with nvFuser" accesskey="n" rel="next">Next <img src="../_static/images/chevron-right-orange.svg" class="next-page"></a>
<a href="torchserve_with_ipex.html" class="btn btn-neutral" title="Grokking PyTorch Intel CPU performance from first principles" accesskey="p" rel="prev"><img src="../_static/images/chevron-right-orange.svg" class="previous-page"> Previous</a>
</div>
<hr class="community-hr hr-top" />
<div class="community-container">
<div class="community-prompt">๋ ๊ถ๊ธํ์๊ฑฐ๋ ๊ฐ์ ํ ๋ด์ฉ์ด ์์ผ์ ๊ฐ์? ์ปค๋ฎค๋ํฐ์ ์ฐธ์ฌํด๋ณด์ธ์!</div>
<div class="community-link"><a href="https://discuss.pytorch.kr/" aria-label="PyTorchKoreaCommunity">ํ๊ตญ์ด ์ปค๋ฎค๋ํฐ ๋ฐ๋ก๊ฐ๊ธฐ</a></div>
</div>
<hr class="community-hr hr-bottom"/>
<hr class="rating-hr hr-top" />
<div class="rating-container">
<div class="rating-prompt">์ด ํํ ๋ฆฌ์ผ์ด ์ด๋ ์
จ๋์? ํ๊ฐํด์ฃผ์๋ฉด ์ดํ ๊ฐ์ ์ ์ฐธ๊ณ ํ๊ฒ ์ต๋๋ค! :)</div>
<div class="stars-outer">
<i class="far fa-star" title="1 Star" data-behavior="tutorial-rating" data-count="1"></i>
<i class="far fa-star" title="2 Stars" data-behavior="tutorial-rating" data-count="2"></i>
<i class="far fa-star" title="3 Stars" data-behavior="tutorial-rating" data-count="3"></i>
<i class="far fa-star" title="4 Stars" data-behavior="tutorial-rating" data-count="4"></i>
<i class="far fa-star" title="5 Stars" data-behavior="tutorial-rating" data-count="5"></i>
</div>
</div>
<hr class="rating-hr hr-bottom"/>
<div role="contentinfo">
<p>
© Copyright 2018-2024, PyTorch &amp; PyTorch Korea User Group.
</p>
</div>
<div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</div>
</footer>
</div>
<script>
if((window.location.href.indexOf("/prototype/")!= -1) && (window.location.href.indexOf("/prototype/prototype_index")< 1))
{
var div = '<div class="admonition note"><p class="admonition-title">Note</p><p><i class="fa fa-flask" aria-hidden="true"> </i> This tutorial describes prototype features. Prototype features are generally provided for feedback and testing, and are not available in binaries distributed on PyPI or Conda without a runtime flag.</p></div>'
document.getElementById("pytorch-article").insertAdjacentHTML('afterBegin', div)
}
</script>
</div>
<div class="pytorch-content-right" id="pytorch-content-right">
<div class="pytorch-right-menu" id="pytorch-right-menu">
<div class="pytorch-side-scroll" id="pytorch-side-scroll-right">
<ul>
<li><a class="reference internal" href="#">Grokking PyTorch Intel CPU performance from first principles (Part 2)</a><ul>
<li><a class="reference internal" href="#prerequisites">Prerequisites</a><ul>
<li><a class="reference internal" href="#top-down-microarchitecture-analysis-method-tma">Top-down Microarchitecture Analysis Method (TMA)</a><ul>
<li><a class="reference internal" href="#tune-for-the-back-end-bound">Tune for the Back End Bound</a></li>
</ul>
</li>
<li><a class="reference internal" href="#intel-vtune-profiler-s-instrumentation-and-tracing-technology-itt">Intelยฎ VTuneโข Profilerโs Instrumentation and Tracing Technology (ITT)</a></li>
</ul>
</li>
<li><a class="reference internal" href="#torchserve-with-intel-extension-for-pytorch">TorchServe with Intelยฎ Extension for PyTorch*</a><ul>
<li><a class="reference internal" href="#leveraging-advanced-launcher-configuration-memory-allocator">Leveraging Advanced Launcher Configuration: Memory Allocator</a><ul>
<li><a class="reference internal" href="#tcmalloc-jemalloc-ptmalloc">TCMalloc, JeMalloc, PTMalloc</a><ul>
<li><a class="reference internal" href="#exercise">Exercise</a></li>
<li><a class="reference internal" href="#exercise-with-torchserve">Exercise with TorchServe</a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#id1">Intelยฎ Extension for PyTorch*</a><ul>
<li><a class="reference internal" href="#operator-optimization">Operator Optimization</a><ul>
<li><a class="reference internal" href="#id2">Exercise</a></li>
</ul>
</li>
<li><a class="reference internal" href="#graph-optimization">Graph Optimization</a><ul>
<li><a class="reference internal" href="#id3">Exercise</a></li>
</ul>
</li>
<li><a class="reference internal" href="#channels-last-memory-format">Channels Last Memory Format</a><ul>
<li><a class="reference internal" href="#id4">Exercise</a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#performance-boost-with-intel-extension-for-pytorch">Performance Boost with Intelยฎ Extension for PyTorch*</a></li>
<li><a class="reference internal" href="#id5">Exercise with TorchServe</a></li>
</ul>
</li>
<li><a class="reference internal" href="#conclusion">Conclusion</a></li>
<li><a class="reference internal" href="#related-readings">Related Readings</a></li>
<li><a class="reference internal" href="#acknowledgement">Acknowledgement</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</section>
</div>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script src="../_static/translations.js"></script>
<script src="../_static/katex.min.js"></script>
<script src="../_static/auto-render.min.js"></script>
<script src="../_static/katex_autorenderer.js"></script>
<script src="../_static/design-tabs.js"></script>
<script type="text/javascript" src="../_static/js/vendor/popper.min.js"></script>
<script type="text/javascript" src="../_static/js/vendor/bootstrap.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/list.js/1.5.0/list.min.js"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script>
// Helper function to make it easier to call dataLayer.push()
function gtag(){window.dataLayer.push(arguments);}
//add microsoft link
if(window.location.href.indexOf("/beginner/basics/")!= -1)
{
var url="https://docs.microsoft.com/learn/paths/pytorch-fundamentals/?wt.mc_id=aiml-7486-cxa";
switch(window.location.pathname.split("/").pop().replace('.html',''))
{
case"quickstart_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/9-quickstart?WT.mc_id=aiml-7486-cxa";
break;
case"tensorqs_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/2-tensors?WT.mc_id=aiml-7486-cxa";
break;
case"data_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/3-data?WT.mc_id=aiml-7486-cxa";
break;
case"transforms_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/4-transforms?WT.mc_id=aiml-7486-cxa";
break;
case"buildmodel_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/5-model?WT.mc_id=aiml-7486-cxa";
break;
case"autogradqs_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/6-autograd?WT.mc_id=aiml-7486-cxa";