# Grokking PyTorch Intel CPU performance from first principles (Part 2)
Authors: [Min Jean Cho](https://github.com/min-jean-cho), [Jing Xu](https://github.com/jingxu10), [Mark Saroufim](https://github.com/msaroufim)
In the [Grokking PyTorch Intel CPU Performance From First Principles](https://tutorials.pytorch.kr/intermediate/torchserve_with_ipex.html) tutorial, we introduced how to tune CPU runtime configurations, how to profile them, and how to integrate them into [TorchServe](https://github.com/pytorch/serve) for optimized CPU performance.

In this tutorial, we will demonstrate how to boost performance with a memory allocator via the [Intel® Extension for PyTorch* launcher](https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md) and with optimized kernels on CPU via [Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch), and apply them to TorchServe, showing a 7.71x throughput speedup for ResNet50 and a 2.20x throughput speedup for BERT.

![](../_images/1.png)
## Prerequisites

Throughout this tutorial, we will use [Top-down Microarchitecture Analysis (TMA)](https://www.intel.com/content/www/us/en/develop/documentation/vtune-cookbook/top/methodologies/top-down-microarchitecture-analysis-method.html) to profile and show that the Back End Bound (Memory Bound, Core Bound) is often the primary bottleneck for under-optimized or under-tuned deep learning workloads, and demonstrate optimization techniques via Intel® Extension for PyTorch* for improving Back End Bound. We will use [toplev](https://github.com/andikleen/pmu-tools/wiki/toplev-manual), a tool that is part of [pmu-tools](https://github.com/andikleen/pmu-tools) built on top of [Linux perf](https://man7.org/linux/man-pages/man1/perf.1.html), for TMA.

We will also use [Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT)](https://github.com/pytorch/pytorch/issues/41001) to profile at finer granularity.
### Top-down Microarchitecture Analysis Method (TMA)

When tuning a CPU for optimal performance, it's useful to know where the bottleneck is. Most CPU cores have on-chip Performance Monitoring Units (PMUs). PMUs are dedicated pieces of logic within a CPU core that count specific hardware events as they occur on the system. Examples of these events are cache misses or branch mispredictions. PMUs are used for Top-down Microarchitecture Analysis (TMA) to identify the bottlenecks. TMA consists of hierarchical levels, as shown:

![](../_images/26.png)

The top-level (level-1) metrics are *Retiring*, *Bad Speculation*, *Front End Bound*, and *Back End Bound*. The CPU pipeline can conceptually be simplified and divided into two parts: the frontend and the backend. The *frontend* is responsible for fetching the program code and decoding it into low-level hardware operations called micro-ops (uOps). The uOps are then fed to the *backend* in a process called allocation. Once allocated, the backend is responsible for executing each uOp in an available execution unit. Completion of a uOp's execution is called *retirement*. In contrast, *bad speculation* occurs when speculatively fetched uOps are canceled before retiring, for example in the case of mispredicted branches. Each of these metrics can be broken down further in the subsequent levels to pinpoint the bottleneck.
#### Tune for the Back End Bound

The majority of untuned deep learning workloads will be Back End Bound. Resolving Back End Bound often means resolving the sources of latency that cause retirement to take longer than necessary. As shown above, Back End Bound has two sub-metrics: Core Bound and Memory Bound.

Memory Bound stalls have causes related to the memory subsystem, for example a last-level cache (LLC or L3 cache) miss causing an access to DRAM. Scaling deep learning models often requires significant compute, and high compute utilization requires that data is available when the execution units need it to execute the uOps. This requires prefetching data and reusing it in cache instead of fetching the same data multiple times from main memory, which starves the execution units while data is being returned. Throughout this tutorial, we will show that a more efficient memory allocator, operator fusion, and memory layout optimization reduce overhead on Memory Bound through better cache locality.
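One concrete example of memory layout optimization is PyTorch's `channels_last` memory format, which stores NCHW tensors in NHWC order so convolution kernels can traverse data with better locality. Below is a minimal sketch; the model and input shapes simply mirror the ResNet50 example used later in this tutorial and are not tied to any specific benchmark result here.

```python
import torch
import torchvision.models as models

# convert a convolutional model and its input to the channels_last memory format
model = models.resnet50(pretrained=False).eval()
data = torch.rand(32, 3, 224, 224)

model = model.to(memory_format=torch.channels_last)
data = data.to(memory_format=torch.channels_last)

with torch.no_grad():
    output = model(data)  # convolutions now see NHWC-contiguous data
```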
Core Bound stalls indicate sub-optimal use of the available execution units while there are no uncompleted memory accesses. For example, several general matrix-matrix multiplication (GEMM) instructions in a row competing for fused multiply-add (FMA) or dot-product (DP) execution units could cause Core Bound stalls. Key deep learning kernels, including the DP kernels, have been well optimized by the [oneDNN library](https://github.com/oneapi-src/oneDNN) (oneAPI Deep Neural Network Library), reducing overhead on Core Bound.

Operations like GEMM, convolution, and deconvolution are compute-intensive, while operations like pooling, batch normalization, and activation functions such as ReLU are memory-bound.
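As a rough, hedged illustration of this difference, the sketch below times a convolution against an element-wise ReLU on the same activation. Wall-clock time alone does not classify an op as Core Bound or Memory Bound, but it gives a feel for how much more arithmetic per byte the convolution performs.

```python
import time
import torch

x = torch.rand(32, 64, 56, 56)
conv = torch.nn.Conv2d(64, 64, kernel_size=3, padding=1)  # compute-intensive
relu = torch.nn.ReLU()                                    # memory-bound

def bench(fn, arg, iters=50):
    # simple wall-clock benchmark with a short warm-up
    for _ in range(5):
        fn(arg)
    start = time.time()
    for _ in range(iters):
        fn(arg)
    return (time.time() - start) / iters * 1000

with torch.no_grad():
    print('conv2d: {:.2f} ms/iter'.format(bench(conv, x)))
    print('relu  : {:.2f} ms/iter'.format(bench(relu, x)))
```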
### Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT)

The ITT APIs of Intel® VTune™ Profiler are a useful tool for annotating a region of your workload for tracing, so that you can profile and visualize it at the finer granularity of your annotation (OP/function/sub-function granularity). By annotating at the granularity of your PyTorch model's OPs, Intel® VTune™ Profiler's ITT enables op-level profiling. Intel® VTune™ Profiler's ITT has been integrated into the [PyTorch Autograd Profiler](https://tutorials.pytorch.kr/beginner/introyt/autogradyt_tutorial.html#autograd-profiler). [1]

1. The feature has to be explicitly enabled with `with torch.autograd.profiler.emit_itt()`.
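A minimal sketch of that usage pattern is shown below (the full exercise later in this tutorial uses the same APIs); note that the ITT annotations only show up in a trace when the script is run under Intel® VTune™ Profiler's collection.

```python
import torch
import torchvision.models as models

model = models.resnet50(pretrained=False).eval()
data = torch.rand(1, 3, 224, 224)

# emit ITT annotations for profiled ops, and mark a custom region
with torch.autograd.profiler.emit_itt():
    torch.profiler.itt.range_push('my_inference_region')
    with torch.no_grad():
        model(data)
    torch.profiler.itt.range_pop()
```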
## TorchServe with Intel® Extension for PyTorch*

[Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch) is a Python package that extends PyTorch with optimizations for an extra performance boost on Intel hardware.

Intel® Extension for PyTorch* has already been integrated into TorchServe to improve performance out of the box. [2] For custom handler scripts, we recommend adding the `intel_extension_for_pytorch` package in.

2. The feature has to be explicitly enabled by setting `ipex_enable=true` in `config.properties`.
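As a minimal, hypothetical sketch of what adding the package to a custom handler could look like (the handler class and method names below are illustrative, not TorchServe's actual base handler), the key change is calling `ipex.optimize` on the model:

```python
import torch
import intel_extension_for_pytorch as ipex  # add the package in custom handler scripts

class MyHandler:                      # hypothetical, simplified handler
    def initialize(self, model):
        model.eval()
        # let Intel(R) Extension for PyTorch* apply its operator optimizations
        self.model = ipex.optimize(model)

    def inference(self, data):
        with torch.no_grad():
            return self.model(data)
```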
Throughout this section, we will show that Back End Bound is often the primary bottleneck for under-optimized or under-tuned deep learning workloads, and demonstrate optimization techniques via Intel® Extension for PyTorch* for improving Back End Bound, which has two sub-metrics: Memory Bound and Core Bound. A more efficient memory allocator, operator fusion, and memory layout optimization improve Memory Bound. Ideally, Memory Bound can be improved to Core Bound through optimized operators and better cache locality. And key deep learning primitives, such as convolution, matrix multiplication, and dot-product, have been well optimized by Intel® Extension for PyTorch* and the oneDNN library, improving Core Bound.
### Leveraging Advanced Launcher Configuration: Memory Allocator

The memory allocator plays an important role from a performance perspective. More efficient memory usage reduces the overhead of unnecessary memory allocations and deallocations, and thus leads to faster execution. For deep learning workloads in practice, especially those running on large multi-core systems or servers like TorchServe, TCMalloc or JeMalloc can generally achieve better memory usage than the default PyTorch memory allocator, PTMalloc.
#### TCMalloc, JeMalloc, PTMalloc

Both TCMalloc and JeMalloc use thread-local caches to reduce the overhead of thread synchronization, and they reduce lock contention by using spinlocks and per-thread arenas, respectively. TCMalloc and JeMalloc reduce overhead from unnecessary memory allocation and deallocation. Both allocators also categorize memory allocations by size to reduce memory fragmentation overhead.

With the launcher, users can easily experiment with different memory allocators by choosing one of the three launcher knobs: `--enable_tcmalloc` (TCMalloc), `--enable_jemalloc` (JeMalloc), or `--use_default_allocator` (PTMalloc).
##### Exercise

Let's profile PTMalloc vs. JeMalloc.

We will use the launcher to designate the memory allocator and to bind the workload to the physical cores of the first socket, avoiding any NUMA complication so that we profile the effect of the memory allocator only.

The following example measures the average inference time of ResNet50:
```python
import torch
import torchvision.models as models
import time

model = models.resnet50(pretrained=False)
model.eval()
batch_size = 32
data = torch.rand(batch_size, 3, 224, 224)

# warm up
for _ in range(100):
    model(data)

# measure
# Intel® VTune Profiler's ITT context manager
with torch.autograd.profiler.emit_itt():
    start = time.time()
    for i in range(100):
        # Intel® VTune Profiler's ITT to annotate each step
        torch.profiler.itt.range_push('step_{}'.format(i))
        model(data)
        torch.profiler.itt.range_pop()
    end = time.time()

print('Inference took {:.2f} ms in average'.format((end-start)/100*1000))
```
Let's collect level-1 TMA metrics.

![](../_images/32.png)
Level-1 TMA shows that both PTMalloc and JeMalloc are bound by the backend: more than half of the execution time was stalled in the backend. Let's go one level deeper.

![](../_images/41.png)
Level-2 TMA shows that the Back End Bound was caused by Memory Bound. Let's go one level deeper.

![](../_images/51.png)
Most of the metrics under Memory Bound identify which level of the memory hierarchy, from the L1 cache to main memory, is the bottleneck. A hotspot bounded at a given level indicates that most of the data was being retrieved from that cache or memory level, so optimizations should focus on moving data closer to the core. Level-3 TMA shows that PTMalloc was bottlenecked by DRAM Bound. JeMalloc, on the other hand, was bottlenecked by L1 Bound: JeMalloc moved data closer to the core, resulting in faster execution.

Let's look at the Intel® VTune Profiler ITT trace. In the example script, we have annotated each `step_x` of the inference loop.

![](../_images/61.png)

Each step is traced in the timeline graph. The duration of model inference on the last step (step_99) decreased from 304.308 ms to 261.843 ms.
##### Exercise with TorchServe

Let's profile PTMalloc vs. JeMalloc with TorchServe.

We will use [TorchServe apache-bench benchmarking](https://github.com/pytorch/serve/tree/master/benchmarks#benchmarking-with-apache-bench) with ResNet50 FP32, batch size 32, concurrency 32, and 8960 requests. All other parameters are the same as the [default parameters](https://github.com/pytorch/serve/tree/master/benchmarks#benchmark-parameters).

As in the previous exercise, we will use the launcher to designate the memory allocator and to bind the workload to the physical cores of the first socket. To do so, users simply need to add a few lines in [config.properties](https://pytorch.org/serve/configuration.html#config-properties-file):
PTMalloc

```
cpu_launcher_enable=true
cpu_launcher_args=--node_id 0 --use_default_allocator
```

JeMalloc

```
cpu_launcher_enable=true
cpu_launcher_args=--node_id 0 --enable_jemalloc
```
Let's collect level-1 TMA metrics.

![](../_images/71.png)

Let's go one level deeper.

![](../_images/81.png)
Let's use Intel® VTune Profiler ITT to annotate the [TorchServe inference scope](https://github.com/pytorch/serve/blob/master/ts/torch_handler/base_handler.py#L188) so we can profile at inference-level granularity. Because the [TorchServe architecture](https://github.com/pytorch/serve/blob/master/docs/internals.md#torchserve-architecture) consists of several sub-components, including the Java frontend that handles requests/responses and the Python backend that runs the actual inference on the models, it is helpful to use Intel® VTune Profiler ITT to limit the collection of trace data to the inference level.

![](../_images/9.png)
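A simplified sketch of the idea is shown below; the `handle` function is a hypothetical stand-in for a handler's inference path, not TorchServe's actual `base_handler` code, and only the ITT range push/pop around the forward call reflects the annotation described above.

```python
import torch

def handle(model, input_batch, request_id):
    # hypothetical handler entry point; only the inference call is annotated
    torch.profiler.itt.range_push('inference_{}'.format(request_id))
    with torch.no_grad():
        output = model(input_batch)
    torch.profiler.itt.range_pop()
    return output
```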
Each inference call is traced in the timeline graph. The duration of the last model inference decreased from 561.688 ms to 251.287 ms, a 2.2x speedup.

![](../_images/101.png)

The timeline graph can be expanded to see op-level profiling results. The duration of `aten::conv2d` decreased from 16.401 ms to 6.392 ms, a 2.6x speedup.

In this section, we have demonstrated that JeMalloc can give better performance than the default PyTorch memory allocator, PTMalloc, because its efficient thread-local caches improve the Back End Bound.
### Intel® Extension for PyTorch*

The three major [Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch) optimization techniques (Operator, Graph, Runtime) are as shown:
**Intel® Extension for PyTorch* Optimization Techniques**

| Operator | Graph | Runtime |
|----------|-------|---------|
| Vectorization and multi-threading | Constant folding to reduce compute | Thread affinitization |
| Low-precision BF16/INT8 compute | Op fusion for better cache locality | Memory buffer pooling |
| Data layout optimization for better cache locality | | GPU runtime |
| | | Launcher |
#### Operator Optimization

Optimized operators and kernels are registered through the PyTorch dispatching mechanism. These operators and kernels are accelerated by the native vectorization and matrix-computation features of Intel hardware. During execution, Intel® Extension for PyTorch* intercepts invocations of ATen operators and replaces the original ones with the optimized ones. Popular operators like Convolution and Linear have been optimized in Intel® Extension for PyTorch*.
##### Exercise

Let's profile the optimized operators with Intel® Extension for PyTorch*. We will compare running with and without the code changes marked below.

As in the previous exercises, we will bind the workload to the physical cores of the first socket.
```python
import torch

class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv = torch.nn.Conv2d(16, 33, 3, stride=2)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        return x

model = Model()
model.eval()
data = torch.rand(20, 16, 50, 100)

#################### code changes ####################
import intel_extension_for_pytorch as ipex
model = ipex.optimize(model)
######################################################
print(model)
```
<p>The model consists of two operations, Conv2d and ReLU. By printing the model object, we get the following output.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/11.png"><img alt="../_images/11.png" src="../_images/11.png" style="width: 60%;" /></a>
</div>
<p>Let's collect level-1 TMA metrics.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/121.png"><img alt="../_images/121.png" src="../_images/121.png" style="width: 100%;" /></a>
</div>
<p>Notice that the Back End Bound is reduced from 68.9% to 38.5%, a 1.8x improvement.</p>
<p>Additionally, let's profile with PyTorch Profiler.</p>
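<p>For reference, the PyTorch Profiler results shown below can be collected with a short profiling run. The following is a minimal sketch, not the tutorial's exact script; it assumes the <em>model</em> and <em>data</em> objects defined in the code block above.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Minimal sketch: profile one forward pass on CPU and print per-op totals.
import torch
from torch.profiler import profile, ProfilerActivity

with torch.no_grad():
    with profile(activities=[ProfilerActivity.CPU]) as prof:
        model(data)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
</pre></div>
</div>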
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/131.png"><img alt="../_images/131.png" src="../_images/131.png" style="width: 100%;" /></a>
</div>
<p>Notice that the CPU time is reduced from 851 us to 310 us, a 2.7x speedup.</p>
</div>
</div>
<div class="section" id="graph-optimization">
<h4>Graph Optimization<a class="headerlink" href="#graph-optimization" title="Permalink to this heading">¶</a></h4>
<p>It is highly recommended that users take advantage of Intel® Extension for PyTorch* with <a class="reference external" href="https://pytorch.org/docs/stable/jit.html">TorchScript</a> for further graph optimizations. To optimize performance further with TorchScript, Intel® Extension for PyTorch* supports oneDNN fusion of frequently used FP32/BF16 operator patterns, such as Conv2D+ReLU and Linear+ReLU, to reduce operator/kernel invocation overhead and improve cache locality. Some operator fusions keep temporary calculations, data type conversions, and data layouts in cache for better locality. For INT8, Intel® Extension for PyTorch* has built-in quantization recipes that deliver good statistical accuracy for popular DL workloads including CNN, NLP, and recommendation models. The quantized model is then optimized with oneDNN fusion support.</p>
<div class="section" id="id3">
<h5>Exercise<a class="headerlink" href="#id3" title="Permalink to this heading">¶</a></h5>
<p>Let's profile FP32 graph optimization with TorchScript.</p>
<p>As in the previous exercises, we will bind the workload to physical cores of the first socket.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="k">class</span> <span class="nc">Model</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Model</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">conv</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Conv2d</span><span class="p">(</span><span class="mi">16</span><span class="p">,</span> <span class="mi">33</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="n">stride</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">relu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">ReLU</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">conv</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="k">return</span> <span class="n">x</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">20</span><span class="p">,</span> <span class="mi">16</span><span class="p">,</span> <span class="mi">50</span><span class="p">,</span> <span class="mi">100</span><span class="p">)</span>
<span class="c1">#################### code changes ####################</span>
<span class="kn">import</span> <span class="nn">intel_extension_for_pytorch</span> <span class="k">as</span> <span class="nn">ipex</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">ipex</span><span class="o">.</span><span class="n">optimize</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="c1">######################################################</span>
<span class="c1"># torchscript</span>
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">():</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">trace</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">data</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">freeze</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
</pre></div>
</div>
<p>Let's collect level-1 TMA metrics.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/141.png"><img alt="../_images/141.png" src="../_images/141.png" style="width: 100%;" /></a>
</div>
<p>Notice that the Back End Bound is reduced from 67.1% to 37.5%, a 1.8x improvement.</p>
<p>Additionally, let's profile with PyTorch Profiler.</p>
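<p>A minimal profiling sketch is shown below, assuming the traced and frozen <em>model</em> and the <em>data</em> tensor from the code block above. A few warm-up iterations are included so that the TorchScript profiling executor and the oneDNN fusion passes have already run before measuring.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Minimal sketch: warm up the frozen TorchScript model, then profile one forward pass.
import torch
from torch.profiler import profile, ProfilerActivity

with torch.no_grad():
    for _ in range(3):          # warm-up iterations let the JIT apply graph optimizations
        model(data)
    with profile(activities=[ProfilerActivity.CPU]) as prof:
        model(data)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
</pre></div>
</div>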
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/151.png"><img alt="../_images/151.png" src="../_images/151.png" style="width: 100%;" /></a>
</div>
<p>Notice that with Intel® Extension for PyTorch* the Conv + ReLU operators are fused, and the CPU time is reduced from 803 us to 248 us, a 3.2x speedup. The oneDNN eltwise post-op enables fusing a primitive with an elementwise primitive. This is one of the most popular kinds of fusion: an eltwise operation (typically an activation function such as ReLU) with a preceding convolution or inner product. Have a look at the oneDNN verbose log shown in the next section.</p>
</div>
</div>
<div class="section" id="channels-last-memory-format">
<h4>Channels Last Memory Format<a class="headerlink" href="#channels-last-memory-format" title="Permalink to this heading">¶</a></h4>
<p>When invoking <em>ipex.optimize</em> on the model, Intel® Extension for PyTorch* automatically converts the model to the optimized memory format, channels last. Channels last is a memory format that is more friendly to Intel Architecture. Compared to the PyTorch default channels first NCHW (batch, channels, height, width) memory format, the channels last NHWC (batch, height, width, channels) memory format generally accelerates convolutional neural networks with better cache locality.</p>
<p>One thing to note is that converting the memory format is expensive, so it is better to convert the memory format once prior to deployment and to keep memory format conversions to a minimum during deployment. As the data propagates through the model's layers, the channels last memory format is preserved through consecutive channels-last-supported layers (for example, Conv2d -> ReLU -> Conv2d), and conversions are only made between layers that do not support channels last. See <a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/onednn-developer-guide-and-reference/top/programming-model/memory-format-propagation.html">Memory Format Propagation</a> for more details.</p>
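<p>For illustration, the one-time conversion described above can be done with standard PyTorch APIs. The following is a minimal sketch with a single convolution layer; by default, <em>ipex.optimize</em> performs an equivalent conversion automatically.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import torch

# One-time conversion of the weights to channels last, done prior to deployment.
conv = torch.nn.Conv2d(16, 33, 3, stride=2).eval()
conv = conv.to(memory_format=torch.channels_last)

# Convert the input once; the format is then preserved through channels-last-supported layers.
data = torch.rand(20, 16, 50, 100).to(memory_format=torch.channels_last)

with torch.no_grad():
    out = conv(data)
print(out.is_contiguous(memory_format=torch.channels_last))  # expected: True
</pre></div>
</div>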
<div class="section" id="id4">
<h5>Exercise<a class="headerlink" href="#id4" title="Permalink to this heading">¶</a></h5>
<p>Let's demonstrate channels last optimization.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="k">class</span> <span class="nc">Model</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="nb">super</span><span class="p">(</span><span class="n">Model</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">conv</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Conv2d</span><span class="p">(</span><span class="mi">16</span><span class="p">,</span> <span class="mi">33</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="n">stride</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">relu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">ReLU</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">conv</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="k">return</span> <span class="n">x</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">Model</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">20</span><span class="p">,</span> <span class="mi">16</span><span class="p">,</span> <span class="mi">50</span><span class="p">,</span> <span class="mi">100</span><span class="p">)</span>
<span class="kn">import</span> <span class="nn">intel_extension_for_pytorch</span> <span class="k">as</span> <span class="nn">ipex</span>
<span class="c1">############################### code changes ###############################</span>
<span class="n">ipex</span><span class="o">.</span><span class="n">disable_auto_channels_last</span><span class="p">()</span> <span class="c1"># omit this line for channels_last (default)</span>
<span class="c1">############################################################################</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">ipex</span><span class="o">.</span><span class="n">optimize</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">no_grad</span><span class="p">():</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">trace</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">data</span><span class="p">)</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">freeze</span><span class="p">(</span><span class="n">model</span><span class="p">)</span>
</pre></div>
</div>
<p>We will use <a class="reference external" href="https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html">oneDNN verbose mode</a>, a tool that helps collect information at the oneDNN graph level, such as operator fusions and the kernel execution time spent executing oneDNN primitives. For more information, refer to the <a class="reference external" href="https://oneapi-src.github.io/oneDNN/index.html">oneDNN Documentation</a>.</p>
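<p>The verbose output below can be reproduced with a sketch like the following. It assumes the <em>DNNL_VERBOSE</em> environment variable (documented by oneDNN) is set before the first oneDNN primitive executes; it can equally be set in the shell when launching the script.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import os
os.environ["DNNL_VERBOSE"] = "1"   # enable oneDNN verbose logging before any oneDNN call

import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Sequential(torch.nn.Conv2d(16, 33, 3, stride=2), torch.nn.ReLU()).eval()
data = torch.rand(20, 16, 50, 100)

model = ipex.optimize(model)
with torch.no_grad():
    model = torch.jit.trace(model, data)
    model = torch.jit.freeze(model)
    model(data)   # each executed oneDNN primitive is logged to stdout
</pre></div>
</div>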
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/161.png"><img alt="../_images/161.png" src="../_images/161.png" style="width: 15%;" /></a>
</div>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/171.png"><img alt="../_images/171.png" src="../_images/171.png" style="width: 100%;" /></a>
</div>
<p>Above is the oneDNN verbose output with channels first. We can verify that the weights and data are reordered, the computation is performed, and finally the output is reordered back.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/181.png"><img alt="../_images/181.png" src="../_images/181.png" style="width: 80%;" /></a>
</div>
<p>Above is the oneDNN verbose output with channels last. We can verify that the channels last memory format avoids unnecessary reorders.</p>
</div>
</div>
</div>
<div class="section" id="performance-boost-with-intel-extension-for-pytorch">
<h3>Performance Boost with Intel® Extension for PyTorch*<a class="headerlink" href="#performance-boost-with-intel-extension-for-pytorch" title="Permalink to this heading">¶</a></h3>
<p>The chart below summarizes the performance boost of TorchServe with Intel® Extension for PyTorch* for ResNet50 and BERT-base-uncased.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/191.png"><img alt="../_images/191.png" src="../_images/191.png" style="width: 100%;" /></a>
</div>
</div>
<div class="section" id="id5">
<h3>Exercise with TorchServe<a class="headerlink" href="#id5" title="Permalink to this heading">¶</a></h3>
<p>Let's profile Intel® Extension for PyTorch* optimizations with TorchServe.</p>
<p>We will use <a class="reference external" href="https://github.com/pytorch/serve/tree/master/benchmarks#benchmarking-with-apache-bench">TorchServe apache-bench benchmarking</a> with ResNet50 FP32 TorchScript, batch size 32, concurrency 32, and 8960 requests. All other parameters are the same as the <a class="reference external" href="https://github.com/pytorch/serve/tree/master/benchmarks#benchmark-parameters">default parameters</a>.</p>
<p>As in the previous exercise, we will use the launcher to bind the workload to the physical cores of the first socket. To do so, the user simply needs to add a few lines to <a class="reference external" href="https://github.com/pytorch/serve/tree/master/benchmarks#benchmark-parameters">config.properties</a>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">cpu_launcher_enable</span><span class="o">=</span><span class="n">true</span>
<span class="n">cpu_launcher_args</span><span class="o">=--</span><span class="n">node_id</span> <span class="mi">0</span>
</pre></div>
</div>
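<p>For the Intel® Extension for PyTorch* runs, the extension also needs to be enabled in TorchServe. A minimal <em>config.properties</em> sketch is shown below; it assumes the <em>ipex_enable</em> property documented in the TorchServe repository and that <em>intel_extension_for_pytorch</em> is installed in the serving environment.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>ipex_enable=true
cpu_launcher_enable=true
cpu_launcher_args=--node_id 0
</pre></div>
</div>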
<p>Let's collect level-1 TMA metrics.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/20.png"><img alt="../_images/20.png" src="../_images/20.png" style="width: 100%;" /></a>
</div>
<p>Level-1 TMA shows that both runs are bound by the back end. As discussed earlier, the majority of untuned deep learning workloads will be Back End Bound. Notice that the Back End Bound is reduced from 70.0% to 54.1%. Let's go one level deeper.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/211.png"><img alt="../_images/211.png" src="../_images/211.png" style="width: 100%;" /></a>
</div>
<p>As discussed earlier, Back End Bound has two submetrics: Memory Bound and Core Bound. Memory Bound indicates that the workload is under-optimized or under-utilized; ideally, memory-bound operations can be improved to core-bound by optimizing the ops and improving cache locality. Level-2 TMA shows that the Back End Bound shifted from Memory Bound to Core Bound. Let's go one level deeper.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/221.png"><img alt="../_images/221.png" src="../_images/221.png" style="width: 100%;" /></a>
</div>
<p>Scaling deep learning models for production on a model serving framework like TorchServe requires high compute utilization. This requires that data be available through prefetching and reused in cache when the execution units need it to execute the uOps. Level-3 TMA shows that the Back End Memory Bound shifted from DRAM Bound to Core Bound.</p>
<p>As in the previous exercise with TorchServe, let's use Intel® VTune™ Profiler's ITT to annotate the <a class="reference external" href="https://github.com/pytorch/serve/blob/master/ts/torch_handler/base_handler.py#L188">TorchServe inference scope</a> to profile at inference-level granularity.</p>
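<p>For reference, the sketch below shows how an inference scope can be annotated with ITT in plain PyTorch; TorchServe's handler does this internally. The <em>model_inference</em> range name is illustrative, and the <em>torch.profiler.itt</em> helpers are assumed to be available in the installed PyTorch build.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import torch

def annotated_inference(model, data):
    # Emit ITT events for ATen ops so VTune can show op-level detail in the timeline.
    with torch.autograd.profiler.emit_itt():
        if torch.profiler.itt.is_available():
            torch.profiler.itt.range_push("model_inference")  # illustrative range name
        try:
            with torch.no_grad():
                return model(data)
        finally:
            if torch.profiler.itt.is_available():
                torch.profiler.itt.range_pop()
</pre></div>
</div>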
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/231.png"><img alt="../_images/231.png" src="../_images/231.png" style="width: 100%;" /></a>
</div>
<p>Each inference call is traced in the timeline graph. The duration of the last inference call decreased from 215.731 ms to 95.634 ms, a 2.3x speedup.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/241.png"><img alt="../_images/241.png" src="../_images/241.png" style="width: 100%;" /></a>
</div>
<p>The timeline graph can be expanded to see op-level profiling results. Notice that Conv + ReLU has been fused, and the duration decreased from 6.393 ms + 1.731 ms to 3.408 ms, a 2.4x speedup.</p>
</div>
</div>
<div class="section" id="conclusion">
<h2>Conclusion<a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
<p>In this tutorial, we have used the Top-down Microarchitecture Analysis (TMA) method and Intel® VTune™ Profiler's Instrumentation and Tracing Technology (ITT) to demonstrate that:</p>
<ul class="simple">
<li><p>The primary bottleneck of under-optimized or under-tuned deep learning workloads is often Back End Bound, which has two submetrics, Memory Bound and Core Bound.</p></li>
<li><p>A more efficient memory allocator, operator fusion, and memory layout optimization by Intel® Extension for PyTorch* improve Memory Bound.</p></li>
<li><p>Key deep learning primitives, such as convolution, matrix multiplication, and dot-product, have been well optimized by Intel® Extension for PyTorch* and the oneDNN library, improving Core Bound.</p></li>
<li><p>Intel® Extension for PyTorch* has been integrated into TorchServe with an ease-of-use API.</p></li>
<li><p>TorchServe with Intel® Extension for PyTorch* shows 7.71x throughput speedup for ResNet50, and 2.20x throughput speedup for BERT.</p></li>
</ul>
</div>
<div class="section" id="related-readings">
<h2>Related Readings<a class="headerlink" href="#related-readings" title="Permalink to this heading">¶</a></h2>
<p><a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/vtune-cookbook/top/methodologies/top-down-microarchitecture-analysis-method.html">Top-down Microarchitecture Analysis Method</a></p>
<p><a class="reference external" href="https://easyperf.net/blog/2019/02/09/Top-Down-performance-analysis-methodology">Top-Down performance analysis methodology</a></p>
<p><a class="reference external" href="https://medium.com/pytorch/accelerating-pytorch-with-intel-extension-for-pytorch-3aef51ea3722">Accelerating PyTorch with Intelยฎ Extension for PyTorch*</a></p>
</div>
<div class="section" id="acknowledgement">
<h2>Acknowledgement<a class="headerlink" href="#acknowledgement" title="Permalink to this heading">¶</a></h2>
<p>We would like to thank Ashok Emani (Intel) and Jiong Gong (Intel) for their immense guidance and support, and thorough feedback and reviews throughout many steps of this tutorial. We would also like to thank Hamid Shojanazeri (Meta) and Li Ning (AWS) for their helpful feedback in code review and the tutorial.</p>
</div>
</div>
</article>
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="nvfuser_intro_tutorial.html" class="btn btn-neutral float-right" title="Getting Started - Accelerate Your Scripts with nvFuser" accesskey="n" rel="next">Next <img src="../_static/images/chevron-right-orange.svg" class="next-page"></a>
<a href="torchserve_with_ipex.html" class="btn btn-neutral" title="Grokking PyTorch Intel CPU performance from first principles" accesskey="p" rel="prev"><img src="../_static/images/chevron-right-orange.svg" class="previous-page"> Previous</a>
</div>
<hr class="community-hr hr-top" />
<div class="community-container">
<div class="community-prompt">๋ ๊ถ๊ธํ์๊ฑฐ๋ ๊ฐ์ ํ ๋ด์ฉ์ด ์์ผ์ ๊ฐ์? ์ปค๋ฎค๋ํฐ์ ์ฐธ์ฌํด๋ณด์ธ์!</div>
<div class="community-link"><a href="https://discuss.pytorch.kr/" aria-label="PyTorchKoreaCommunity">ํ๊ตญ์ด ์ปค๋ฎค๋ํฐ ๋ฐ๋ก๊ฐ๊ธฐ</a></div>
</div>
<hr class="community-hr hr-bottom"/>
<hr class="rating-hr hr-top" />
<div class="rating-container">
<div class="rating-prompt">์ด ํํ ๋ฆฌ์ผ์ด ์ด๋ ์
จ๋์? ํ๊ฐํด์ฃผ์๋ฉด ์ดํ ๊ฐ์ ์ ์ฐธ๊ณ ํ๊ฒ ์ต๋๋ค! :)</div>
<div class="stars-outer">
<i class="far fa-star" title="1 Star" data-behavior="tutorial-rating" data-count="1"></i>
<i class="far fa-star" title="2 Stars" data-behavior="tutorial-rating" data-count="2"></i>
<i class="far fa-star" title="3 Stars" data-behavior="tutorial-rating" data-count="3"></i>
<i class="far fa-star" title="4 Stars" data-behavior="tutorial-rating" data-count="4"></i>
<i class="far fa-star" title="5 Stars" data-behavior="tutorial-rating" data-count="5"></i>
</div>
</div>
<hr class="rating-hr hr-bottom"/>
<div role="contentinfo">
<p>
© Copyright 2018-2024, PyTorch &amp; PyTorch Korea User Group.
</p>
</div>
<div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</div>
</footer>
</div>
<script>
if((window.location.href.indexOf("/prototype/")!= -1) && (window.location.href.indexOf("/prototype/prototype_index")< 1))
{
var div = '<div class="admonition note"><p class="admonition-title">Note</p><p><i class="fa fa-flask" aria-hidden="true"> </i> This tutorial describes prototype features. Prototype features are generally provided for feedback and testing, and are not available in binaries distributed on PyPI or Conda without a runtime flag.</p></div>'
document.getElementById("pytorch-article").insertAdjacentHTML('afterBegin', div)
}
</script>
</div>
<div class="pytorch-content-right" id="pytorch-content-right">
<div class="pytorch-right-menu" id="pytorch-right-menu">
<div class="pytorch-side-scroll" id="pytorch-side-scroll-right">
<ul>
<li><a class="reference internal" href="#">Grokking PyTorch Intel CPU performance from first principles (Part 2)</a><ul>
<li><a class="reference internal" href="#prerequisites">Prerequisites</a><ul>
<li><a class="reference internal" href="#top-down-microarchitecture-analysis-method-tma">Top-down Microarchitecture Analysis Method (TMA)</a><ul>
<li><a class="reference internal" href="#tune-for-the-back-end-bound">Tune for the Back End Bound</a></li>
</ul>
</li>
<li><a class="reference internal" href="#intel-vtune-profiler-s-instrumentation-and-tracing-technology-itt">Intelยฎ VTuneโข Profilerโs Instrumentation and Tracing Technology (ITT)</a></li>
</ul>
</li>
<li><a class="reference internal" href="#torchserve-with-intel-extension-for-pytorch">TorchServe with Intelยฎ Extension for PyTorch*</a><ul>
<li><a class="reference internal" href="#leveraging-advanced-launcher-configuration-memory-allocator">Leveraging Advanced Launcher Configuration: Memory Allocator</a><ul>
<li><a class="reference internal" href="#tcmalloc-jemalloc-ptmalloc">TCMalloc, JeMalloc, PTMalloc</a><ul>
<li><a class="reference internal" href="#exercise">Exercise</a></li>
<li><a class="reference internal" href="#exercise-with-torchserve">Exercise with TorchServe</a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#id1">Intelยฎ Extension for PyTorch*</a><ul>
<li><a class="reference internal" href="#operator-optimization">Operator Optimization</a><ul>
<li><a class="reference internal" href="#id2">Exercise</a></li>
</ul>
</li>
<li><a class="reference internal" href="#graph-optimization">Graph Optimization</a><ul>
<li><a class="reference internal" href="#id3">Exercise</a></li>
</ul>
</li>
<li><a class="reference internal" href="#channels-last-memory-format">Channels Last Memory Format</a><ul>
<li><a class="reference internal" href="#id4">Exercise</a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#performance-boost-with-intel-extension-for-pytorch">Performance Boost with Intelยฎ Extension for PyTorch*</a></li>
<li><a class="reference internal" href="#id5">Exercise with TorchServe</a></li>
</ul>
</li>
<li><a class="reference internal" href="#conclusion">Conclusion</a></li>
<li><a class="reference internal" href="#related-readings">Related Readings</a></li>
<li><a class="reference internal" href="#acknowledgement">Acknowledgement</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</section>
</div>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script src="../_static/translations.js"></script>
<script src="../_static/katex.min.js"></script>
<script src="../_static/auto-render.min.js"></script>
<script src="../_static/katex_autorenderer.js"></script>
<script src="../_static/design-tabs.js"></script>
<script type="text/javascript" src="../_static/js/vendor/popper.min.js"></script>
<script type="text/javascript" src="../_static/js/vendor/bootstrap.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/list.js/1.5.0/list.min.js"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script>
// Helper function to make it easier to call dataLayer.push()
function gtag(){window.dataLayer.push(arguments);}
//add microsoft link
if(window.location.href.indexOf("/beginner/basics/")!= -1)
{
var url="https://docs.microsoft.com/learn/paths/pytorch-fundamentals/?wt.mc_id=aiml-7486-cxa";
switch(window.location.pathname.split("/").pop().replace('.html',''))
{
case"quickstart_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/9-quickstart?WT.mc_id=aiml-7486-cxa";
break;
case"tensorqs_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/2-tensors?WT.mc_id=aiml-7486-cxa";
break;
case"data_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/3-data?WT.mc_id=aiml-7486-cxa";
break;
case"transforms_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/4-transforms?WT.mc_id=aiml-7486-cxa";
break;
case"buildmodel_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/5-model?WT.mc_id=aiml-7486-cxa";
break;
case"autogradqs_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/6-autograd?WT.mc_id=aiml-7486-cxa";