<div class="section" id="grokking-pytorch-intel-cpu-performance-from-first-principles">
<h1>Grokking PyTorch Intel CPU performance from first principles<a class="headerlink" href="#grokking-pytorch-intel-cpu-performance-from-first-principles" title="์ด ์ ๋ชฉ์ ๋ํ ํผ๋จธ๋งํฌ">ยถ</a></h1>
A case study on the TorchServe inference framework optimized with [Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch).

Authors: Min Jean Cho, Mark Saroufim

Reviewers: Ashok Emani, Jiong Gong
Getting strong out-of-the-box performance for deep learning on CPUs can be tricky, but it's much easier if you're aware of the main problems that affect performance, how to measure them, and how to solve them.

TL;DR
<table class="docutils align-default">
<colgroup>
<col style="width: 11%" />
<col style="width: 59%" />
<col style="width: 30%" />
</colgroup>
<tbody>
<tr class="row-odd"><td><p>Problem</p></td>
<td><p>How to measure it</p></td>
<td><p>Solution</p></td>
</tr>
<tr class="row-even"><td><p>Bottlenecked GEMM execution units</p></td>
<td><ul class="simple">
<li><p><a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/top/reference/cpu-metrics-reference/spin-time/imbalance-or-serial-spinning-1.html">Imbalance or Serial Spinning</a></p></li>
<li><p><a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/top/reference/cpu-metrics-reference/front-end-bound.html">Front-End Bound</a></p></li>
<li><p><a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/top/reference/cpu-metrics-reference/back-end-bound.html">Core Bound</a></p></li>
</ul>
</td>
<td><p>Avoid using logical cores by setting thread affinity to physical cores via core pinning</p></td>
</tr>
<tr class="row-odd"><td><p>Non Uniform Memory Access (NUMA)</p></td>
<td><ul class="simple">
<li><p>Local vs. remote memory access</p></li>
<li><p><a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/top/reference/cpu-metrics-reference/memory-bound/dram-bound/upi-utilization-bound.html">UPI Utilization</a></p></li>
<li><p>Latency in memory accesses</p></li>
<li><p>Thread migration</p></li>
</ul>
</td>
<td><p>Avoid cross-socket computation by setting thread affinity to a specific socket via core pinning</p></td>
</tr>
</tbody>
</table>
*GEMM (General Matrix Multiply)* runs on fused-multiply-add (FMA) or dot-product (DP) execution units, which become a bottleneck and cause threads to wait/*spin at the synchronization barrier* when *hyperthreading* is enabled: because each logical thread *contends for the same core resources*, using logical cores provides insufficient concurrency for all working threads. If we instead use 1 thread per physical core, we avoid this contention. So we generally recommend *avoiding logical cores* by setting CPU *thread affinity* to physical cores via *core pinning*.
Multi-socket systems have *Non-Uniform Memory Access (NUMA)*, a shared-memory architecture that describes the placement of main memory modules with respect to processors. If a process is not NUMA-aware, slow *remote memory* is frequently accessed when *threads migrate* across sockets via the *Intel Ultra Path Interconnect (UPI)* at runtime. We address this problem by setting CPU *thread affinity* to a specific socket via *core pinning*.
With these principles in mind, proper CPU runtime configuration can significantly boost out-of-the-box performance.
In this blog, we'll walk you through the important runtime configurations from the [CPU Performance Tuning Guide](https://tutorials.pytorch.kr/recipes/recipes/tuning_guide.html#cpu-specific-optimizations), explain how they work, how to profile them, and how to integrate them within a model serving framework like [TorchServe](https://github.com/pytorch/serve) via an easy-to-use [launch script](https://github.com/intel/intel-extension-for-pytorch/blob/master/docs/tutorials/performance_tuning/launch_script.md), which we've [integrated](https://github.com/pytorch/serve/pull/1354) natively [1].

We'll explain all of these ideas **visually**, from **first principles**, with lots of **profiles**, and show you how we applied our learnings to improve out-of-the-box CPU performance on TorchServe.
<ol class="arabic simple">
<li><p>The feature has to be explicitly enabled by setting <em>cpu_launcher_enable=true</em> in <em>config.properties</em>.</p></li>
</ol>
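For reference, per the footnote above, enabling the launcher amounts to a single line in TorchServe's `config.properties`:

```
cpu_launcher_enable=true
```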
<div class="section" id="avoid-logical-cores-for-deep-learning">
<h2>Avoid logical cores for deep learning<a class="headerlink" href="#avoid-logical-cores-for-deep-learning" title="์ด ์ ๋ชฉ์ ๋ํ ํผ๋จธ๋งํฌ">ยถ</a></h2>
Avoiding logical cores for deep learning workloads generally improves performance. To understand this, let us take a step back to GEMM.

**Optimizing GEMM optimizes deep learning**
The majority of time in deep learning training or inference is spent on millions of repeated GEMM operations, which sit at the core of fully connected layers. Fully connected layers have been used for decades, ever since multi-layer perceptrons (MLPs) [proved to be universal approximators of any continuous function](https://en.wikipedia.org/wiki/Universal_approximation_theorem). Any MLP can be entirely represented as GEMM, and even a convolution can be represented as a GEMM by using a [Toeplitz matrix](https://en.wikipedia.org/wiki/Toeplitz_matrix).
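As a quick, self-contained illustration (a sketch, not part of the tutorial's exercises), the forward pass of a fully connected layer is exactly one GEMM plus a bias add:

```python
import torch
import torch.nn as nn

# A fully connected layer computes y = x @ W^T + b, i.e., one GEMM.
fc = nn.Linear(in_features=4, out_features=3)
x = torch.randn(8, 4)  # a batch of 8 samples

y_gemm = x @ fc.weight.T + fc.bias
assert torch.allclose(fc(x), y_gemm)
```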
Returning to the original topic: most GEMM operators benefit from not using hyperthreading, because the majority of time in deep learning training or inference is spent in millions of repeated GEMM operations running on the fused-multiply-add (FMA) or dot-product (DP) execution units shared by hyperthread siblings. With hyperthreading enabled, OpenMP threads will contend for the same GEMM execution units.
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/1_.png"><img alt="../_images/1_.png" src="../_images/1_.png" style="width: 70%;" /></a>
</div>
If 2 logical threads run GEMM at the same time, they share the same core resources, causing front-end bound stalls, and the overhead from being front-end bound is greater than the gain from running both logical threads at the same time.
Therefore we generally recommend avoiding logical cores for deep learning workloads to achieve good performance. By default, the launch script uses physical cores only; however, users can easily experiment with logical vs. physical cores by simply toggling the `--use_logical_core` launch script knob.

**Exercise**

We'll use the following example of feeding ResNet50 a dummy tensor:

<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="kn">import</span> <span class="nn">torchvision.models</span> <span class="k">as</span> <span class="nn">models</span>
<span class="kn">import</span> <span class="nn">time</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">models</span><span class="o">.</span><span class="n">resnet50</span><span class="p">(</span><span class="n">pretrained</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="n">model</span><span class="o">.</span><span class="n">eval</span><span class="p">()</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">224</span><span class="p">,</span> <span class="mi">224</span><span class="p">)</span>
<span class="c1"># warm up</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">):</span>
<span class="n">model</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">100</span><span class="p">):</span>
<span class="n">model</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="n">end</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="nb">print</span><span class="p">(</span><span class="s1">'Inference took </span><span class="si">{:.2f}</span><span class="s1"> ms in average'</span><span class="o">.</span><span class="n">format</span><span class="p">((</span><span class="n">end</span><span class="o">-</span><span class="n">start</span><span class="p">)</span><span class="o">/</span><span class="mi">100</span><span class="o">*</span><span class="mi">1000</span><span class="p">))</span>
</pre></div>
</div>
Throughout the blog, we'll use [Intel® VTune™ Profiler](https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html#gs.v4egjg) to profile and verify optimizations, and we'll run all exercises on a machine with two Intel(R) Xeon(R) Platinum 8180M CPUs. The CPU information is shown in Figure 2.1.
The environment variable `OMP_NUM_THREADS` is used to set the number of threads for parallel regions. We'll compare `OMP_NUM_THREADS=2` with (1) use of logical cores and (2) use of physical cores only.
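The launch script takes care of this pinning for you; purely as a minimal Linux-only sketch of the two cases (using this machine's numbering, where cores 0 and 56 are hyperthread siblings of the same physical core), the setup could look like:

```python
import os

# OMP_NUM_THREADS must be set before torch (and its OpenMP runtime) loads.
os.environ["OMP_NUM_THREADS"] = "2"

# Case (1): both threads on the hyperthread siblings of one physical core.
# For case (2), use {0, 1} instead so each thread gets its own physical core.
os.sched_setaffinity(0, {0, 56})

import torch  # noqa: E402
print(torch.get_num_threads())  # 2
```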
<ol class="arabic simple">
<li><p>Both OpenMP threads trying to utilize the same GEMM execution units shared by hyperthreading cores (0, 56)</p></li>
</ol>
<p>We can visualize this by running <code class="docutils literal notranslate"><span class="pre">htop</span></code> command on Linux as shown below.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/2.png"><img alt="../_images/2.png" src="../_images/2.png" style="width: 100%;" /></a>
</div>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/3.png"><img alt="../_images/3.png" src="../_images/3.png" style="width: 100%;" /></a>
</div>
We notice that Spin Time is flagged, and that Imbalance or Serial Spinning contributed the majority of it: 4.980 seconds out of the 8.982 seconds total. The Imbalance or Serial Spinning when using logical cores is due to insufficient concurrency of working threads, as each logical thread contends for the same core resources.

The Top Hotspots section of the execution summary indicates that `__kmp_fork_barrier` took 4.589 seconds of CPU time: during 9.33% of the CPU execution time, threads were just spinning at this barrier due to thread synchronization.
<ol class="arabic simple" start="2">
<li><p>Each OpenMP thread utilizing GEMM execution units in respective physical cores (0,1)</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/4.png"><img alt="../_images/4.png" src="../_images/4.png" style="width: 80%;" /></a>
</div>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/5.png"><img alt="../_images/5.png" src="../_images/5.png" style="width: 80%;" /></a>
</div>
We first note that the execution time dropped from 32 seconds to 23 seconds by avoiding logical cores. While there's still some non-negligible Imbalance or Serial Spinning, it improves from 4.980 seconds to 3.887 seconds.

By not using logical threads (instead using 1 thread per physical core), we avoid logical threads contending for the same core resources. The Top Hotspots section likewise shows `__kmp_fork_barrier` time improving from 4.589 seconds to 3.530 seconds.
## Local memory access is always faster than remote memory access
We generally recommend binding a process to a local socket so that the process does not migrate across sockets. The goal of doing so is to utilize the high-speed cache on local memory and to avoid remote memory access, which can be ~2x slower.
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/6.png"><img alt="../_images/6.png" src="../_images/6.png" style="width: 80%;" /></a>
</div>
<p>Figure 1. Two-socket configuration</p>
Figure 1 shows a typical two-socket configuration. Notice that each socket has its own local memory. Sockets are connected to each other via the Intel Ultra Path Interconnect (UPI), which allows each socket to access the local memory of the other socket, called remote memory. Local memory access is always faster than remote memory access.
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/7.png"><img alt="../_images/7.png" src="../_images/7.png" style="width: 50%;" /></a>
</div>
<p>Figure 2.1. CPU information</p>
Users can get their CPU information by running the `lscpu` command on their Linux machine. Figure 2.1 shows an example of `lscpu` output on a machine with two Intel(R) Xeon(R) Platinum 8180M CPUs. Notice that there are 28 cores per socket and 2 threads per core (i.e., hyperthreading is enabled). In other words, there are 28 logical cores in addition to the 28 physical cores, giving a total of 56 cores per socket. And there are 2 sockets, giving a total of 112 cores (`Thread(s) per core` x `Core(s) per socket` x `Socket(s)`).
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/8.png"><img alt="../_images/8.png" src="../_images/8.png" style="width: 100%;" /></a>
</div>
<p>Figure 2.2. CPU information</p>
The 2 sockets are mapped to 2 NUMA nodes (NUMA node 0 and NUMA node 1), respectively, and physical cores are indexed before logical cores. As shown in Figure 2.2, the first 28 physical cores (0-27) and the first 28 logical cores (56-83), all on the first socket, are on NUMA node 0; the second 28 physical cores (28-55) and the second 28 logical cores (84-111), all on the second socket, are on NUMA node 1. Cores on the same socket share local memory and the last level cache (LLC), which is much faster than cross-socket communication via Intel UPI.
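Restating that mapping as a sketch (the exact numbering is machine-dependent; check `lscpu` on yours):

```python
# Core indexing on the two-socket Xeon 8180M machine in Figure 2.2:
numa_node_0 = list(range(0, 28)) + list(range(56, 84))    # physical 0-27, logical 56-83
numa_node_1 = list(range(28, 56)) + list(range(84, 112))  # physical 28-55, logical 84-111
```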
Now that we understand NUMA, cross-socket (UPI) traffic, and local vs. remote memory access in multi-processor systems, let's profile and verify our understanding.
**Exercise**

We'll reuse the ResNet50 example above.

Since we did not pin threads to the processor cores of a specific socket, the operating system periodically schedules threads onto processor cores located on different sockets.
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/9.gif"><img alt="../_images/9.gif" src="../_images/9.gif" style="width: 100%;" /></a>
</div>
<p>Figure 3. CPU usage of non NUMA-aware application. 1 main worker thread was launched, then it launched a physical core number (56) of threads on all cores, including logical cores.</p>
(Aside: if the number of threads is not set with [torch.set_num_threads](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html), the default number of threads is the number of physical cores on a hyperthreading-enabled system. This can be verified with [torch.get_num_threads](https://pytorch.org/docs/stable/generated/torch.get_num_threads.html). Hence, above, we see about half of the cores busy running the example script.)
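You can verify this quickly; on the machine above, the snippet below prints 56:

```python
import torch

# Without an explicit torch.set_num_threads call, the default equals the
# number of physical cores, even with hyperthreading enabled.
print(torch.get_num_threads())  # 56 on this 2-socket, 28-cores-per-socket machine
```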
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/10.png"><img alt="../_images/10.png" src="../_images/10.png" style="width: 100%;" /></a>
</div>
<p>Figure 4. Non-Uniform Memory Access Analysis graph</p>
Figure 4 compares local vs. remote memory access over time. We verify usage of remote memory, which can result in sub-optimal performance.

**Set thread affinity to reduce remote memory access and cross-socket (UPI) traffic**
Pinning threads to cores on the same socket helps maintain locality of memory access. In this example, we'll pin to the physical cores of the first NUMA node (0-27). With the launch script, users can easily experiment with NUMA node configurations by simply toggling the `--node_id` launch script knob.
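Outside the launch script, a minimal Linux-only sketch of the same pinning (physical cores of NUMA node 0, with this machine's numbering) could be:

```python
import os

# Restrict the process to the physical cores of NUMA node 0 (0-27 here)
# before torch creates its OpenMP thread pool. Note this pins compute only;
# the launch script additionally binds memory allocation to the node
# (e.g., via numactl --membind).
os.sched_setaffinity(0, set(range(28)))

import torch  # noqa: E402
```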
Let's visualize the CPU usage now.
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/11.gif"><img alt="../_images/11.gif" src="../_images/11.gif" style="width: 100%;" /></a>
</div>
<p>Figure 5. CPU usage of NUMA-aware application</p>
<p>1 main worker thread was launched, then it launched threads on all physical cores on the first numa node.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/12.png"><img alt="../_images/12.png" src="../_images/12.png" style="width: 100%;" /></a>
</div>
<p>Figure 6. Non-Uniform Memory Access Analysis graph</p>
<p>As shown in Figure 6., now almost all memory accesses are local accesses.</p>
## Efficient CPU usage with core pinning for multi-worker inference
When running multi-worker inference, cores are overlapped (or shared) between workers, causing inefficient CPU usage. To address this problem, the launch script divides the available cores equally among the workers, so that each worker is pinned to its assigned cores at runtime.
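In code, the division the launcher performs looks roughly like this sketch (using this machine's numbers; not the launcher's actual implementation):

```python
# 56 physical cores split evenly across 4 workers -> 14 cores per worker.
physical_cores, num_workers = 56, 4
cores_per_worker = physical_cores // num_workers

for worker_id in range(num_workers):
    first = worker_id * cores_per_worker
    last = first + cores_per_worker - 1
    print(f"worker {worker_id}: pinned to physical cores {first}-{last}")
```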
**Exercise with TorchServe**

For this exercise, let's apply the CPU performance tuning principles and recommendations we've discussed so far to [TorchServe apache-bench benchmarking](https://github.com/pytorch/serve/tree/master/benchmarks#benchmarking-with-apache-bench).

We'll use ResNet50 with 4 workers, concurrency 100, and 10,000 requests. All other parameters (e.g., batch_size, input, etc.) are the same as the [default parameters](https://github.com/pytorch/serve/blob/master/benchmarks/benchmark-ab.py#L18).
We'll compare the following three configurations:

1. Default TorchServe setting (no core pinning)
2. [torch.set_num_threads](https://pytorch.org/docs/stable/generated/torch.set_num_threads.html) = `number of physical cores / number of workers` (no core pinning; see the sketch below)
3. Core pinning via the launch script (requires TorchServe >= 0.6.1)
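Configuration 2 amounts to one call in each worker, as in this sketch (56 physical cores / 4 workers = 14 threads each):

```python
import torch

# Cap each of the 4 workers at an equal share of the 56 physical cores.
# Threads are still free to float across cores and sockets (no pinning).
torch.set_num_threads(56 // 4)
```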
After this exercise, we'll have verified with a real TorchServe use case that we prefer avoiding logical cores and prefer local memory access via core pinning.
<div class="section" id="default-torchserve-setting-no-core-pinning">
<h2>1. Default TorchServe setting (no core pinning)<a class="headerlink" href="#default-torchserve-setting-no-core-pinning" title="์ด ์ ๋ชฉ์ ๋ํ ํผ๋จธ๋งํฌ">ยถ</a></h2>
<p>The <a class="reference external" href="https://github.com/pytorch/serve/blob/master/ts/torch_handler/base_handler.py">base_handler</a> doesnโt explicitly set <a class="reference external" href="https://pytorch.org/docs/stable/generated/torch.set_num_threads.html">torch.set_num_threads</a>. Hence the default number of threads is the number of physical CPU cores as described <a class="reference external" href="https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html#runtime-api">here</a>. Users can check the number of threads by <a class="reference external" href="https://pytorch.org/docs/stable/generated/torch.get_num_threads.html">torch.get_num_threads</a> in the base_handler. Each of the 4 main worker threads launches a physical core number (56) of threads, launching a total of 56x4 = 224 threads, which is more than the total number of cores 112. Therefore cores are guaranteed to be heavily overlapped with high logical core utilization- multiple workers using multiple cores at the same time. Furthermore, because threads are not affinitized to specific CPU cores, the operating system periodically schedules threads to cores located in different sockets.</p>
<ol class="arabic simple">
<li><p>CPU usage</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/13.png"><img alt="../_images/13.png" src="../_images/13.png" style="width: 100%;" /></a>
</div>
<p>4 main worker threads were launched, then each launched a physical core number (56) of threads on all cores, including logical cores.</p>
<ol class="arabic simple" start="2">
<li><p>Core Bound stalls</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/14.png"><img alt="../_images/14.png" src="../_images/14.png" style="width: 80%;" /></a>
</div>
We observe a very high Core Bound stall rate of 88.4%, decreasing pipeline efficiency. Core Bound stalls indicate sub-optimal use of the available execution units in the CPU. For example, several GEMM instructions in a row competing for the fused-multiply-add (FMA) or dot-product (DP) execution units shared by hyperthread siblings can cause Core Bound stalls. And as described in the previous section, use of logical cores amplifies this problem.
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/15.png"><img alt="../_images/15.png" src="../_images/15.png" style="width: 40%;" /></a>
</div>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/16.png"><img alt="../_images/16.png" src="../_images/16.png" style="width: 50%;" /></a>
</div>
<p>An empty pipeline slot not filled with micro-ops (uOps) is attributed to a stall. For example, without core pinning, CPU time may be spent not on compute but on other operations, such as thread scheduling by the Linux kernel. We see above that <code class="docutils literal notranslate"><span class="pre">__sched_yield</span></code> contributed the majority of the Spin Time.</p>
<ol class="arabic simple" start="3">
<li><p>Thread Migration</p></li>
</ol>
<p>Without core pinning, the scheduler may migrate a thread executing on one core to a different core. Thread migration can disassociate the thread from data that has already been fetched into the caches, resulting in longer data access latencies. This problem is exacerbated in NUMA systems when a thread migrates across sockets: data that had been fetched into the high-speed cache backed by local memory must now be served from remote memory, which is much slower.</p>
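<p>On Linux, the absence of pinning can be observed directly from the process affinity mask; a minimal sketch:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import os

# The set of logical CPUs the current process may be scheduled on.
# Without pinning this is all 112 logical CPUs on this machine; with
# launcher core pinning, a worker would see only its assigned range,
# e.g. cores 0-13.
print(os.sched_getaffinity(0))
</pre></div>
</div>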
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/17.png"><img alt="../_images/17.png" src="../_images/17.png" style="width: 50%;" /></a>
</div>
<p>Generally the total number of threads should be less than or equal to the total number of hardware threads supported by the cores. In the above example, we notice a large number of threads executing on core_51 instead of the expected 2 threads (since hyperthreading is enabled on Intel(R) Xeon(R) Platinum 8180 CPUs). This indicates thread migration.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/18.png"><img alt="../_images/18.png" src="../_images/18.png" style="width: 80%;" /></a>
</div>
<p>Additionally, notice that thread (TID:97097) was executing on a large number of CPU cores, indicating CPU migration. For example, this thread was executing on cpu_81, then migrated to cpu_14, then to cpu_5, and so on. Furthermore, note that this thread migrated back and forth across sockets many times, resulting in very inefficient memory access. For example, it executed on cpu_70 (NUMA node 0), then migrated to cpu_100 (NUMA node 1), then back to cpu_24 (NUMA node 0).</p>
<ol class="arabic simple" start="4">
<li><p>Non Uniform Memory Access Analysis</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/19.png"><img alt="../_images/19.png" src="../_images/19.png" style="width: 100%;" /></a>
</div>
<p>Comparing local vs. remote memory access over time, we observe that about half (51.09%) of the memory accesses were remote, indicating sub-optimal NUMA configuration.</p>
</div>
<div class="section" id="torch-set-num-threads-number-of-physical-cores-number-of-workers-no-core-pinning">
<h2>2. torch.set_num_threads = <code class="docutils literal notranslate"><span class="pre">number</span> <span class="pre">of</span> <span class="pre">physical</span> <span class="pre">cores</span> <span class="pre">/</span> <span class="pre">number</span> <span class="pre">of</span> <span class="pre">workers</span></code> (no core pinning)<a class="headerlink" href="#torch-set-num-threads-number-of-physical-cores-number-of-workers-no-core-pinning" title="Permalink to this headline">¶</a></h2>
<p>For an apples-to-apples comparison with the launcher's core pinning, we'll set the number of threads to the number of physical cores divided by the number of workers (the launcher does this internally). Add the following code snippet to the <a class="reference external" href="https://github.com/pytorch/serve/blob/master/ts/torch_handler/base_handler.py">base_handler</a>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">torch</span><span class="o">.</span><span class="n">set_num_threads</span><span class="p">(</span><span class="n">num_physical_cores</span><span class="o">/</span><span class="n">num_workers</span><span class="p">)</span>
</pre></div>
</div>
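<p>For illustration, a minimal sketch of how these values could be derived at runtime, assuming the <code class="docutils literal notranslate"><span class="pre">psutil</span></code> package is available and that <code class="docutils literal notranslate"><span class="pre">num_workers</span></code> matches the worker count configured in TorchServe:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import psutil
import torch

num_physical_cores = psutil.cpu_count(logical=False)  # e.g., 56 here
num_workers = 4  # assumed to match the configured TorchServe workers

torch.set_num_threads(num_physical_cores // num_workers)
print(torch.get_num_threads())  # e.g., 14
</pre></div>
</div>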
<p>As before, without core pinning these threads are not affinitized to specific CPU cores, so the operating system periodically schedules threads onto cores located in different sockets.</p>
<ol class="arabic simple">
<li><p>CPU usage</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/20.gif"><img alt="../_images/20.gif" src="../_images/20.gif" style="width: 100%;" /></a>
</div>
<p>4 main worker threads were launched, then each launched 14 threads (<code class="docutils literal notranslate"><span class="pre">num_physical_cores/num_workers</span></code>) on all cores, including logical cores.</p>
<ol class="arabic simple" start="2">
<li><p>Core Bound stalls</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/21.png"><img alt="../_images/21.png" src="../_images/21.png" style="width: 80%;" /></a>
</div>
<p>Although the percentage of Core Bound stalls has decreased from 88.4% to 73.5%, it is still very high.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/22.png"><img alt="../_images/22.png" src="../_images/22.png" style="width: 40%;" /></a>
</div>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/23.png"><img alt="../_images/23.png" src="../_images/23.png" style="width: 50%;" /></a>
</div>
<ol class="arabic simple" start="3">
<li><p>Thread Migration</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/24.png"><img alt="../_images/24.png" src="../_images/24.png" style="width: 75%;" /></a>
</div>
<p>Similar to before, without core pinning, thread (TID:94290) was executing on a large number of CPU cores, indicating CPU migration. We again notice cross-socket thread migration, resulting in very inefficient memory access. For example, this thread executed on cpu_78 (NUMA node 0), then migrated to cpu_108 (NUMA node 1).</p>
<ol class="arabic simple" start="4">
<li><p>Non Uniform Memory Access Analysis</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/25.png"><img alt="../_images/25.png" src="../_images/25.png" style="width: 100%;" /></a>
</div>
<p>Although this is an improvement over the original 51.09%, 40.45% of memory accesses are still remote, indicating sub-optimal NUMA configuration.</p>
</div>
<div class="section" id="launcher-core-pinning">
<h2>3. Launcher core pinning<a class="headerlink" href="#launcher-core-pinning" title="Permalink to this headline">¶</a></h2>
<p>The launcher will internally distribute physical cores equally among workers and bind them to each worker. As a reminder, the launcher by default uses physical cores only. In this example, the launcher will bind worker 0 to cores 0-13 (NUMA node 0), worker 1 to cores 14-27 (NUMA node 0), worker 2 to cores 28-41 (NUMA node 1), and worker 3 to cores 42-55 (NUMA node 1). Doing so ensures that cores do not overlap among workers and that logical cores are avoided.</p>
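<p>With the launcher integration in TorchServe >= 0.6.1, this behavior can be switched on from <code class="docutils literal notranslate"><span class="pre">config.properties</span></code>; a minimal sketch (additional launcher flags, if any, can be passed via <code class="docutils literal notranslate"><span class="pre">cpu_launcher_args</span></code>):</p>
<div class="highlight-none notranslate"><div class="highlight"><pre><span></span># Enable the launch script with its default per-worker core pinning
cpu_launcher_enable=true
</pre></div>
</div>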
<ol class="arabic simple">
<li><p>CPU usage</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/26.gif"><img alt="../_images/26.gif" src="../_images/26.gif" style="width: 100%;" /></a>
</div>
<p>4 main worker threads were launched, then each launched 14 threads (<code class="docutils literal notranslate"><span class="pre">num_physical_cores/num_workers</span></code>) affinitized to its assigned physical cores.</p>
<ol class="arabic simple" start="2">
<li><p>Core Bound stalls</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/27.png"><img alt="../_images/27.png" src="../_images/27.png" style="width: 80%;" /></a>
</div>
<p>Core Bound stalls have decreased significantly from the original 88.4% to 46.2% - almost a 2x improvement.</p>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/28.png"><img alt="../_images/28.png" src="../_images/28.png" style="width: 40%;" /></a>
</div>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/29.png"><img alt="../_images/29.png" src="../_images/29.png" style="width: 50%;" /></a>
</div>
<p>We verify that with core binding, most CPU time is effectively spent on compute, with a Spin Time of only 0.256s.</p>
<ol class="arabic simple" start="3">
<li><p>Thread Migration</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/30.png"><img alt="../_images/30.png" src="../_images/30.png" style="width: 100%;" /></a>
</div>
<p>We verify that <cite>OMP Primary Thread #0</cite> was bound to its assigned physical cores (42-55) and did not migrate across sockets.</p>
<ol class="arabic simple" start="4">
<li><p>Non Uniform Memory Access Analysis</p></li>
</ol>
<div class="figure align-center">
<a class="reference internal image-reference" href="../_images/31.png"><img alt="../_images/31.png" src="../_images/31.png" style="width: 100%;" /></a>
</div>
<p>Now almost all memory accesses (89.52%) are local.</p>
</div>
<div class="section" id="conclusion">
<h2>Conclusion<a class="headerlink" href="#conclusion" title="Permalink to this headline">¶</a></h2>
<p>In this blog, we've showcased that properly setting your CPU runtime configuration can significantly boost out-of-the-box CPU performance.</p>
<p>We have walked through some general CPU performance tuning principles and recommendations:</p>
<ul class="simple">
<li><p>In a hyperthreading enabled system, avoid logical cores by setting thread affinity to physical cores only via core pinning.</p></li>
<li><p>In a multi-socket system with NUMA, avoid cross-socket remote memory access by setting thread affinity to a specific socket via core pinning (see the sketch after this list).</p></li>
</ul>
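<p>As a minimal illustration of both recommendations, a process can be pinned by hand on Linux; the core ids below stand in for the first 14 physical cores on NUMA node 0 of the machine used here, and the launch script automates exactly this per worker:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import os

# Restrict the current process to physical cores 0-13 (NUMA node 0),
# avoiding both logical cores and cross-socket thread migration.
os.sched_setaffinity(0, set(range(14)))
print(os.sched_getaffinity(0))
</pre></div>
</div>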
<p>We have visually explained these ideas from first principles and have verified the performance boost with profiling. Finally, we have applied all of our learnings to TorchServe to boost out-of-the-box TorchServe CPU performance.</p>
<p>These principles can be automatically configured via an easy-to-use launch script, which has already been integrated into TorchServe.</p>
<p>For interested readers, please check out the following documents:</p>
<ul class="simple">
<li><p><a class="reference external" href="https://tutorials.pytorch.kr/recipes/recipes/tuning_guide.html#cpu-specific-optimizations">CPU specific optimizations</a></p></li>
<li><p><a class="reference external" href="https://www.intel.com/content/www/us/en/developer/articles/technical/how-to-get-better-performance-on-pytorchcaffe2-with-intel-acceleration.html">Maximize Performance of Intelยฎ Software Optimization for PyTorch* on CPU</a></p></li>
<li><p><a class="reference external" href="https://intel.github.io/intel-extension-for-pytorch/tutorials/performance_tuning/tuning_guide.html">Performance Tuning Guide</a></p></li>
<li><p><a class="reference external" href="https://intel.github.io/intel-extension-for-pytorch/tutorials/performance_tuning/launch_script.html">Launch Script Usage Guide</a></p></li>
<li><p><a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/vtune-cookbook/top/methodologies/top-down-microarchitecture-analysis-method.html">Top-down Microarchitecture Analysis Method</a></p></li>
<li><p><a class="reference external" href="https://oneapi-src.github.io/oneDNN/dev_guide_performance_settings.html#benchmarking-settings">Configuring oneDNN for Benchmarking</a></p></li>
<li><p><a class="reference external" href="https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html#gs.tcbgpa">Intelยฎ VTuneโข Profiler</a></p></li>
<li><p><a class="reference external" href="https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/top.html">Intelยฎ VTuneโข Profiler User Guide</a></p></li>
</ul>
<p>And stay tuned for follow-up posts on optimized kernels on CPU via <a class="reference external" href="https://github.com/intel/intel-extension-for-pytorch">Intel® Extension for PyTorch*</a> and advanced launcher configurations such as the memory allocator.</p>
</div>
<div class="section" id="acknowledgement">
<h2>Acknowledgement<a class="headerlink" href="#acknowledgement" title="Permalink to this headline">¶</a></h2>
<p>We would like to thank Ashok Emani (Intel) and Jiong Gong (Intel) for their immense guidance and support, and thorough feedback and reviews throughout many steps of this blog. We would also like to thank Hamid Shojanazeri (Meta), Li Ning (AWS) and Jing Xu (Intel) for helpful feedback in code review. And Suraj Subramanian (Meta) and Geeta Chauhan (Meta) for helpful feedback on the blog.</p>
</div>
</div>
</article>
</div>
<footer>
<div role="contentinfo">
<p>
© Copyright 2018-2024, PyTorch &amp; PyTorch Korea User Group.
</p>
</div>
<div>
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
</div>
</footer>
</div>
</div>
<div class="pytorch-content-right" id="pytorch-content-right">
<div class="pytorch-right-menu" id="pytorch-right-menu">
<div class="pytorch-side-scroll" id="pytorch-side-scroll-right">
<ul>
<li><a class="reference internal" href="#">Grokking PyTorch Intel CPU performance from first principles</a><ul>
<li><a class="reference internal" href="#avoid-logical-cores-for-deep-learning">Avoid logical cores for deep learning</a></li>
<li><a class="reference internal" href="#local-memory-access-is-always-faster-than-remote-memory-access">Local memory access is always faster than remote memory access</a></li>
<li><a class="reference internal" href="#efficient-cpu-usage-with-core-pinning-for-multi-worker-inference">Efficient CPU usage with core pinning for multi-worker inference</a></li>
<li><a class="reference internal" href="#default-torchserve-setting-no-core-pinning">1. Default TorchServe setting (no core pinning)</a></li>
<li><a class="reference internal" href="#torch-set-num-threads-number-of-physical-cores-number-of-workers-no-core-pinning">2. torch.set_num_threads = <code class="docutils literal notranslate"><span class="pre">number</span> <span class="pre">of</span> <span class="pre">physical</span> <span class="pre">cores</span> <span class="pre">/</span> <span class="pre">number</span> <span class="pre">of</span> <span class="pre">workers</span></code> (no core pinning)</a></li>
<li><a class="reference internal" href="#launcher-core-pinning">3. launcher core pinning</a></li>
<li><a class="reference internal" href="#conclusion">Conclusion</a></li>
<li><a class="reference internal" href="#acknowledgement">Acknowledgement</a></li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</section>
</div>
<script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script src="../_static/translations.js"></script>
<script src="../_static/katex.min.js"></script>
<script src="../_static/auto-render.min.js"></script>
<script src="../_static/katex_autorenderer.js"></script>
<script src="../_static/design-tabs.js"></script>
<script type="text/javascript" src="../_static/js/vendor/popper.min.js"></script>
<script type="text/javascript" src="../_static/js/vendor/bootstrap.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/list.js/1.5.0/list.min.js"></script>
<script type="text/javascript" src="../_static/js/theme.js"></script>
<script type="text/javascript">
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
<script>
// Helper function to make it easier to call dataLayer.push()
function gtag(){window.dataLayer.push(arguments);}
//add microsoft link
if(window.location.href.indexOf("/beginner/basics/")!= -1)
{
var url="https://docs.microsoft.com/learn/paths/pytorch-fundamentals/?wt.mc_id=aiml-7486-cxa";
switch(window.location.pathname.split("/").pop().replace('.html',''))
{
case"quickstart_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/9-quickstart?WT.mc_id=aiml-7486-cxa";
break;
case"tensorqs_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/2-tensors?WT.mc_id=aiml-7486-cxa";
break;
case"data_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/3-data?WT.mc_id=aiml-7486-cxa";
break;
case"transforms_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/4-transforms?WT.mc_id=aiml-7486-cxa";
break;
case"buildmodel_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/5-model?WT.mc_id=aiml-7486-cxa";
break;
case"autogradqs_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/6-autograd?WT.mc_id=aiml-7486-cxa";
break;
case"optimization_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/7-optimization?WT.mc_id=aiml-7486-cxa";
break;
case"saveloadrun_tutorial":
url="https://docs.microsoft.com/learn/modules/intro-machine-learning-pytorch/8-inference?WT.mc_id=aiml-7486-cxa";
}
$(".pytorch-call-to-action-links").children().first().before("<a href="+url+' data-behavior="call-to-action-event" data-response="Run in Microsoft Learn" target="_blank"><div id="microsoft-learn-link" style="padding-bottom: 0.625rem;border-bottom: 1px solid #f3f4f7;padding-right: 2.5rem;display: -webkit-box; display: -ms-flexbox; display: flex; -webkit-box-align: center;-ms-flex-align: center;align-items: center;"><img class="call-to-action-img" src="../../_static/images/microsoft-logo.svg"/><div class="call-to-action-desktop-view">Run in Microsoft Learn</div><div class="call-to-action-mobile-view">Learn</div></div></a>')
}
</script>
<script async src="https://www.googletagmanager.com/gtag/js?id=G-LZRD6GXDLF"></script>
<script data-cfasync="false">
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-LZRD6GXDLF'); // GA4
gtag('config', 'UA-71919972-3'); // UA
</script>
<script data-cfasync="false">
$("[data-behavior='call-to-action-event']").on('click', function(){
ga('send', {
hitType: 'event',
eventCategory: $(this).attr("data-response"),
eventAction: 'click',
eventLabel: window.location.href
});
gtag('event', 'click', {
'event_category': $(this).attr("data-response"),
'event_label': $("h1").first().text(),
'tutorial_link': window.location.href
});
});
$("[data-behavior='tutorial-rating']").on('click', function(){
gtag('event', 'click', {
'event_category': 'Tutorial Rating',
'event_label': $("h1").first().text(),
'value': $(this).attr("data-count"),
'customEvent:Rating': $(this).attr("data-count") // send to GA custom dimension customEvent:Rating.
});
});
if (location.pathname == "/") {
$(".rating-container").hide();
$(".hr-bottom").hide();
}
</script>
<script type="text/javascript">
var collapsedSections = ['ํ์ดํ ์น(PyTorch) ๋ ์ํผ', 'ํ์ดํ ์น(PyTorch) ๋ฐฐ์ฐ๊ธฐ', 'Introduction to PyTorch on YouTube', '์ด๋ฏธ์ง/๋น๋์ค', '์ค๋์ค', 'ํ
์คํธ', '๋ฐฑ์๋', '๊ฐํํ์ต', 'PyTorch ๋ชจ๋ธ์ ํ๋ก๋์
ํ๊ฒฝ์ ๋ฐฐํฌํ๊ธฐ', 'PyTorch ํ๋กํ์ผ๋ง', 'Code Transforms with FX', 'ํ๋ก ํธ์๋ API', 'PyTorch ํ์ฅํ๊ธฐ', '๋ชจ๋ธ ์ต์ ํ', '๋ณ๋ ฌ ๋ฐ ๋ถ์ฐ ํ์ต', 'Edge with ExecuTorch', '์ถ์ฒ ์์คํ
', 'Multimodality'];
</script>
<!-- Begin Footer -->
<div class="container-fluid docs-tutorials-resources" id="docs-tutorials-resources">
<div class="container">
<div class="row">
<div class="col-md-4 text-center">
<h2>PyTorchKorea @ GitHub</h2>
<p>ํ์ดํ ์น ํ๊ตญ ์ฌ์ฉ์ ๋ชจ์์ GitHub์์ ๋ง๋๋ณด์ธ์.</p>
<a class="with-right-arrow" href="https://github.com/PyTorchKorea" target="_blank">GitHub๋ก ์ด๋</a>
</div>
<div class="col-md-4 text-center">
<h2>ํ๊ตญ์ด ํํ ๋ฆฌ์ผ</h2>
<p>ํ๊ตญ์ด๋ก ๋ฒ์ญ ์ค์ธ PyTorch ํํ ๋ฆฌ์ผ์
๋๋ค.</p>
<a class="with-right-arrow" href="https://tutorials.pytorch.kr/">ํํ ๋ฆฌ์ผ๋ก ์ด๋</a>
</div>
<div class="col-md-4 text-center">
<h2>์ปค๋ฎค๋ํฐ</h2>
<p>๋ค๋ฅธ ์ฌ์ฉ์๋ค๊ณผ ์๊ฒฌ์ ๋๋๊ณ , ๋์์ฃผ์ธ์!</p>
<a class="with-right-arrow" href="https://discuss.pytorch.kr/">์ปค๋ฎค๋ํฐ๋ก ์ด๋</a>
</div>
</div>
</div>
</div>
<footer class="site-footer">
<div class="container footer-container">
<div class="footer-logo-wrapper">
<a href="https://pytorch.kr/" class="footer-logo"></a>
</div>
<div class="footer-links-wrapper">
<div class="footer-links-col">
<ul>
<li class="list-title"><a href="https://pytorch.kr/">ํ์ดํ ์น ํ๊ตญ ์ฌ์ฉ์ ๋ชจ์</a></li>
<li><a href="https://pytorch.kr//about">์ฌ์ฉ์ ๋ชจ์ ์๊ฐ</a></li>
<li><a href="https://pytorch.kr//about/contributors">๊ธฐ์ฌํด์ฃผ์ ๋ถ๋ค</a></li>
<li><a href="https://pytorch.kr//resources">๋ฆฌ์์ค</a></li>
<li><a href="https://pytorch.kr//coc">ํ๋ ๊ฐ๋ น</a></li>