-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.html
1252 lines (789 loc) · 147 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html>
<head><meta name="generator" content="Hexo 3.8.0">
<meta charset="utf-8">
<meta name="renderer" content="webkit">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<link rel="dns-prefetch" href="http://yoursite.com">
<title>梅溪先生</title>
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<meta property="og:type" content="website">
<meta property="og:title" content="梅溪先生">
<meta property="og:url" content="http://yoursite.com/index.html">
<meta property="og:site_name" content="梅溪先生">
<meta property="og:locale" content="default">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="梅溪先生">
<link rel="alternative" href="/atom.xml" title="梅溪先生" type="application/atom+xml">
<link rel="icon" href="/favicon.png">
<link rel="stylesheet" type="text/css" href="/./main.0cf68a.css">
<style type="text/css">
#container.show {
background: linear-gradient(200deg,#a0cfe4,#e8c37e);
}
</style>
</head>
</html>
<body>
<div id="container" q-class="show:isCtnShow">
<!-- Empty canvas with id/class "anm-canvas". Nothing in this chunk draws on it;
     NOTE(review): presumably a theme script renders an animation here (confirm). -->
<canvas id="anm-canvas" class="anm-canvas"></canvas>
<!--
  Sidebar column (.left-col): avatar, author name, main menu, the two
  slider triggers and the social links.

  q-class / q-on are binding attributes read by a script that is not part of
  this chunk; they are kept verbatim. The two slider triggers keep their
  javascript:void(0) hrefs for the same reason.
  NOTE(review): those triggers would be better as button elements, but the
  theme CSS/JS selectors are not visible here.

  Fixes:
    * The avatar image now has alt text (it is the only content of a link).
    * Links opened with target="_blank" carry rel="noopener", matching the
      external links inside the post bodies on this page.
    * Icon-font glyphs are hidden from assistive technology; the link name comes
      from aria-label instead of title alone.

  NOTE(review): id="header" is also used inside #mobile-nav on this page, so it
  is not document-unique. Left untouched because stylesheet selectors may depend on it.
-->
<div class="left-col" q-class="show:isShow">
  <div class="overlay" style="background: #4d4d4d"></div>
  <div class="intrude-less">
    <header id="header" class="inner">
      <a href="/" class="profilepic">
        <img src="/images/head.jpg" class="js-avatar" alt="梅溪的头像">
      </a>
      <hgroup>
        <h1 class="header-author"><a href="/">梅溪</a></h1>
      </hgroup>
      <nav class="header-menu">
        <ul>
          <li><a href="/">主页</a></li>
          <li><a href="/tags/大数据">大数据</a></li>
        </ul>
      </nav>
      <nav class="header-smart-menu">
        <a q-on="click: openSlider(e, 'innerArchive')" href="javascript:void(0)">所有文章</a>
        <a q-on="click: openSlider(e, 'aboutme')" href="javascript:void(0)">关于我</a>
      </nav>
      <nav class="header-nav">
        <div class="social">
          <a class="zhihu" target="_blank" rel="noopener" href="https://www.zhihu.com/people/zhuo-er-bu-lang-de/activities" title="zhihu" aria-label="zhihu"><i class="icon-zhihu" aria-hidden="true"></i></a>
          <a class="mail" target="_blank" rel="noopener" href="mailto:zhengyt@dtdream.com" title="mail" aria-label="mail"><i class="icon-mail" aria-hidden="true"></i></a>
        </div>
      </nav>
    </header>
  </div>
</div>
<div class="mid-col" q-class="show:isShow,hide:isShow|isFalse">
<!--
  Navigation block #mobile-nav: overlay, slider trigger button area, avatar,
  author name, social links, a two-item menu and the mask element.

  q-on / q-show are binding attributes read by a script that is not part of
  this chunk; they are kept verbatim.
  NOTE(review): .slider-trigger is a div acting as a button. Converting it needs
  the theme's CSS/JS, which is not visible here.

  Fixes:
    * The avatar image now has alt text.
    * Links opened with target="_blank" carry rel="noopener", matching the
      external links inside the post bodies on this page.
    * Icon-font glyphs are hidden from assistive technology; the link name comes
      from aria-label instead of title alone.

  NOTE(review): id="header" is also used in the .left-col sidebar on this page,
  so it is not document-unique. Left untouched because stylesheet selectors may
  depend on it.
-->
<nav id="mobile-nav">
  <div class="overlay js-overlay" style="background: #4d4d4d"></div>
  <div class="btnctn js-mobile-btnctn">
    <div class="slider-trigger list" q-on="click: openSlider(e)"><i class="icon icon-sort" aria-hidden="true"></i></div>
  </div>
  <div class="intrude-less">
    <header id="header" class="inner">
      <div class="profilepic">
        <img src="/images/head.jpg" class="js-avatar" alt="梅溪的头像">
      </div>
      <hgroup>
        <h1 class="header-author js-header-author">梅溪</h1>
      </hgroup>
      <nav class="header-nav">
        <div class="social">
          <a class="zhihu" target="_blank" rel="noopener" href="https://www.zhihu.com/people/zhuo-er-bu-lang-de/activities" title="zhihu" aria-label="zhihu"><i class="icon-zhihu" aria-hidden="true"></i></a>
          <a class="mail" target="_blank" rel="noopener" href="mailto:zhengyt@dtdream.com" title="mail" aria-label="mail"><i class="icon-mail" aria-hidden="true"></i></a>
        </div>
      </nav>
      <nav class="header-menu js-header-menu">
        <ul style="width: 50%">
          <li style="width: 50%"><a href="/">主页</a></li>
          <li style="width: 50%"><a href="/tags/大数据">大数据</a></li>
        </ul>
      </nav>
    </header>
  </div>
  <div class="mobile-mask" style="display:none" q-show="isShow"></div>
</nav>
<div id="wrapper" class="body-wrap">
<div class="menu-l">
<!-- Wrapper around a second canvas (#myCanvas1). Its data-* attributes carry a
     colour (#eaeaea), a section height (100) and the id of the content element
     (js-content, the post list container that follows).
     Attribute names in HTML are lower-cased by the parser, so the mixed-case
     names below are only reachable as data-sectionheight / data-contentid.
     NOTE(review): confirm how the consuming script reads them; it is not in this chunk. -->
<div class="canvas-wrap">
<canvas data-colors="#eaeaea" data-sectionHeight="100" data-contentId="js-content" id="myCanvas1" class="anm-canvas"></canvas>
</div>
<div id="js-content" class="content-ll">
<!--
  Index-page teaser for the post "我的2018" (2018-12-26): title and date links,
  the excerpt, one tag and the read-more link.

  Fix: the lead image had no alt attribute, so screen readers fell back to
  announcing the file name. The excerpt never refers to the picture, so it is
  marked decorative with alt="".
  NOTE(review): replace with a real description if the photo carries meaning.
-->
<article id="post-2018-12-26-my2018" class="article article-type-post article-index" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2018/12/26/2018-12-26-my2018/">我的2018</a>
      </h1>
      <a href="/2018/12/26/2018-12-26-my2018/" class="archive-article-date">
        <time datetime="2018-12-26T15:55:47.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2018-12-26</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <p><img src="/images/river.jpg" alt="" width="75%" height="75%"></p>
      <p>2018年,我想通了一些事情,于是我不再那么焦虑了,甚至有些开心。</p>
      <p>我降低了对自己的要求,包袱不再那么重,不再妄图做圣人,用变态的规则约束自己,更加遵从自己的内心从事;</p>
      <p>我降低了对他人的要求,学会了宽容,理解每个人的局限,对团队的小小进步保持欣喜;</p>
      <p>我学会了关心和照顾他人,像一个成年人那样,这是我早该学会的;</p>
      <p>我逐渐摆脱了酒精对我的吸引力,更自由更放松;有规律地健身,更有力量,更健康。</p>
      <p>新的一年即将到来,不立flag,不写目标。就这样保持下去挺好。</p>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a href="javascript:void(0)" class="js-tag article-tag-list-link color3">生活</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2018/12/26/2018-12-26-my2018/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the post "SparkSQL Adaptive Execution 简介" (2018-12-25):
  title and date links, the excerpt up to the "more" link, two tags and the
  read-more link. Markup is re-indented only; text and attributes are unchanged.
-->
<article class="article article-type-post article-index" id="post-2018-12-25-adaptive_exec" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2018/12/25/2018-12-25-adaptive_exec/">SparkSQL Adaptive Execution 简介</a>
      </h1>
      <a class="archive-article-date" href="/2018/12/25/2018-12-25-adaptive_exec/">
        <time datetime="2018-12-25T01:55:00.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2018-12-25</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <h1 id="SparkSQL-Adaptive-Execution-简介"><a href="#SparkSQL-Adaptive-Execution-简介" class="headerlink" title="SparkSQL Adaptive Execution 简介"></a>SparkSQL Adaptive Execution 简介</h1>
      <p>Spark SQL 拥有友好的编程接口和卓越的性能,广受大数据处理从业者的欢迎。但是在典型的shuffle场景比如SQL Join场景中,Spark SQL仍然遭遇了不少性能和稳定性问题。</p>
      <p>为此,社区和Intel/Baidu的同学先后两次提出了Adaptive Execution的修改议案。本博客将就相关修改做出说明。</p>
      <p>两次修改的JIRA:</p>
      <ul>
        <li><a href="https://issues.apache.org/jira/browse/SPARK-9850" target="_blank" rel="noopener">https://issues.apache.org/jira/browse/SPARK-9850</a> 2015年提出。</li>
        <li><a href="https://issues.apache.org/jira/browse/SPARK-23128" target="_blank" rel="noopener">https://issues.apache.org/jira/browse/SPARK-23128</a> 2018年提出。</li>
      </ul>
      <p>由于第一次的修改自从2016年后就没有实质进展,本文以第二次修改提案为准。</p>
      <p>本文假设读者对Spark的基本运行原理已经有所了解。</p>
      <h2 id="当前存在的问题"><a href="#当前存在的问题" class="headerlink" title="当前存在的问题"></a>当前存在的问题</h2>
      <p>我们先想一想,除了数据量大之外,Spark SQL处理数据的特点是什么呢?</p>
      <ul>
        <li>多是第一手数据。毕竟Spark往往是在做ETL,是数据加工的第一站;</li>
        <li>数据来源丰富。一般是来自于Kafka,HDFS,Hive。也可能来自于HBase和RMDB。</li>
        <li>迭代计算过程可以很复杂,也可能结合较多的业务相关UDF。</li>
      </ul>
      <p>这些特点带来的问题是:</p>
      <ul>
        <li>数据质量不高,容易有数据倾斜等问题;</li>
        <li>Spark本身不挑数据源,反过来讲,spark是从各种数据源加载数据,本身对于索引,数据分布状况等信息不敏感(虽然也有简单的CBO机制);</li>
        <li>数据经过处理后的输出情况可能很复杂,可能数据量比预期的小很多,也可能膨胀很多。</li>
      </ul>
      <p>后两点带来的共同问题是:Spark在计算前难以准确估计加工后的数据大小,会给执行计划的优化带来很多困难。也就是说,虽然Spark SQL在map阶段读入数据时,还能够以一种相对还算的平均(大体平均,具体和文件数目大小相关)切分数据的方式划分任务集,但是一旦投入计算后,经历shuffle阶段时,就容易出现一些任务划分不合理或者说是性能上的瓶颈。</p>
      <p>解决这些问题需要Spark SQL具备根据计算过程中的数据输出情况做动态调整,这就是AE的”Adaptive”的由来。下面我们就来看AE实际解决的具体问题。</p>
      <a class="article-more-a" href="/2018/12/25/2018-12-25-adaptive_exec/#more">more >></a>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a class="js-tag article-tag-list-link color4" href="javascript:void(0)">大数据</a>
          </li>
          <li class="article-tag-list-item">
            <a class="js-tag article-tag-list-link color1" href="javascript:void(0)">Spark</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2018/12/25/2018-12-25-adaptive_exec/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the post "全球运维技术大会参会报告" (2018-11-19): title and
  date links, the excerpt up to the "more" link, one tag and the read-more link.

  Fix: the second image (mine.png) had no alt attribute. The sentence just
  before it says it is the author's hand-drawn picture of the overall trend,
  so the alt text states exactly that.
-->
<article id="post-2018-11-19-全球运维技术大会" class="article article-type-post article-index" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2018/11/19/2018-11-19-全球运维技术大会/">全球运维技术大会参会报告</a>
      </h1>
      <a href="/2018/11/19/2018-11-19-全球运维技术大会/" class="archive-article-date">
        <time datetime="2018-11-19T03:00:00.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2018-11-19</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <p>今日参加了全球运维技术大会。本文是参会作业。</p>
      <p><a href="https://cnutcon2018.geekbang.org/" target="_blank" rel="noopener">https://cnutcon2018.geekbang.org/</a></p>
      <p><img src="/images/cnutcon.png" alt="CNUTCON"></p>
      <h2 id="听运维大会我听什么"><a href="#听运维大会我听什么" class="headerlink" title="听运维大会我听什么"></a>听运维大会我听什么</h2>
      <p>我的关注点:</p>
      <ul>
        <li>1)运维技术的整体态势;</li>
        <li>2)业界其他公司在运维方面的组织架构;</li>
        <li>3)具体技术比如监控、日志,容器环境等的技术选型。</li>
        <li>4)大公司复杂系统的构建、部署过程。</li>
      </ul>
      <p>二 运维技术的整体态势</p>
      <p>听完两天的会议内容后我手绘一张图,展现了整体的趋势:</p>
      <p><img src="/images/mine.png" alt="手绘的运维技术整体趋势图" width="55%" height="55%"></p>
      <a class="article-more-a" href="/2018/11/19/2018-11-19-全球运维技术大会/#more">more >></a>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a href="javascript:void(0)" class="js-tag article-tag-list-link color3">运维</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2018/11/19/2018-11-19-全球运维技术大会/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the second post of the Hadoop/Spark platform series
  (2018-09-29): title and date links, the excerpt up to the "more" link, one
  tag and the read-more link. Markup is re-indented only; text and attributes
  are unchanged.
-->
<article class="article article-type-post article-index" id="post-2018-09-29-基于Hadoop和Spark的企业级大数据平台实践2" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2018/09/29/2018-09-29-基于Hadoop和Spark的企业级大数据平台实践2/">基于Hadoop和Spark的企业级大数据平台实践之二--租户和安全认证</a>
      </h1>
      <a class="archive-article-date" href="/2018/09/29/2018-09-29-基于Hadoop和Spark的企业级大数据平台实践2/">
        <time datetime="2018-09-29T01:55:00.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2018-09-29</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <p>之前在 <a href="/2017/03/29/2017-03-29-基于Hadoop和Spark的企业级大数据平台实践/" title="[基于Hadoop和Spark的企业级大数据平台实践1]">[基于Hadoop和Spark的企业级大数据平台实践1]</a> 中简单介绍了我们数据产品首个版本的最基本架构。本文是该系列第二篇,重点介绍关于大数据平台的多租户和安全建设。</p>
      <h2 id="租户划分"><a href="#租户划分" class="headerlink" title="租户划分"></a>租户划分</h2>
      <p>租户的划分可以有不同的粒度。</p>
      <ul>
        <li>在云上,一种最简单粗暴的租户划分方式是每用户一个虚拟集群。这也是一种最省心的划分方式。借助于云基础设施的虚拟化能力,创建不同的虚拟集群,可以做到完美的隔离;</li>
        <li>另一条路是像MaxCompute这样的产品,单一集群要为阿里巴巴集团无数人员提供服务,租户限定到具体project,每project内有非常细粒度的权限控制。</li>
      </ul>
      <p>我们产品暂时对于云没有那么强的亲和性(当然后续我们也要拥抱云),而且我们不只是提供最基本的平台组件,而是要基于平台生长出更多大数据能力,所以我们在租户定义上更多参考了MaxCompute的设计。</p>
      <p>我们的租户划分有两个维度:</p>
      <ul>
        <li>计算资源</li>
        <li>工作空间</li>
      </ul>
      <a class="article-more-a" href="/2018/09/29/2018-09-29-基于Hadoop和Spark的企业级大数据平台实践2/#more">more >></a>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a class="js-tag article-tag-list-link color4" href="javascript:void(0)">大数据</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2018/09/29/2018-09-29-基于Hadoop和Spark的企业级大数据平台实践2/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the first post of the Hadoop/Spark platform series
  (2017-03-29): title and date links, the excerpt up to the "more" link, one
  tag and the read-more link. Markup is re-indented only; text and attributes
  are unchanged.
-->
<article class="article article-type-post article-index" id="post-2017-03-29-基于Hadoop和Spark的企业级大数据平台实践" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2017/03/29/2017-03-29-基于Hadoop和Spark的企业级大数据平台实践/">基于Spark 的企业级大数据平台建设实践(一)</a>
      </h1>
      <a class="archive-article-date" href="/2017/03/29/2017-03-29-基于Hadoop和Spark的企业级大数据平台实践/">
        <time datetime="2017-03-29T01:55:00.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2017-03-29</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <p>本文简述我司大数据产品的早期建设过程。我们基于Hadoop和Spark,提供了和Hive相同界面的SQL引擎,完整的数据交换链路,以及带图形化的大数据工作平台。</p>
      <p>本篇是系列的第一篇——离线计算系统的建立,重点放在SQL上。</p>
      <h2 id="需求和场景"><a href="#需求和场景" class="headerlink" title="需求和场景"></a>需求和场景</h2>
      <p>我们是一家不大不小的创业公司,缺少互联网大厂的人力物力资源和业务场景,在技术上要求有如下特点。</p>
      <ul>
        <li>满足大数据应用的需求,可能会来自于不同行业,定制化程度高;</li>
        <li>平台开放性,更容易形成合作。要同时考虑开源生态圈和不同行业ISV;</li>
        <li>小型化,弹性扩容;</li>
        <li>便于输出,向非技术背景客户提供助力。特别要求部署简单,使用门槛尽可能低;</li>
      </ul>
      <a class="article-more-a" href="/2017/03/29/2017-03-29-基于Hadoop和Spark的企业级大数据平台实践/#more">more >></a>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a class="js-tag article-tag-list-link color4" href="javascript:void(0)">大数据</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2017/03/29/2017-03-29-基于Hadoop和Spark的企业级大数据平台实践/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the post "Python On Spark" (2017-03-14): title and date
  links, the excerpt up to the "more" link, two tags, one category and the
  read-more link.

  Fix: the category link pointed at /categories/大数据// with a doubled trailing
  slash, which is a different URL path from the category index and does not
  resolve on every host. It now ends in a single slash.
-->
<article id="post-2017-03-14-python-spark" class="article article-type-post article-index" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2017/03/14/2017-03-14-python-spark/">Python On Spark</a>
      </h1>
      <a href="/2017/03/14/2017-03-14-python-spark/" class="archive-article-date">
        <time datetime="2017-03-14T05:02:16.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2017-03-14</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <h2 id="Python-数据分析人员的热门语言"><a href="#Python-数据分析人员的热门语言" class="headerlink" title="Python: 数据分析人员的热门语言"></a>Python: 数据分析人员的热门语言</h2>
      <p>如果你关心数据圈子,你会发现大部分的数据分析人员现在都会点python。</p>
      <p>Python是一种非常友好的语言,虽然对于很多程序员而言无类型让人觉得缺少安全感,但是python的简洁和优雅还是让很多人还是大声喊出了: Life is short, use python!</p>
      <p>Spark在设计上就考虑了多语言的支持,除了Scala之外,也提供了java和python的编程接口。</p>
      <p>Python On Spark有两种执行方式,一种是pySpark,这是Spark提供的类似于python shell的界面,用户可以通过它交互执行python代码操作数据。另一种方式是用户给定py脚本,提交运行。<br>
        <a class="article-more-a" href="/2017/03/14/2017-03-14-python-spark/#more">more >></a>
      </p>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a href="javascript:void(0)" class="js-tag article-tag-list-link color1">Spark</a>
          </li>
          <li class="article-tag-list-item">
            <a href="javascript:void(0)" class="js-tag article-tag-list-link color2">python</a>
          </li>
        </ul>
      </div>
      <div class="article-category tagcloud">
        <i class="icon-book icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a href="/categories/大数据/" class="article-tag-list-link color4">大数据</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2017/03/14/2017-03-14-python-spark/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the post about a slow Spark task caused by a regular
  expression (2017-03-06): title and date links, the excerpt up to the "more"
  link, two tags and the read-more link.

  Fixes:
    * The image's inline style said "mergin:5px", which is not a CSS property
      and was silently dropped; it now reads "margin:5px".
    * The image had no alt attribute. The alt text repeats what the preceding
      sentence says the picture shows.
      NOTE(review): confirm against the actual screenshot.
-->
<article id="post-2017-03-03-正则之锅-生产环境中的spark慢任务实例" class="article article-type-post article-index" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2017/03/06/2017-03-03-正则之锅-生产环境中的spark慢任务实例/">正则表达式之锅 —— 一个生产环境中的spark慢任务实例</a>
      </h1>
      <a href="/2017/03/06/2017-03-03-正则之锅-生产环境中的spark慢任务实例/" class="archive-article-date">
        <time datetime="2017-03-06T08:01:46.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2017-03-06</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <h2 id="慢任务"><a href="#慢任务" class="headerlink" title="慢任务"></a>慢任务</h2>
      <p>那天我们的安全数据分析员小Z告诉我,他的一个计算量并不大的Spark SQL作业跑了3个小时还没有结束(最终是6小时后运行结束),很不正常。</p>
      <p>我看了一下,基本现象如下:</p>
      <p>当前的stage的任务规模是59,58个任务早已结束(分钟级别),只剩一个在运行;</p>
      <p><img style="margin:5px;" src="/images/task-long59.PNG" alt="当前stage的任务运行情况:59个任务中只剩一个在运行"></p>
      <a class="article-more-a" href="/2017/03/06/2017-03-03-正则之锅-生产环境中的spark慢任务实例/#more">more >></a>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a href="javascript:void(0)" class="js-tag article-tag-list-link color4">大数据</a>
          </li>
          <li class="article-tag-list-item">
            <a href="javascript:void(0)" class="js-tag article-tag-list-link color1">Spark</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2017/03/06/2017-03-03-正则之锅-生产环境中的spark慢任务实例/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the post "JAVA OOM问题的定位" (2016-07-26): title and
  date links, the excerpt up to the "more" link, one tag and the read-more
  link. Markup is re-indented only; text and attributes are unchanged.
-->
<article class="article article-type-post article-index" id="post-2016-07-26-java_oom" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2016/07/26/2016-07-26-java_oom/">JAVA OOM问题的定位</a>
      </h1>
      <a class="archive-article-date" href="/2016/07/26/2016-07-26-java_oom/">
        <time datetime="2016-07-26T08:01:46.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2016-07-26</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <h1 id="JAVA-OOM-问题的定位"><a href="#JAVA-OOM-问题的定位" class="headerlink" title="JAVA OOM 问题的定位"></a>JAVA OOM 问题的定位</h1>
      <p>那天,我们的spark集群又出了幺蛾子,一条SQL导致了大面积的Executor OOM。</p>
      <p>OOM 就是OutOfMemory,出现这个信息的后果通常都很严重。如果确实是内存不够倒也还罢了,通过配置参数调调内存也就好了。就怕是程序有bug,比如内存泄露之类,那就需要花点功夫去定位了。</p>
      <p>幸运的是,Java是一门非常易用的语言,内存问题定位起来相对简单。下面我们就用这个问题,展示一下JAVA OOM问题的定位吧。<br>
        <a class="article-more-a" href="/2016/07/26/2016-07-26-java_oom/#more">more >></a>
      </p>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a class="js-tag article-tag-list-link color5" href="javascript:void(0)">Java</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2016/07/26/2016-07-26-java_oom/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the post "Spark 从JSON加载数据" (2016-06-06): title and
  date links, the excerpt up to the "more" link, two tags and the read-more
  link. Markup is re-indented only; text and attributes are unchanged.
-->
<article class="article article-type-post article-index" id="post-2016-06-06-Spark_load_json" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2016/06/06/2016-06-06-Spark_load_json/">Spark 从JSON加载数据</a>
      </h1>
      <a class="archive-article-date" href="/2016/06/06/2016-06-06-Spark_load_json/">
        <time datetime="2016-06-06T08:01:46.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2016-06-06</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <h1 id="spark-load-json"><a href="#spark-load-json" class="headerlink" title="spark load json"></a>spark load json</h1>
      <p>JSON(JavaScript Object Notation) 是一种轻量级的数据交换格式,在互联网公司尤为常见。但是JSON并不能视为一种严格的结构化数据,要想导入其他系统如Hive需要一些额外的解析转换的编码工作。好在我们现在有Spark,它的DataFrame接口让我们可以更方便地处理JSON格式的数据。</p>
      <p>本文将以scala代码展示Spark 1.6.1上如何加载并处理JSON格式的数据。<br>
        <a class="article-more-a" href="/2016/06/06/2016-06-06-Spark_load_json/#more">more >></a>
      </p>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a class="js-tag article-tag-list-link color4" href="javascript:void(0)">大数据</a>
          </li>
          <li class="article-tag-list-item">
            <a class="js-tag article-tag-list-link color1" href="javascript:void(0)">Spark</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2016/06/06/2016-06-06-Spark_load_json/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Index-page teaser for the post "Spark on Ubuntu" (2016-03-02): title and date
  links, the excerpt up to the "more" link, one tag and the read-more link.
  Markup is re-indented only; text and attributes are unchanged.
-->
<article class="article article-type-post article-index" id="post-2016-03-02-Spark_deploy" itemscope itemprop="blogPost">
  <div class="article-inner">
    <header class="article-header">
      <h1 itemprop="name">
        <a class="article-title" href="/2016/03/02/2016-03-02-Spark_deploy/">Spark on Ubuntu</a>
      </h1>
      <a class="archive-article-date" href="/2016/03/02/2016-03-02-Spark_deploy/">
        <time datetime="2016-03-02T07:11:46.000Z" itemprop="datePublished"><i class="icon-calendar icon"></i>2016-03-02</time>
      </a>
    </header>
    <div class="article-entry" itemprop="articleBody">
      <h1 id="Spark-deploy"><a href="#Spark-deploy" class="headerlink" title="Spark deploy"></a>Spark deploy</h1>
      <p>前一篇文中我们部署了一个具备HDFS HA和YARN HA的Hadoop小集群。</p>
      <p>这次我们在这套环境上继续部署一套Spark引擎。部署过程相当简单。<br>
        <a class="article-more-a" href="/2016/03/02/2016-03-02-Spark_deploy/#more">more >></a>
      </p>
    </div>
    <div class="article-info article-info-index">
      <div class="article-tag tagcloud">
        <i class="icon-price-tags icon"></i>
        <ul class="article-tag-list">
          <li class="article-tag-list-item">
            <a class="js-tag article-tag-list-link color4" href="javascript:void(0)">大数据</a>
          </li>
        </ul>
      </div>
      <p class="article-more-link">
        <a class="article-more-a" href="/2016/03/02/2016-03-02-Spark_deploy/">展开全文 >></a>
      </p>
      <div class="clearfix"></div>
    </div>
  </div>
</article>
<!--
  Side-operation widget: a link with class mod-side-operation__jump-to-top and a
  plane icon. The container is rendered hidden (display:none); whatever reveals
  it is not visible in this chunk.
  NOTE(review): this widget is repeated after every post on the page, so the ids
  js-jump-container and js-jump-plan-container are not document-unique. Confirm
  which copy the script targets before changing them.
-->
<aside class="wrap-side-operation">
  <div class="mod-side-operation">
    <div class="jump-container" id="js-jump-container" style="display:none;">
      <a class="mod-side-operation__jump-to-top" href="javascript:void(0)">
        <i class="icon-font icon-back"></i>
      </a>
      <div class="jump-plan-container" id="js-jump-plan-container" style="top: -11px;">
        <i class="icon-font icon-plane jump-plane"></i>
      </div>
    </div>
  </div>
</aside>
<!--
  Pagination for the post list: current page marker, a link to page 2 and a
  "Next" link.

  Fixes: the landmark is labelled so it can be told apart from the other nav
  elements on the page, and the current page is exposed with
  aria-current="page" instead of by class name alone.
  The three inline items stay on one line with no whitespace between them,
  exactly as generated, so their rendered spacing does not change.
-->
<nav id="page-nav" aria-label="分页">
<span class="page-number current" aria-current="page">1</span><a class="page-number" href="/page/2/">2</a><a class="extend next" rel="next" href="/page/2/">Next »</a>
</nav>