From 854c19a151b36794675231147efb490ebe702dab Mon Sep 17 00:00:00 2001 From: Aaron Patterson Date: Mon, 3 May 2021 18:23:24 -0700 Subject: [PATCH 01/15] Cast these frame objects to ints This way we don't generate object ids or finalizers every time we're sampling the application --- ext/stackprof/stackprof.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ext/stackprof/stackprof.c b/ext/stackprof/stackprof.c index b1dfd864..ac267683 100644 --- a/ext/stackprof/stackprof.c +++ b/ext/stackprof/stackprof.c @@ -209,13 +209,19 @@ stackprof_stop(VALUE self) return Qtrue; } +#if SIZEOF_VOIDP == SIZEOF_LONG +# define PTR2NUM(x) (LONG2NUM((long)(x))) +#else +# define PTR2NUM(x) (LL2NUM((LONG_LONG)(x))) +#endif + static int frame_edges_i(st_data_t key, st_data_t val, st_data_t arg) { VALUE edges = (VALUE)arg; intptr_t weight = (intptr_t)val; - rb_hash_aset(edges, rb_obj_id((VALUE)key), INT2FIX(weight)); + rb_hash_aset(edges, PTR2NUM(key), INT2FIX(weight)); return ST_CONTINUE; } @@ -242,7 +248,7 @@ frame_i(st_data_t key, st_data_t val, st_data_t arg) VALUE name, file, edges, lines; VALUE line; - rb_hash_aset(results, rb_obj_id(frame), details); + rb_hash_aset(results, PTR2NUM(frame), details); if (FIXNUM_P(frame)) { name = _stackprof.fake_frame_names[FIX2INT(frame)]; @@ -322,7 +328,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self) rb_ary_push(raw_samples, SIZET2NUM(len)); for (o = 0, n++; o < len; n++, o++) - rb_ary_push(raw_samples, rb_obj_id(_stackprof.raw_samples[n])); + rb_ary_push(raw_samples, PTR2NUM(_stackprof.raw_samples[n])); rb_ary_push(raw_samples, SIZET2NUM((size_t)_stackprof.raw_samples[n])); } From 68ec8be33e02aca9db7545f81fdcdbcf093008b1 Mon Sep 17 00:00:00 2001 From: Masataka Pocke Kuwabara Date: Fri, 7 May 2021 01:04:25 +0900 Subject: [PATCH 02/15] Improve performance of `--d3-flamegraph` option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change improves performance of `--d3-flamegraph` option. Problem ==== `--d3-flamegraph` option is super slow if the stackprof's output file is large. With a 28MB file, it takes 198 sec. ```bash $ du -h stackprof-out 28M stackprof-out $ stackprof --d3-flamegraph stackprof-out > /dev/null # it takes 192.12s ``` Profiling === I profiled it with stackprof. Slicing array is the bottle neck. ``` $ stackprof stackprof-out | head ================================== Mode: wall(1000) Samples: 80265 (61.03% miss rate) GC: 381 (0.47%) ================================== TOTAL (pct) SAMPLES (pct) FRAME 79877 (99.5%) 78969 (98.4%) StackProf::Report#print_d3_flamegraph 902 (1.1%) 902 (1.1%) StackProf::Report#convert_to_d3_flame_graph_format 338 (0.4%) 338 (0.4%) (marking) 42 (0.1%) 42 (0.1%) (sweeping) $ stackprof stackprof-out -m print_d3_flamegraph StackProf::Report#print_d3_flamegraph (/home/pocke/.rbenv/versions/2.7.1/lib/ruby/gems/2.7.0/gems/stackprof-0.2.16/lib/stackprof/report.rb:231) samples: 78969 self (98.4%) / 79877 total (99.5%) callers: 79877 ( 100.0%) callees (908 total): 902 ( 99.3%) StackProf::Report#convert_to_d3_flame_graph_format 5 ( 0.6%) Kernel#require 1 ( 0.1%) #.generate code: | 231 | def print_d3_flamegraph(f=STDOUT, skip_common=true) | 232 | raise "profile does not include raw samples (add `raw: true` to collecting StackProf.run)" unless raw = data[:raw] | 233 | | 234 | stacks = [] | 235 | max_x = 0 | 236 | max_y = 0 27684 (34.5%) / 27684 (34.5%) | 237 | while len = raw.shift 4 (0.0%) / 4 (0.0%) | 238 | max_y = len if len > max_y 50020 (62.3%) / 50020 (62.3%) | 239 | stack = raw.slice!(0, len+1) | 240 | stacks << stack 64 (0.1%) / 64 (0.1%) | 241 | max_x += stack.last | 242 | end | 243 | | 244 | # d3-flame-grpah supports only alphabetical flamegraph 1195 (1.5%) / 1195 (1.5%) | 245 | stacks.sort! | 246 | 5 (0.0%) | 247 | require "json" 903 (1.1%) | 248 | json = JSON.generate(convert_to_d3_flame_graph_format("", stacks, 0), max_nesting: false) | 249 | | 250 | # This html code is almost copied from d3-flame-graph sample code. | 251 | # (Apache License 2.0) | 252 | # https://github.com/spiermar/d3-flame-graph/blob/gh-pages/index.html | 253 | 2 (0.0%) / 2 (0.0%) | 254 | f.print <<-END | 255 | ``` Solution === Do not mutate the array, but use an index for slicing instead. With this change, `stackprof --d3-flamegraph` command with the same stackprof's output takes only 2.33 sec, 82x faster🚀 --- lib/stackprof/report.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/stackprof/report.rb b/lib/stackprof/report.rb index 895f4627..7adc7e1a 100644 --- a/lib/stackprof/report.rb +++ b/lib/stackprof/report.rb @@ -234,9 +234,13 @@ def print_d3_flamegraph(f=STDOUT, skip_common=true) stacks = [] max_x = 0 max_y = 0 - while len = raw.shift + idx = 0 + + while len = raw[idx] + idx += 1 max_y = len if len > max_y - stack = raw.slice!(0, len+1) + stack = raw.slice(idx, len+1) + idx += len+1 stacks << stack max_x += stack.last end From fac668c93c20eaa2269bdd7f1e24ed965847a8ac Mon Sep 17 00:00:00 2001 From: Masataka Pocke Kuwabara Date: Fri, 7 May 2021 19:36:18 +0900 Subject: [PATCH 03/15] Extract `flamegraph_stacks` for print_flamegraph and print_d3_flamegraph --- lib/stackprof/report.rb | 77 +++++++++++------------------------------ 1 file changed, 21 insertions(+), 56 deletions(-) diff --git a/lib/stackprof/report.rb b/lib/stackprof/report.rb index 7adc7e1a..9b80f8ba 100644 --- a/lib/stackprof/report.rb +++ b/lib/stackprof/report.rb @@ -95,51 +95,10 @@ def print_alphabetical_flamegraph(f=STDOUT, skip_common=true) print_flamegraph(f, skip_common, true) end - StackCursor = Struct.new(:raw, :idx, :length) do - def weight - @weight ||= raw[1 + idx + length] - end - - def [](i) - if i >= length - nil - else - raw[1 + idx + i] - end - end - - def <=>(other) - i = 0 - while i < length && i < other.length - if self[i] != other[i] - return self[i] <=> other[i] - end - i += 1 - end - - return length <=> other.length - end - end - def print_flamegraph(f, skip_common, alphabetical=false) raise "profile does not include raw samples (add `raw: true` to collecting StackProf.run)" unless raw = data[:raw] - stacks = [] - max_x = 0 - max_y = 0 - - idx = 0 - loop do - len = raw[idx] - break unless len - max_y = len if len > max_y - - stack = StackCursor.new(raw, idx, len) - stacks << stack - max_x += stack.weight - - idx += len + 2 - end + stacks, max_x, max_y = flamegraph_stacks(raw) stacks.sort! if alphabetical @@ -150,7 +109,7 @@ def print_flamegraph(f, skip_common, alphabetical=false) x = 0 stacks.each do |stack| - weight = stack.weight + weight = stack.last cell = stack[y] unless y == stack.length-1 if cell.nil? @@ -191,6 +150,24 @@ def print_flamegraph(f, skip_common, alphabetical=false) f.puts '])' end + def flamegraph_stacks(raw) + stacks = [] + max_x = 0 + max_y = 0 + idx = 0 + + while len = raw[idx] + idx += 1 + max_y = len if len > max_y + stack = raw.slice(idx, len+1) + idx += len+1 + stacks << stack + max_x += stack.last + end + + return stacks, max_x, max_y + end + def flamegraph_row(f, x, y, weight, addr) frame = @data[:frames][addr] f.print ',' if @rows_started @@ -231,19 +208,7 @@ def convert_to_d3_flame_graph_format(name, stacks, depth) def print_d3_flamegraph(f=STDOUT, skip_common=true) raise "profile does not include raw samples (add `raw: true` to collecting StackProf.run)" unless raw = data[:raw] - stacks = [] - max_x = 0 - max_y = 0 - idx = 0 - - while len = raw[idx] - idx += 1 - max_y = len if len > max_y - stack = raw.slice(idx, len+1) - idx += len+1 - stacks << stack - max_x += stack.last - end + stacks, * = flamegraph_stacks(raw) # d3-flame-grpah supports only alphabetical flamegraph stacks.sort! From 3fb219c4c24bdd6807a7afbcaefc4c22f6f45073 Mon Sep 17 00:00:00 2001 From: David Judd Date: Mon, 5 Jul 2021 16:16:47 -0700 Subject: [PATCH 04/15] Use monotonic clock when available --- ext/stackprof/stackprof.c | 72 ++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 20 deletions(-) diff --git a/ext/stackprof/stackprof.c b/ext/stackprof/stackprof.c index ac267683..98d48572 100644 --- a/ext/stackprof/stackprof.c +++ b/ext/stackprof/stackprof.c @@ -13,10 +13,12 @@ #include #include #include +#include #include #define BUF_SIZE 2048 #define MICROSECONDS_IN_SECOND 1000000 +#define NANOSECONDS_IN_SECOND 1000000000 #define FAKE_FRAME_GC INT2FIX(0) #define FAKE_FRAME_MARK INT2FIX(1) @@ -38,6 +40,39 @@ static const char *fake_frame_cstrs[] = { #define TOTAL_FAKE_FRAMES (sizeof(fake_frame_cstrs) / sizeof(char *)) +#ifdef _POSIX_MONOTONIC_CLOCK + #define timestamp_t timespec + typedef struct timestamp_t timestamp_t; + + static void capture_timestamp(timestamp_t *ts) { + clock_gettime(CLOCK_MONOTONIC, ts); + } + + static int64_t delta_usec(timestamp_t *start, timestamp_t *end) { + int64_t result = 1000 * (end->tv_sec - start->tv_sec); + if (end->tv_nsec < start->tv_nsec) { + result -= 1000; + result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000; + } else { + result += (end->tv_nsec - start->tv_nsec) / 1000; + } + return result; + } +#else + #define timestamp_t timeval + typedef struct timestamp_t timestamp_t; + + static void capture_timestamp(timestamp_t *ts) { + gettimeofday(ts, NULL); + } + + static int64_t delta_usec(timestamp_t *start, timestamp_t *end) { + struct timeval diff; + timersub(end, start, &diff); + return (1000 * diff.tv_sec) + diff.tv_usec; + } +#endif + typedef struct { size_t total_samples; size_t caller_samples; @@ -62,7 +97,7 @@ static struct { size_t raw_samples_capa; size_t raw_sample_index; - struct timeval last_sample_at; + struct timestamp_t last_sample_at; int *raw_timestamp_deltas; size_t raw_timestamp_deltas_len; size_t raw_timestamp_deltas_capa; @@ -174,7 +209,7 @@ stackprof_start(int argc, VALUE *argv, VALUE self) _stackprof.out = out; if (raw) { - gettimeofday(&_stackprof.last_sample_at, NULL); + capture_timestamp(&_stackprof.last_sample_at); } return Qtrue; @@ -430,7 +465,7 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment) } void -stackprof_record_sample_for_stack(int num, int timestamp_delta) +stackprof_record_sample_for_stack(int num, int64_t timestamp_delta) { int i, n; VALUE prev_frame = Qnil; @@ -501,8 +536,9 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta) _stackprof.raw_timestamp_deltas = realloc(_stackprof.raw_timestamp_deltas, sizeof(int) * _stackprof.raw_timestamp_deltas_capa); } - /* Store the time delta (which is the amount of time between samples) */ - _stackprof.raw_timestamp_deltas[_stackprof.raw_timestamp_deltas_len++] = timestamp_delta; + /* Store the time delta (which is the amount of microseconds between samples). + * Since the sampling interval can't be more than a second, this should be safe to cast. */ + _stackprof.raw_timestamp_deltas[_stackprof.raw_timestamp_deltas_len++] = (int) timestamp_delta; } for (i = 0; i < num; i++) { @@ -535,21 +571,19 @@ stackprof_record_sample_for_stack(int num, int timestamp_delta) } if (_stackprof.raw) { - gettimeofday(&_stackprof.last_sample_at, NULL); + capture_timestamp(&_stackprof.last_sample_at); } } void stackprof_record_sample() { - int timestamp_delta = 0; + int64_t timestamp_delta = 0; int num; if (_stackprof.raw) { - struct timeval t; - struct timeval diff; - gettimeofday(&t, NULL); - timersub(&t, &_stackprof.last_sample_at, &diff); - timestamp_delta = (1000 * diff.tv_sec) + diff.tv_usec; + struct timestamp_t t; + capture_timestamp(&t); + timestamp_delta = delta_usec(&t, &_stackprof.last_sample_at); } num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer); stackprof_record_sample_for_stack(num, timestamp_delta); @@ -558,17 +592,15 @@ stackprof_record_sample() void stackprof_record_gc_samples() { - int delta_to_first_unrecorded_gc_sample = 0; - int i; + int64_t delta_to_first_unrecorded_gc_sample = 0; + size_t i; if (_stackprof.raw) { - struct timeval t; - struct timeval diff; - gettimeofday(&t, NULL); - timersub(&t, &_stackprof.last_sample_at, &diff); + struct timestamp_t t; + capture_timestamp(&t); // We don't know when the GC samples were actually marked, so let's // assume that they were marked at a perfectly regular interval. - delta_to_first_unrecorded_gc_sample = (1000 * diff.tv_sec + diff.tv_usec) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval); + delta_to_first_unrecorded_gc_sample = delta_usec(&t, &_stackprof.last_sample_at) - (_stackprof.unrecorded_gc_samples - 1) * NUM2LONG(_stackprof.interval); if (delta_to_first_unrecorded_gc_sample < 0) { delta_to_first_unrecorded_gc_sample = 0; } @@ -576,7 +608,7 @@ stackprof_record_gc_samples() for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) { - int timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval); + int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval); if (_stackprof.unrecorded_gc_marking_samples) { _stackprof.frames_buffer[0] = FAKE_FRAME_MARK; From d730dd15737042f0ebfc817a42e52569a9bc11ef Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Fri, 27 Aug 2021 11:48:06 +0200 Subject: [PATCH 05/15] Fix timestamp math for platforms without _POSIX_MONOTONIC_CLOCK Pretty much a rebase https://github.com/tmm1/stackprof/pull/122 --- ext/stackprof/stackprof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/stackprof/stackprof.c b/ext/stackprof/stackprof.c index 98d48572..06b40679 100644 --- a/ext/stackprof/stackprof.c +++ b/ext/stackprof/stackprof.c @@ -69,7 +69,7 @@ static const char *fake_frame_cstrs[] = { static int64_t delta_usec(timestamp_t *start, timestamp_t *end) { struct timeval diff; timersub(end, start, &diff); - return (1000 * diff.tv_sec) + diff.tv_usec; + return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec; } #endif From 1b607bc263a9162cf4fb51c28d1e138e6313fac9 Mon Sep 17 00:00:00 2001 From: Josh Glendenning Date: Sat, 28 Aug 2021 16:26:03 -0500 Subject: [PATCH 06/15] Make test_cputime more reliable It seems the order of the frames hash values in test_cputime is somewhat nondeterministic, at least with Ruby > 3. Most of the time the "StackProfTest#math" frame will be at index 1, but sometimes it shows up at index 0, like it does with Ruby < 3. This commit updates the test to accept it in either position. --- test/test_stackprof.rb | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/test_stackprof.rb b/test/test_stackprof.rb index 8df46c6d..efb5a007 100644 --- a/test/test_stackprof.rb +++ b/test/test_stackprof.rb @@ -78,9 +78,14 @@ def test_cputime end assert_operator profile[:samples], :>=, 1 - offset = RUBY_VERSION >= '3' ? 1 : 0 - frame = profile[:frames].values[offset] - assert_includes frame[:name], "StackProfTest#math" + if RUBY_VERSION >= '3' + assert profile[:frames].values.take(2).map { |f| + f[:name].include? "StackProfTest#math" + }.any? + else + frame = profile[:frames].values.first + assert_includes frame[:name], "StackProfTest#math" + end end def test_walltime From 4f7ee11d5f78870732b7ee6c92053e3432833655 Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Tue, 7 Sep 2021 11:20:48 +0100 Subject: [PATCH 07/15] Fix timestamp math for platforms with _POSIX_MONOTONIC_CLOCK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hello, hello! 🎉 First the disclaimer (I guess it may be relevant): I work at @DataDog on profiling for the [ddtrace](https://github.com/DataDog/dd-trace-rb) gem, although we don't use `rb_profile_frames` (but it is quite interesting). I noticed that the fix for #122 in #163 actually missed the new `_POSIX_MONOTONIC_CLOCK` branch, where the math was still incorrect (used 1000 instead of 1000000 when attempting to convert seconds to microseconds). cc @djudd @casperisfine --- ext/stackprof/stackprof.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/stackprof/stackprof.c b/ext/stackprof/stackprof.c index 06b40679..ffeb7911 100644 --- a/ext/stackprof/stackprof.c +++ b/ext/stackprof/stackprof.c @@ -49,9 +49,9 @@ static const char *fake_frame_cstrs[] = { } static int64_t delta_usec(timestamp_t *start, timestamp_t *end) { - int64_t result = 1000 * (end->tv_sec - start->tv_sec); + int64_t result = MICROSECONDS_IN_SECOND * (end->tv_sec - start->tv_sec); if (end->tv_nsec < start->tv_nsec) { - result -= 1000; + result -= MICROSECONDS_IN_SECOND; result += (NANOSECONDS_IN_SECOND + end->tv_nsec - start->tv_nsec) / 1000; } else { result += (end->tv_nsec - start->tv_nsec) / 1000; From c859db92a3a563320eca2445999f7916d1af5211 Mon Sep 17 00:00:00 2001 From: Josh Glendenning Date: Thu, 26 Aug 2021 13:33:38 -0500 Subject: [PATCH 08/15] Fix infinite realloc loop when rb_profile_frames returns 0 Sometimes rb_profile_frames returns 0. If this happens before raw_samples is allocated for the first time, stackprof_record_sample_for_stack can enter an infinite loop as it tries to realloc raw_samples by doubling raw_samples_capa, which is 0 when num is 0. --- ext/stackprof/stackprof.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/stackprof/stackprof.c b/ext/stackprof/stackprof.c index ffeb7911..9191276c 100644 --- a/ext/stackprof/stackprof.c +++ b/ext/stackprof/stackprof.c @@ -472,7 +472,7 @@ stackprof_record_sample_for_stack(int num, int64_t timestamp_delta) _stackprof.overall_samples++; - if (_stackprof.raw) { + if (_stackprof.raw && num > 0) { int found = 0; /* If there's no sample buffer allocated, then allocate one. The buffer From 930d198deb428db04178a4616afec6b52c9f8ba6 Mon Sep 17 00:00:00 2001 From: David Judd Date: Sun, 7 Nov 2021 20:51:37 -0800 Subject: [PATCH 09/15] Record sample timestamps in :raw mode --- ext/stackprof/stackprof.c | 84 ++++++++++++++++++++++++--------------- test/test_stackprof.rb | 14 +++++++ 2 files changed, 67 insertions(+), 31 deletions(-) diff --git a/ext/stackprof/stackprof.c b/ext/stackprof/stackprof.c index ffeb7911..c50bc4ce 100644 --- a/ext/stackprof/stackprof.c +++ b/ext/stackprof/stackprof.c @@ -58,6 +58,10 @@ static const char *fake_frame_cstrs[] = { } return result; } + + static uint64_t timestamp_usec(timestamp_t *ts) { + return (MICROSECONDS_IN_SECOND * ts->tv_sec) + (ts->tv_nsec / 1000); + } #else #define timestamp_t timeval typedef struct timestamp_t timestamp_t; @@ -71,6 +75,10 @@ static const char *fake_frame_cstrs[] = { timersub(end, start, &diff); return (MICROSECONDS_IN_SECOND * diff.tv_sec) + diff.tv_usec; } + + static uint64_t timestamp_usec(timestamp_t *ts) { + return (MICROSECONDS_IN_SECOND * ts.tv_sec) + diff.tv_usec + } #endif typedef struct { @@ -81,6 +89,11 @@ typedef struct { st_table *lines; } frame_data_t; +typedef struct { + uint64_t timestamp_usec; + int64_t delta_usec; +} sample_time_t; + static struct { int running; int raw; @@ -98,9 +111,9 @@ static struct { size_t raw_sample_index; struct timestamp_t last_sample_at; - int *raw_timestamp_deltas; - size_t raw_timestamp_deltas_len; - size_t raw_timestamp_deltas_capa; + sample_time_t *raw_sample_times; + size_t raw_sample_times_len; + size_t raw_sample_times_capa; size_t overall_signals; size_t overall_samples; @@ -119,7 +132,7 @@ static struct { static VALUE sym_object, sym_wall, sym_cpu, sym_custom, sym_name, sym_file, sym_line; static VALUE sym_samples, sym_total_samples, sym_missed_samples, sym_edges, sym_lines; static VALUE sym_version, sym_mode, sym_interval, sym_raw, sym_metadata, sym_frames, sym_ignore_gc, sym_out; -static VALUE sym_aggregate, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping; +static VALUE sym_aggregate, sym_raw_sample_timestamps, sym_raw_timestamp_deltas, sym_state, sym_marking, sym_sweeping; static VALUE sym_gc_samples, objtracer; static VALUE gc_hook; static VALUE rb_mStackProf; @@ -355,7 +368,7 @@ stackprof_results(int argc, VALUE *argv, VALUE self) if (_stackprof.raw && _stackprof.raw_samples_len) { size_t len, n, o; - VALUE raw_timestamp_deltas; + VALUE raw_sample_timestamps, raw_timestamp_deltas; VALUE raw_samples = rb_ary_new_capa(_stackprof.raw_samples_len); for (n = 0; n < _stackprof.raw_samples_len; n++) { @@ -375,17 +388,20 @@ stackprof_results(int argc, VALUE *argv, VALUE self) rb_hash_aset(results, sym_raw, raw_samples); - raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_timestamp_deltas_len); + raw_sample_timestamps = rb_ary_new_capa(_stackprof.raw_sample_times_len); + raw_timestamp_deltas = rb_ary_new_capa(_stackprof.raw_sample_times_len); - for (n = 0; n < _stackprof.raw_timestamp_deltas_len; n++) { - rb_ary_push(raw_timestamp_deltas, INT2FIX(_stackprof.raw_timestamp_deltas[n])); + for (n = 0; n < _stackprof.raw_sample_times_len; n++) { + rb_ary_push(raw_sample_timestamps, ULL2NUM(_stackprof.raw_sample_times[n].timestamp_usec)); + rb_ary_push(raw_timestamp_deltas, LL2NUM(_stackprof.raw_sample_times[n].delta_usec)); } - free(_stackprof.raw_timestamp_deltas); - _stackprof.raw_timestamp_deltas = NULL; - _stackprof.raw_timestamp_deltas_len = 0; - _stackprof.raw_timestamp_deltas_capa = 0; + free(_stackprof.raw_sample_times); + _stackprof.raw_sample_times = NULL; + _stackprof.raw_sample_times_len = 0; + _stackprof.raw_sample_times_capa = 0; + rb_hash_aset(results, sym_raw_sample_timestamps, raw_sample_timestamps); rb_hash_aset(results, sym_raw_timestamp_deltas, raw_timestamp_deltas); _stackprof.raw = 0; @@ -465,7 +481,7 @@ st_numtable_increment(st_table *table, st_data_t key, size_t increment) } void -stackprof_record_sample_for_stack(int num, int64_t timestamp_delta) +stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t timestamp_delta) { int i, n; VALUE prev_frame = Qnil; @@ -524,21 +540,23 @@ stackprof_record_sample_for_stack(int num, int64_t timestamp_delta) } /* If there's no timestamp delta buffer, allocate one */ - if (!_stackprof.raw_timestamp_deltas) { - _stackprof.raw_timestamp_deltas_capa = 100; - _stackprof.raw_timestamp_deltas = malloc(sizeof(int) * _stackprof.raw_timestamp_deltas_capa); - _stackprof.raw_timestamp_deltas_len = 0; + if (!_stackprof.raw_sample_times) { + _stackprof.raw_sample_times_capa = 100; + _stackprof.raw_sample_times = malloc(sizeof(sample_time_t) * _stackprof.raw_sample_times_capa); + _stackprof.raw_sample_times_len = 0; } /* Double the buffer size if it's too small */ - while (_stackprof.raw_timestamp_deltas_capa <= _stackprof.raw_timestamp_deltas_len + 1) { - _stackprof.raw_timestamp_deltas_capa *= 2; - _stackprof.raw_timestamp_deltas = realloc(_stackprof.raw_timestamp_deltas, sizeof(int) * _stackprof.raw_timestamp_deltas_capa); + while (_stackprof.raw_sample_times_capa <= _stackprof.raw_sample_times_len + 1) { + _stackprof.raw_sample_times_capa *= 2; + _stackprof.raw_sample_times = realloc(_stackprof.raw_sample_times, sizeof(sample_time_t) * _stackprof.raw_sample_times_capa); } - /* Store the time delta (which is the amount of microseconds between samples). - * Since the sampling interval can't be more than a second, this should be safe to cast. */ - _stackprof.raw_timestamp_deltas[_stackprof.raw_timestamp_deltas_len++] = (int) timestamp_delta; + /* Store the time delta (which is the amount of microseconds between samples). */ + _stackprof.raw_sample_times[_stackprof.raw_sample_times_len++] = (sample_time_t) { + .timestamp_usec = sample_timestamp, + .delta_usec = timestamp_delta, + }; } for (i = 0; i < num; i++) { @@ -578,25 +596,29 @@ stackprof_record_sample_for_stack(int num, int64_t timestamp_delta) void stackprof_record_sample() { + uint64_t start_timestamp = 0; int64_t timestamp_delta = 0; int num; if (_stackprof.raw) { struct timestamp_t t; capture_timestamp(&t); + start_timestamp = timestamp_usec(&t); timestamp_delta = delta_usec(&t, &_stackprof.last_sample_at); } num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer); - stackprof_record_sample_for_stack(num, timestamp_delta); + stackprof_record_sample_for_stack(num, start_timestamp, timestamp_delta); } void stackprof_record_gc_samples() { int64_t delta_to_first_unrecorded_gc_sample = 0; + uint64_t start_timestamp = 0; size_t i; if (_stackprof.raw) { struct timestamp_t t; capture_timestamp(&t); + start_timestamp = timestamp_usec(&t); // We don't know when the GC samples were actually marked, so let's // assume that they were marked at a perfectly regular interval. @@ -606,7 +628,6 @@ stackprof_record_gc_samples() } } - for (i = 0; i < _stackprof.unrecorded_gc_samples; i++) { int64_t timestamp_delta = i == 0 ? delta_to_first_unrecorded_gc_sample : NUM2LONG(_stackprof.interval); @@ -617,7 +638,7 @@ stackprof_record_gc_samples() _stackprof.lines_buffer[1] = 0; _stackprof.unrecorded_gc_marking_samples--; - stackprof_record_sample_for_stack(2, timestamp_delta); + stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta); } else if (_stackprof.unrecorded_gc_sweeping_samples) { _stackprof.frames_buffer[0] = FAKE_FRAME_SWEEP; _stackprof.lines_buffer[0] = 0; @@ -626,11 +647,11 @@ stackprof_record_gc_samples() _stackprof.unrecorded_gc_sweeping_samples--; - stackprof_record_sample_for_stack(2, timestamp_delta); + stackprof_record_sample_for_stack(2, start_timestamp, timestamp_delta); } else { _stackprof.frames_buffer[0] = FAKE_FRAME_GC; _stackprof.lines_buffer[0] = 0; - stackprof_record_sample_for_stack(1, timestamp_delta); + stackprof_record_sample_for_stack(1, start_timestamp, timestamp_delta); } } _stackprof.during_gc += _stackprof.unrecorded_gc_samples; @@ -780,6 +801,7 @@ Init_stackprof(void) S(mode); S(interval); S(raw); + S(raw_sample_timestamps); S(raw_timestamp_deltas); S(out); S(metadata); @@ -802,9 +824,9 @@ Init_stackprof(void) _stackprof.raw_samples_capa = 0; _stackprof.raw_sample_index = 0; - _stackprof.raw_timestamp_deltas = NULL; - _stackprof.raw_timestamp_deltas_len = 0; - _stackprof.raw_timestamp_deltas_capa = 0; + _stackprof.raw_sample_times = NULL; + _stackprof.raw_sample_times_len = 0; + _stackprof.raw_sample_times_capa = 0; _stackprof.empty_string = rb_str_new_cstr(""); rb_global_variable(&_stackprof.empty_string); diff --git a/test/test_stackprof.rb b/test/test_stackprof.rb index efb5a007..bb6e4201 100644 --- a/test/test_stackprof.rb +++ b/test/test_stackprof.rb @@ -126,19 +126,33 @@ def test_custom end def test_raw + before_monotonic = Process.clock_gettime(Process::CLOCK_MONOTONIC, :microsecond) + profile = StackProf.run(mode: :custom, raw: true) do 10.times do StackProf.sample end end + after_monotonic = Process.clock_gettime(Process::CLOCK_MONOTONIC, :microsecond) + raw = profile[:raw] assert_equal 10, raw[-1] assert_equal raw[0] + 2, raw.size offset = RUBY_VERSION >= '3' ? -3 : -2 assert_includes profile[:frames][raw[offset]][:name], 'StackProfTest#test_raw' + + assert_equal 10, profile[:raw_sample_timestamps].size + profile[:raw_sample_timestamps].each_cons(2) do |t1, t2| + assert_operator t1, :>, before_monotonic + assert_operator t2, :>=, t1 + assert_operator t2, :<, after_monotonic + end + assert_equal 10, profile[:raw_timestamp_deltas].size + total_duration = after_monotonic - before_monotonic + assert_operator profile[:raw_timestamp_deltas].sum, :<, total_duration end def test_metadata From cf58a9b7c5c5e0b78bad352d88348a8082533116 Mon Sep 17 00:00:00 2001 From: David Judd Date: Mon, 8 Nov 2021 09:28:36 -0800 Subject: [PATCH 10/15] Support Rubies without Array#sum in test --- test/test_stackprof.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_stackprof.rb b/test/test_stackprof.rb index bb6e4201..73c2a217 100644 --- a/test/test_stackprof.rb +++ b/test/test_stackprof.rb @@ -152,7 +152,7 @@ def test_raw assert_equal 10, profile[:raw_timestamp_deltas].size total_duration = after_monotonic - before_monotonic - assert_operator profile[:raw_timestamp_deltas].sum, :<, total_duration + assert_operator profile[:raw_timestamp_deltas].inject(&:+), :<, total_duration end def test_metadata From 4fc71fbf97ab8f1748ff04c4e09e43dc7dcc7ebb Mon Sep 17 00:00:00 2001 From: Maple Ong Date: Fri, 28 Jan 2022 11:56:50 -0500 Subject: [PATCH 11/15] Remove unused variable --- test/test_stackprof.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_stackprof.rb b/test/test_stackprof.rb index 73c2a217..4f12e806 100644 --- a/test/test_stackprof.rb +++ b/test/test_stackprof.rb @@ -224,7 +224,6 @@ def test_gc end end - raw = profile[:raw] gc_frame = profile[:frames].values.find{ |f| f[:name] == "(garbage collection)" } marking_frame = profile[:frames].values.find{ |f| f[:name] == "(marking)" } sweeping_frame = profile[:frames].values.find{ |f| f[:name] == "(sweeping)" } From b2f5f7af0434805c1e17bbfbb853639c25dc4b17 Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Mon, 14 Feb 2022 20:23:48 -0800 Subject: [PATCH 12/15] Add include for ruby/version.h This is necessary for RUBY_API_VERSION_MAJOR, at least in Ruby 3.1 --- ext/stackprof/stackprof.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ext/stackprof/stackprof.c b/ext/stackprof/stackprof.c index 6471f162..dc1d65c9 100644 --- a/ext/stackprof/stackprof.c +++ b/ext/stackprof/stackprof.c @@ -7,6 +7,7 @@ **********************************************************************/ #include +#include #include #include #include From 7a8e4cad387970075ad4531ad014d5f99e164921 Mon Sep 17 00:00:00 2001 From: John Hawthorn Date: Mon, 14 Feb 2022 23:52:44 -0800 Subject: [PATCH 13/15] Avoid unsafe mallocs inside signals In 6c3f4d7c5234cb3274c4a624ea6cb52c26be3833 we started recording signals immediately on receiving a signal, after observing that in Ruby 3.0+ the CFP structure was safe to read. This resulted in much more accurate measurements. Unfortunately though rb_profile_frames seems to be safe to call inside a signal handler, stackprof_record_sample_for_stack is not, since it calls malloc (itself or through st_*) which is not async-signal-safe. This could potentially result in deadlocks or memory corruption depending on the malloc implementation. This commit attempts to restore the safety of the postponed job approach while keeping the accuracy of measuring immediately. Inside the signal handler we record the frames to a buffer, then enqueue a postponed job to do the actual recording. --- ext/stackprof/stackprof.c | 68 ++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/ext/stackprof/stackprof.c b/ext/stackprof/stackprof.c index dc1d65c9..21e65b09 100644 --- a/ext/stackprof/stackprof.c +++ b/ext/stackprof/stackprof.c @@ -126,6 +126,9 @@ static struct { VALUE fake_frame_names[TOTAL_FAKE_FRAMES]; VALUE empty_string; + + int buffer_count; + sample_time_t buffer_time; VALUE frames_buffer[BUF_SIZE]; int lines_buffer[BUF_SIZE]; } _stackprof; @@ -594,9 +597,17 @@ stackprof_record_sample_for_stack(int num, uint64_t sample_timestamp, int64_t ti } } +// buffer the current profile frames +// This must be async-signal-safe +// Returns immediately if another set of frames are already in the buffer void -stackprof_record_sample() +stackprof_buffer_sample(void) { + if (_stackprof.buffer_count > 0) { + // Another sample is already pending + return; + } + uint64_t start_timestamp = 0; int64_t timestamp_delta = 0; int num; @@ -606,12 +617,16 @@ stackprof_record_sample() start_timestamp = timestamp_usec(&t); timestamp_delta = delta_usec(&t, &_stackprof.last_sample_at); } + num = rb_profile_frames(0, sizeof(_stackprof.frames_buffer) / sizeof(VALUE), _stackprof.frames_buffer, _stackprof.lines_buffer); - stackprof_record_sample_for_stack(num, start_timestamp, timestamp_delta); + + _stackprof.buffer_count = num; + _stackprof.buffer_time.timestamp_usec = start_timestamp; + _stackprof.buffer_time.delta_usec = timestamp_delta; } void -stackprof_record_gc_samples() +stackprof_record_gc_samples(void) { int64_t delta_to_first_unrecorded_gc_sample = 0; uint64_t start_timestamp = 0; @@ -661,20 +676,47 @@ stackprof_record_gc_samples() _stackprof.unrecorded_gc_sweeping_samples = 0; } +// record the sample previously buffered by stackprof_buffer_sample static void -stackprof_gc_job_handler(void *data) +stackprof_record_buffer(void) +{ + stackprof_record_sample_for_stack(_stackprof.buffer_count, _stackprof.buffer_time.timestamp_usec, _stackprof.buffer_time.delta_usec); + + // reset the buffer + _stackprof.buffer_count = 0; +} + +static void +stackprof_sample_and_record(void) +{ + stackprof_buffer_sample(); + stackprof_record_buffer(); +} + +static void +stackprof_job_record_gc(void *data) { if (!_stackprof.running) return; stackprof_record_gc_samples(); } +#ifdef USE_POSTPONED_JOB +static void +stackprof_job_sample_and_record(void *data) +{ + if (!_stackprof.running) return; + + stackprof_sample_and_record(); +} +#endif + static void -stackprof_job_handler(void *data) +stackprof_job_record_buffer(void *data) { if (!_stackprof.running) return; - stackprof_record_sample(); + stackprof_record_buffer(); } static void @@ -696,12 +738,16 @@ stackprof_signal_handler(int sig, siginfo_t *sinfo, void *ucontext) _stackprof.unrecorded_gc_sweeping_samples++; } _stackprof.unrecorded_gc_samples++; - rb_postponed_job_register_one(0, stackprof_gc_job_handler, (void*)0); + rb_postponed_job_register_one(0, stackprof_job_record_gc, (void*)0); } else { #ifdef USE_POSTPONED_JOB - rb_postponed_job_register_one(0, stackprof_job_handler, (void*)0); + rb_postponed_job_register_one(0, stackprof_job_sample_and_record, (void*)0); #else - stackprof_job_handler(0); + // Buffer a sample immediately, if an existing sample exists this will + // return immediately + stackprof_buffer_sample(); + // Enqueue a job to record the sample + rb_postponed_job_register_one(0, stackprof_job_record_buffer, (void*)0); #endif } pthread_mutex_unlock(&lock); @@ -713,7 +759,7 @@ stackprof_newobj_handler(VALUE tpval, void *data) _stackprof.overall_signals++; if (RTEST(_stackprof.interval) && _stackprof.overall_signals % NUM2LONG(_stackprof.interval)) return; - stackprof_job_handler(0); + stackprof_sample_and_record(); } static VALUE @@ -723,7 +769,7 @@ stackprof_sample(VALUE self) return Qfalse; _stackprof.overall_signals++; - stackprof_job_handler(0); + stackprof_sample_and_record(); return Qtrue; } From 0e24173708c8c348dad9dab78c4bfc08bcd281cc Mon Sep 17 00:00:00 2001 From: Aaron Patterson Date: Wed, 16 Feb 2022 15:25:29 -0800 Subject: [PATCH 14/15] bump version --- lib/stackprof.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stackprof.rb b/lib/stackprof.rb index fc49e532..4aec0bb0 100644 --- a/lib/stackprof.rb +++ b/lib/stackprof.rb @@ -1,7 +1,7 @@ require "stackprof/stackprof" module StackProf - VERSION = '0.2.17' + VERSION = '0.2.18' end StackProf.autoload :Report, "stackprof/report.rb" From b8c9c6a5a369b641d858949c69a0d9bdec4e084f Mon Sep 17 00:00:00 2001 From: Aaron Patterson Date: Wed, 16 Feb 2022 15:26:05 -0800 Subject: [PATCH 15/15] bump version --- stackprof.gemspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stackprof.gemspec b/stackprof.gemspec index 6e4c566c..cd1a4c6e 100644 --- a/stackprof.gemspec +++ b/stackprof.gemspec @@ -1,6 +1,6 @@ Gem::Specification.new do |s| s.name = 'stackprof' - s.version = '0.2.17' + s.version = '0.2.18' s.homepage = 'http://github.com/tmm1/stackprof' s.authors = 'Aman Gupta'