Skip to content

Commit 64d8948

Browse files
noamher authored and jreback committed
Refactor groupby group_prod, group_var, group_mean, group_ohlc (#25249)
1 parent ec5f911 commit 64d8948

File tree

3 files changed

+216
-218
lines changed

3 files changed

+216
-218
lines changed

pandas/_libs/groupby.pyx

+214-3
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,10 @@ def group_any_all(uint8_t[:] out,
382382
if values[i] == flag_val:
383383
out[lab] = flag_val
384384

385+
# ----------------------------------------------------------------------
386+
# group_add, group_prod, group_var, group_mean, group_ohlc
387+
# ----------------------------------------------------------------------
388+
385389

386390
@cython.wraparound(False)
387391
@cython.boundscheck(False)
@@ -396,9 +400,9 @@ def _group_add(floating[:, :] out,
396400
cdef:
397401
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
398402
floating val, count
399-
ndarray[floating, ndim=2] sumx, nobs
403+
floating[:, :] sumx, nobs
400404

401-
if not len(values) == len(labels):
405+
if len(values) != len(labels):
402406
raise AssertionError("len(index) != len(labels)")
403407

404408
nobs = np.zeros_like(out)
@@ -407,7 +411,6 @@ def _group_add(floating[:, :] out,
407411
N, K = (<object>values).shape
408412

409413
with nogil:
410-
411414
for i in range(N):
412415
lab = labels[i]
413416
if lab < 0:
@@ -433,5 +436,213 @@ def _group_add(floating[:, :] out,
433436
group_add_float32 = _group_add['float']
434437
group_add_float64 = _group_add['double']
435438

439+
440+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_prod(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=0):
    """
    Compute the per-group product of ``values``, column by column.

    Only aggregates on axis=0.

    Parameters
    ----------
    out : floating[:, :]
        Result buffer, written in place as ``out[group, column]`` for the
        first ``len(counts)`` groups.
    counts : int64_t[:]
        Incremented in place once per row whose label is non-negative.
        A row is counted even if some of its entries are NaN.
    values : floating[:, :]
        Input data; row ``i`` belongs to group ``labels[i]``.
    labels : const int64_t[:]
        Group index of each row of ``values``. Rows with a negative label
        are skipped entirely.
    min_count : Py_ssize_t, default 0
        Minimum number of non-NaN observations a (group, column) cell must
        have; cells with fewer observations are set to NaN.

    Raises
    ------
    AssertionError
        If ``values`` and ``labels`` have different lengths.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val
        floating[:, :] prodx, nobs

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    # nobs counts the non-NaN observations per cell; prodx holds the running
    # product and starts at the multiplicative identity.
    nobs = np.zeros_like(out)
    prodx = np.ones_like(out)

    N, K = (<object>values).shape

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan (NaN is the only value that differs from itself)
                if val == val:
                    nobs[lab, j] += 1
                    prodx[lab, j] *= val

        for i in range(ncounts):
            for j in range(K):
                if nobs[i, j] < min_count:
                    out[i, j] = NAN
                else:
                    out[i, j] = prodx[i, j]


# Concrete specializations of the fused-type implementation.
group_prod_float32 = _group_prod['float']
group_prod_float64 = _group_prod['double']
488+
489+
490+
@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
def _group_var(floating[:, :] out,
               int64_t[:] counts,
               floating[:, :] values,
               const int64_t[:] labels,
               Py_ssize_t min_count=-1):
    """
    Compute the per-group sample variance of ``values``, column by column.

    Only aggregates on axis=0. The sum of squared deviations is accumulated
    in a single pass with Welford's online update and finally divided by
    ``n - 1``, where ``n`` is the number of non-NaN observations in the cell.

    Parameters
    ----------
    out : floating[:, :]
        Result buffer. It is zeroed first, used as the accumulator for the
        squared deviations, and finally holds the variance as
        ``out[group, column]``. Cells with fewer than two non-NaN
        observations are set to NaN.
    counts : int64_t[:]
        Incremented in place once per row whose label is non-negative.
    values : floating[:, :]
        Input data; row ``i`` belongs to group ``labels[i]``.
    labels : const int64_t[:]
        Group index of each row of ``values``. Rows with a negative label
        are skipped entirely.
    min_count : Py_ssize_t, default -1
        Not supported here; must be left at -1.

    Raises
    ------
    AssertionError
        If ``min_count`` is not -1, or if ``values`` and ``labels`` have
        different lengths.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, ct, oldmean
        floating[:, :] nobs, mean

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    # nobs counts the non-NaN observations per cell; mean is the running mean.
    nobs = np.zeros_like(out)
    mean = np.zeros_like(out)

    N, K = (<object>values).shape

    # out doubles as the accumulator, so it must start from zero.
    out[:, :] = 0.0

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1

            for j in range(K):
                val = values[i, j]

                # not nan (NaN is the only value that differs from itself)
                if val == val:
                    # nobs is incremented before the division below, so the
                    # divisor is at least 1 even with cdivision enabled.
                    nobs[lab, j] += 1
                    oldmean = mean[lab, j]
                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)

        for i in range(ncounts):
            for j in range(K):
                ct = nobs[i, j]
                if ct < 2:
                    # the sample variance is undefined for fewer than two
                    # observations
                    out[i, j] = NAN
                else:
                    out[i, j] /= (ct - 1)


# Concrete specializations of the fused-type implementation.
group_var_float32 = _group_var['float']
group_var_float64 = _group_var['double']
544+
545+
546+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_mean(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
    """
    Compute the per-group mean of ``values``, column by column.

    Only aggregates on axis=0. NaN entries are ignored.

    Parameters
    ----------
    out : floating[:, :]
        Result buffer, written in place as ``out[group, column]`` for the
        first ``len(counts)`` groups. Cells without any non-NaN observation
        are set to NaN.
    counts : int64_t[:]
        Incremented in place once per row whose label is non-negative.
    values : floating[:, :]
        Input data; row ``i`` belongs to group ``labels[i]``.
    labels : const int64_t[:]
        Group index of each row of ``values``. Rows with a negative label
        are skipped entirely.
    min_count : Py_ssize_t, default -1
        Not supported here; must be left at -1.

    Raises
    ------
    AssertionError
        If ``min_count`` is not -1, or if ``values`` and ``labels`` have
        different lengths.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, count
        floating[:, :] sumx, nobs

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")

    # nobs counts the non-NaN observations per cell; sumx is their sum.
    nobs = np.zeros_like(out)
    sumx = np.zeros_like(out)

    N, K = (<object>values).shape

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]
                # not nan (NaN is the only value that differs from itself)
                if val == val:
                    nobs[lab, j] += 1
                    sumx[lab, j] += val

        for i in range(ncounts):
            for j in range(K):
                count = nobs[i, j]
                if count == 0:
                    # no observations: report NaN instead of dividing by zero
                    out[i, j] = NAN
                else:
                    out[i, j] = sumx[i, j] / count


# Concrete specializations of the fused-type implementation.
group_mean_float32 = _group_mean['float']
group_mean_float64 = _group_mean['double']
593+
594+
595+
@cython.wraparound(False)
@cython.boundscheck(False)
def _group_ohlc(floating[:, :] out,
                int64_t[:] counts,
                floating[:, :] values,
                const int64_t[:] labels,
                Py_ssize_t min_count=-1):
    """
    Compute per-group open/high/low/close of a single column of ``values``.

    Only aggregates on axis=0. Rows are processed in order, so for each
    group the four output columns hold, respectively: the first non-NaN
    value, the maximum, the minimum, and the last non-NaN value.

    Parameters
    ----------
    out : floating[:, :]
        Result buffer with exactly 4 columns, written in place. It is filled
        with NaN first; groups without any non-NaN value stay NaN. It is left
        untouched when ``labels`` is empty.
    counts : int64_t[:]
        Incremented in place once per row whose label is non-negative.
        A row is counted even if its value is NaN.
    values : floating[:, :]
        Input data with a single column; row ``i`` belongs to group
        ``labels[i]``.
    labels : const int64_t[:]
        Group index of each row of ``values``. Rows with a negative label
        are skipped entirely.
    min_count : Py_ssize_t, default -1
        Not supported here; must be left at -1.

    Raises
    ------
    AssertionError
        If ``min_count`` is not -1.
    ValueError
        If ``out`` does not have 4 columns.
    NotImplementedError
        If ``values`` has more than one column.
    """
    cdef:
        Py_ssize_t i, N, K, lab
        floating val

    assert min_count == -1, "'min_count' only used in add and prod"

    if len(labels) == 0:
        return

    N, K = (<object>values).shape

    if out.shape[1] != 4:
        raise ValueError('Output array must have 4 columns')

    if K > 1:
        raise NotImplementedError("Argument 'values' must have only "
                                  "one dimension")
    out[:] = np.nan

    with nogil:
        for i in range(N):
            lab = labels[i]
            # Skip every negative label, not just -1: bounds checking and
            # wraparound are disabled, so a negative index would write
            # outside the buffers.
            if lab < 0:
                continue

            counts[lab] += 1
            val = values[i, 0]
            # nan (NaN is the only value that differs from itself)
            if val != val:
                continue

            if out[lab, 0] != out[lab, 0]:
                # open is still NaN: this is the group's first observation
                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
            else:
                out[lab, 1] = max(out[lab, 1], val)
                out[lab, 2] = min(out[lab, 2], val)
                out[lab, 3] = val


# Concrete specializations of the fused-type implementation.
group_ohlc_float32 = _group_ohlc['float']
group_ohlc_float64 = _group_ohlc['double']
646+
436647
# generated from template
437648
include "groupby_helper.pxi"

0 commit comments

Comments
 (0)