CLN: Rename "add" to "sum" in groupby (pandas-dev#47892)

rhshadrach · web-flow · commit 0b6d1207d431 · 2022-07-29T13:56:42.000-07:00
* CLN: Rename "add" to "sum"

* revert
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -50,7 +50,7 @@ def group_any_all(
     val_test: Literal["any", "all"],
     skipna: bool,
 ) -> None: ...
-def group_add(
+def group_sum(
     out: np.ndarray,  # complexfloating_t[:, ::1]
     counts: np.ndarray,  # int64_t[::1]
     values: np.ndarray,  # ndarray[complexfloating_t, ndim=2]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -124,7 +124,7 @@ def group_median_float64(
         ndarray[intp_t] indexer
         float64_t* ptr
 
-    assert min_count == -1, "'min_count' only used in add and prod"
+    assert min_count == -1, "'min_count' only used in sum and prod"
 
     ngroups = len(counts)
     N, K = (<object>values).shape
@@ -502,7 +502,7 @@ def group_any_all(
 
 
 # ----------------------------------------------------------------------
-# group_add, group_prod, group_var, group_mean, group_ohlc
+# group_sum, group_prod, group_var, group_mean, group_ohlc
 # ----------------------------------------------------------------------
 
 ctypedef fused mean_t:
@@ -511,17 +511,17 @@ ctypedef fused mean_t:
     complex64_t
     complex128_t
 
-ctypedef fused add_t:
+ctypedef fused sum_t:
     mean_t
     object
 
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def group_add(
-    add_t[:, ::1] out,
+def group_sum(
+    sum_t[:, ::1] out,
     int64_t[::1] counts,
-    ndarray[add_t, ndim=2] values,
+    ndarray[sum_t, ndim=2] values,
     const intp_t[::1] labels,
     Py_ssize_t min_count=0,
     bint is_datetimelike=False,
@@ -531,8 +531,8 @@ def group_add(
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        add_t val, t, y
-        add_t[:, ::1] sumx, compensation
+        sum_t val, t, y
+        sum_t[:, ::1] sumx, compensation
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
 
@@ -546,7 +546,7 @@ def group_add(
 
     N, K = (<object>values).shape
 
-    if add_t is object:
+    if sum_t is object:
         # NB: this does not use 'compensation' like the non-object track does.
         for i in range(N):
             lab = labels[i]
@@ -588,10 +588,10 @@ def group_add(
 
                     # not nan
                     # With dt64/td64 values, values have been cast to float64
-                    #  instead if int64 for group_add, but the logic
+                    #  instead of int64 for group_sum, but the logic
                     #  is otherwise the same as in _treat_as_na
                     if val == val and not (
-                        add_t is float64_t
+                        sum_t is float64_t
                         and is_datetimelike
                         and val == <float64_t>NPY_NAT
                     ):
@@ -677,7 +677,7 @@ def group_var(
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
 
-    assert min_count == -1, "'min_count' only used in add and prod"
+    assert min_count == -1, "'min_count' only used in sum and prod"
 
     if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")
@@ -745,7 +745,7 @@ def group_mean(
         Array containing unique label for each group, with its
         ordering matching up to the corresponding record in `values`.
     min_count : Py_ssize_t
-        Only used in add and prod. Always -1.
+        Only used in sum and prod. Always -1.
     is_datetimelike : bool
         True if `values` contains datetime-like entries.
     mask : ndarray[bool, ndim=2], optional
@@ -766,7 +766,7 @@ def group_mean(
         int64_t[:, ::1] nobs
         Py_ssize_t len_values = len(values), len_labels = len(labels)
 
-    assert min_count == -1, "'min_count' only used in add and prod"
+    assert min_count == -1, "'min_count' only used in sum and prod"
 
     if len_values != len_labels:
         raise ValueError("len(index) != len(labels)")
@@ -821,7 +821,7 @@ def group_ohlc(
         Py_ssize_t i, j, N, K, lab
         floating val
 
-    assert min_count == -1, "'min_count' only used in add and prod"
+    assert min_count == -1, "'min_count' only used in sum and prod"
 
     if len(labels) == 0:
         return
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1338,7 +1338,6 @@ def _resolve_numeric_only(
 
         if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
             # GH#47500
-            how = "sum" if how == "add" else how
             warnings.warn(
                 f"{type(self).__name__}.{how} called with "
                 f"numeric_only={numeric_only} and dtype {self.obj.dtype}. This will "
@@ -1738,9 +1737,8 @@ def _cython_agg_general(
                 kwd_name = "numeric_only"
                 if how in ["any", "all"]:
                     kwd_name = "bool_only"
-                kernel = "sum" if how == "add" else how
                 raise NotImplementedError(
-                    f"{type(self).__name__}.{kernel} does not implement {kwd_name}."
+                    f"{type(self).__name__}.{how} does not implement {kwd_name}."
                 )
             elif not is_ser:
                 data = data.get_numeric_data(copy=False)
@@ -2417,7 +2415,7 @@ def sum(
                 result = self._agg_general(
                     numeric_only=numeric_only,
                     min_count=min_count,
-                    alias="add",
+                    alias="sum",
                     npfunc=np.sum,
                 )
 
@@ -4341,8 +4339,6 @@ def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiInde
 
 
 def warn_dropping_nuisance_columns_deprecated(cls, how: str, numeric_only) -> None:
-    if how == "add":
-        how = "sum"
     if numeric_only is not lib.no_default and not numeric_only:
         # numeric_only was specified and falsey but still dropped nuisance columns
         warnings.warn(
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -121,7 +121,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
 
     _CYTHON_FUNCTIONS = {
         "aggregate": {
-            "add": "group_add",
+            "sum": "group_sum",
             "prod": "group_prod",
             "min": "group_min",
             "max": "group_max",
@@ -213,7 +213,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
             values = ensure_float64(values)
 
         elif values.dtype.kind in ["i", "u"]:
-            if how in ["add", "var", "prod", "mean", "ohlc"] or (
+            if how in ["sum", "var", "prod", "mean", "ohlc"] or (
                 self.kind == "transform" and self.has_dropped_na
             ):
                 # result may still include NaN, so we have to cast
@@ -241,7 +241,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
         if isinstance(dtype, CategoricalDtype):
             # NotImplementedError for methods that can fall back to a
             #  non-cython implementation.
-            if how in ["add", "prod", "cumsum", "cumprod"]:
+            if how in ["sum", "prod", "cumsum", "cumprod"]:
                 raise TypeError(f"{dtype} type does not support {how} operations")
             elif how not in ["rank"]:
                 # only "rank" is implemented in cython
@@ -258,7 +258,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
             # TODO: same for period_dtype?  no for these methods with Period
             # we raise NotImplemented if this is an invalid operation
             #  entirely, e.g. adding datetimes
-            if how in ["add", "prod", "cumsum", "cumprod"]:
+            if how in ["sum", "prod", "cumsum", "cumprod"]:
                 raise TypeError(f"datetime64 type does not support {how} operations")
         elif is_timedelta64_dtype(dtype):
             if how in ["prod", "cumprod"]:
@@ -311,7 +311,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
         """
         how = self.how
 
-        if how in ["add", "cumsum", "sum", "prod"]:
+        if how in ["sum", "cumsum", "sum", "prod"]:
             if dtype == np.dtype(bool):
                 return np.dtype(np.int64)
         elif how in ["mean", "median", "var"]:
@@ -567,7 +567,7 @@ def _call_cython_op(
                     result_mask=result_mask,
                     is_datetimelike=is_datetimelike,
                 )
-            elif self.how in ["add"]:
+            elif self.how in ["sum"]:
                 # We support datetimelike
                 func(
                     out=result,
@@ -625,7 +625,7 @@ def _call_cython_op(
             # e.g. if we are int64 and need to restore to datetime64/timedelta64
             # "rank" is the only member of cast_blocklist we get here
             # Casting only needed for float16, bool, datetimelike,
-            #  and self.how in ["add", "prod", "ohlc", "cumprod"]
+            #  and self.how in ["sum", "prod", "ohlc", "cumprod"]
             res_dtype = self._get_result_dtype(orig_values.dtype)
             op_result = maybe_downcast_to_dtype(result, res_dtype)
         else:
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
@@ -166,7 +166,7 @@ def test_cython_fail_agg():
         ("mean", np.mean),
         ("median", np.median),
         ("var", np.var),
-        ("add", np.sum),
+        ("sum", np.sum),
         ("prod", np.prod),
         ("min", np.min),
         ("max", np.max),
@@ -214,7 +214,7 @@ def test_cython_agg_empty_buckets_nanops(observed):
     grps = range(0, 25, 5)
     # add / sum
     result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
-        "add", alt=None, numeric_only=True
+        "sum", alt=None, numeric_only=True
     )
     intervals = pd.interval_range(0, 20, freq=5, inclusive="right")
     expected = DataFrame(
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
@@ -61,15 +61,15 @@ def test_custom_grouper(index):
 
     # check all cython functions work
     g.ohlc()  # doesn't use _cython_agg_general
-    funcs = ["add", "mean", "prod", "min", "max", "var"]
+    funcs = ["sum", "mean", "prod", "min", "max", "var"]
     for f in funcs:
         g._cython_agg_general(f, alt=None, numeric_only=True)
 
     b = Grouper(freq=Minute(5), closed="right", label="right")
     g = s.groupby(b)
     # check all cython functions work
     g.ohlc()  # doesn't use _cython_agg_general
-    funcs = ["add", "mean", "prod", "min", "max", "var"]
+    funcs = ["sum", "mean", "prod", "min", "max", "var"]
     for f in funcs:
         g._cython_agg_general(f, alt=None, numeric_only=True)
 
@@ -414,7 +414,7 @@ def test_resample_upsampling_picked_but_not_correct():
     tm.assert_series_equal(result2, expected)
 
 
-@pytest.mark.parametrize("f", ["add", "mean", "prod", "min", "max", "var"])
+@pytest.mark.parametrize("f", ["sum", "mean", "prod", "min", "max", "var"])
 def test_resample_frame_basic_cy_funcs(f):
     df = tm.makeTimeDataFrame()