feat: add replace method to DataFrame (#261)

TrevorBergeron · web-flow · commit 5092215767d7 · 2023-12-19T11:47:25.000-08:00
* feat: add replace method to DataFrame

* remove unwanted change to describe method

* better docs

* is_patype docstring

* docstring fix

* mypy fix
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
@@ -1562,6 +1562,21 @@ def interpolate(self, method: str = "linear") -> DataFrame:
     def fillna(self, value=None) -> DataFrame:
         return self._apply_binop(value, ops.fillna_op, how="left")
 
+    def replace(
+        self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
+    ):
+        if utils.is_dict_like(value):
+            return self.apply(
+                lambda x: x.replace(
+                    to_replace=to_replace, value=value[x.name], regex=regex
+                )
+                if (x.name in value)
+                else x
+            )
+        return self.apply(
+            lambda x: x.replace(to_replace=to_replace, value=value, regex=regex)
+        )
+
     def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame:
         window = bigframes.core.WindowSpec(preceding=limit, following=0)
         return self._apply_window_op(agg_ops.LastNonNullOp(), window)
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
@@ -14,6 +14,7 @@
 
 """Mappings for Pandas dtypes supported by BigQuery DataFrames package"""
 
+import datetime
 import textwrap
 import typing
 from typing import Any, Dict, Iterable, Literal, Tuple, Union
@@ -437,3 +438,50 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict:
                 gcb3p_pandas_helpers.bq_to_arrow_data_type(field)
             )
     return dtypes
+
+
+def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
+    """Captures whether a scalar can be losslessly represented by a dtype."""
+    if scalar is None:
+        return True
+    if pd.api.types.is_bool_dtype(dtype):
+        return pd.api.types.is_bool(scalar)
+    if pd.api.types.is_float_dtype(dtype):
+        return pd.api.types.is_float(scalar)
+    if pd.api.types.is_integer_dtype(dtype):
+        return pd.api.types.is_integer(scalar)
+    if isinstance(dtype, pd.StringDtype):
+        return isinstance(scalar, str)
+    if isinstance(dtype, pd.ArrowDtype):
+        pa_type = dtype.pyarrow_dtype
+        return is_patype(scalar, pa_type)
+    return False
+
+
+def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
+    """Determine whether a scalar's type matches a given pyarrow type."""
+    if pa_type == pa.time64("us"):
+        return isinstance(scalar, datetime.time)
+    if pa_type == pa.timestamp("us"):
+        if isinstance(scalar, datetime.datetime):
+            return not scalar.tzinfo
+        if isinstance(scalar, pd.Timestamp):
+            return not scalar.tzinfo
+    if pa_type == pa.timestamp("us", tz="UTC"):
+        if isinstance(scalar, datetime.datetime):
+            return scalar.tzinfo == datetime.timezone.utc
+        if isinstance(scalar, pd.Timestamp):
+            return scalar.tzinfo == datetime.timezone.utc
+    if pa_type == pa.date32():
+        return isinstance(scalar, datetime.date)
+    return False
+
+
+def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
+    """Whether scalar can be compared to items of dtype (though maybe requiring coercion)."""
+    if is_dtype(scalar, dtype):
+        return True
+    elif pd.api.types.is_numeric_dtype(dtype):
+        return pd.api.types.is_number(scalar)
+    else:
+        return False
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
@@ -523,6 +523,20 @@ def _as_ibis(self, x: ibis_types.Value):
         return bigframes.dtypes.cast_ibis_value(x, self.to_type)
 
 
+class MapOp(UnaryOp):
+    def __init__(
+        self,
+        mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...],
+    ):
+        self._mappings = mappings
+
+    def _as_ibis(self, x: ibis_types.Value):
+        case = ibis.case()
+        for mapping in self._mappings:
+            case = case.when(x == mapping[0], mapping[1])
+        return case.else_(x).end()
+
+
 class FindOp(UnaryOp):
     def __init__(self, sub, start, end):
         self._sub = sub
diff --git a/bigframes/series.py b/bigframes/series.py
@@ -442,42 +442,67 @@ def replace(
         self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
     ):
         if regex:
-            if not (isinstance(to_replace, str) and isinstance(value, str)):
-                raise NotImplementedError(
-                    f"replace regex mode only supports strings for 'to_replace' and 'value'. {constants.FEEDBACK_LINK}"
-                )
-            block, result_col = self._block.apply_unary_op(
-                self._value_column,
-                ops.ReplaceRegexOp(to_replace, value),
-                result_label=self.name,
-            )
-            return Series(block.select_column(result_col))
+            # No-op unless to_replace and series dtype are both string type
+            if not isinstance(to_replace, str) or not isinstance(
+                self.dtype, pandas.StringDtype
+            ):
+                return self
+            return self._regex_replace(to_replace, value)
         elif utils.is_dict_like(to_replace):
-            raise NotImplementedError(
-                f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}"
-            )
+            return self._mapping_replace(to_replace)  # type: ignore
         elif utils.is_list_like(to_replace):
-            block, cond = self._block.apply_unary_op(
-                self._value_column, ops.IsInOp(to_replace)
-            )
-            block, result_col = block.apply_binary_op(
-                cond,
-                self._value_column,
-                ops.partial_arg1(ops.where_op, value),
-                result_label=self.name,
-            )
-            return Series(block.select_column(result_col))
+            replace_list = to_replace
         else:  # Scalar
-            block, cond = self._block.apply_unary_op(
-                self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace)
+            replace_list = [to_replace]
+        replace_list = [
+            i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype)
+        ]
+        return self._simple_replace(replace_list, value) if replace_list else self
+
+    def _regex_replace(self, to_replace: str, value: str):
+        if not bigframes.dtypes.is_dtype(value, self.dtype):
+            raise NotImplementedError(
+                f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
             )
-            block, result_col = block.apply_binary_op(
-                cond,
-                self._value_column,
-                ops.partial_arg1(ops.where_op, value),
-                result_label=self.name,
+        block, result_col = self._block.apply_unary_op(
+            self._value_column,
+            ops.ReplaceRegexOp(to_replace, value),
+            result_label=self.name,
+        )
+        return Series(block.select_column(result_col))
+
+    def _simple_replace(self, to_replace_list: typing.Sequence, value):
+        if not bigframes.dtypes.is_dtype(value, self.dtype):
+            raise NotImplementedError(
+                f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
             )
-            return Series(block.select_column(result_col))
+
+        block, cond = self._block.apply_unary_op(
+            self._value_column, ops.IsInOp(to_replace_list)
+        )
+        block, result_col = block.apply_binary_op(
+            cond,
+            self._value_column,
+            ops.partial_arg1(ops.where_op, value),
+            result_label=self.name,
+        )
+        return Series(block.select_column(result_col))
+
+    def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]):
+        tuples = []
+        for key, value in mapping.items():
+            if not bigframes.dtypes.is_comparable(key, self.dtype):
+                continue
+            if not bigframes.dtypes.is_dtype(value, self.dtype):
+                raise NotImplementedError(
+                    f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
+                )
+            tuples.append((key, value))
+
+        block, result = self._block.apply_unary_op(
+            self._value_column, ops.MapOp(tuple(tuples))
+        )
+        return Series(block.select_column(result))
 
     def interpolate(self, method: str = "linear") -> Series:
         if method == "pad":
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -839,6 +839,50 @@ def test_df_fillna(scalars_dfs):
     pandas.testing.assert_frame_equal(bf_result, pd_result)
 
 
+def test_df_replace_scalar_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas()
+    pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!")
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_regex_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas()
+    pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True)
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_list_scalar(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas()
+    pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!")
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
+def test_df_replace_value_dict(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas()
+    pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200})
+
+    pd.testing.assert_frame_equal(
+        pd_result,
+        bf_result,
+    )
+
+
 def test_df_ffill(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas()
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4356,6 +4356,94 @@ def fillna(self, value):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def replace(
+        self,
+        to_replace,
+        value=None,
+        *,
+        regex=False,
+    ):
+        """
+        Replace values given in `to_replace` with `value`.
+
+        Values of the Series/DataFrame are replaced with other values dynamically.
+        This differs from updating with ``.loc`` or ``.iloc``, which require
+        you to specify a location to update with some value.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({
+            ...     'int_col': [1, 1, 2, 3],
+            ...     'string_col': ["a", "b", "c", "b"],
+            ...     })
+
+        Using scalar `to_replace` and `value`:
+
+            >>> df.replace("b", "e")
+               int_col string_col
+            0        1          a
+            1        1          e
+            2        2          c
+            3        3          e
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+        Using dictionary:
+
+            >>> df.replace({"a": "e", 2: 5})
+               int_col string_col
+            0        1          e
+            1        1          b
+            2        5          c
+            3        3          b
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+        Using regex:
+
+            >>> df.replace("[ab]", "e", regex=True)
+               int_col string_col
+            0        1          e
+            1        1          e
+            2        2          c
+            3        3          e
+            <BLANKLINE>
+            [4 rows x 2 columns]
+
+
+        Args:
+            to_replace (str, regex, list, dict, int, float or None):
+                How to find the values that will be replaced.
+                numeric: numeric values equal to `to_replace` will be replaced with `value`
+                str: string exactly matching `to_replace` will be replaced with `value`
+                regex: regexes matching `to_replace` will be replaced with `value`
+                list of str, regex, or numeric:
+                First, if `to_replace` and `value` are both lists, they **must** be the same length.
+                Second, if ``regex=True`` then all of the strings in **both**
+                lists will be interpreted as regexes; otherwise they will match
+                directly. This doesn't matter much for `value` since there
+                are only a few possible substitution regexes you can use.
+                str, regex and numeric rules apply as above.
+
+            value (scalar, default None):
+                Value to replace any values matching `to_replace` with.
+                For a DataFrame a dict of values can be used to specify which
+                value to use for each column (columns not in the dict will not be
+                replaced). Regular expressions, strings and lists or dicts of such
+                objects are also allowed.
+            regex (bool, default False):
+                Whether to interpret `to_replace` and/or `value` as regular
+                expressions. If this is ``True`` then `to_replace` *must* be a
+                string.
+
+        Returns:
+            Series/DataFrame: Object after replacement.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     @property
     def iloc(self):
         """Purely integer-location based indexing for selection by position."""