Skip to content

Commit 5092215

Browse files
feat: add replace method to DataFrame (#261)
* feat: add replace method to DataFrame * remove unwanted change to describe method * better docs * is_patype docstring * docstring fix * mypy fix
1 parent dab2f2c commit 5092215

File tree

6 files changed

+265
-31
lines changed

6 files changed

+265
-31
lines changed

bigframes/dataframe.py

+15
Original file line numberDiff line numberDiff line change
@@ -1562,6 +1562,21 @@ def interpolate(self, method: str = "linear") -> DataFrame:
15621562
def fillna(self, value=None) -> DataFrame:
15631563
return self._apply_binop(value, ops.fillna_op, how="left")
15641564

1565+
def replace(
    self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
):
    """Replace occurrences of ``to_replace`` with ``value``, one column at a time.

    Args:
        to_replace: What to search for; passed through unchanged to each
            column's ``replace``.
        value: The replacement. When dict-like it is indexed by column name,
            and columns whose name is not a key are returned untouched.
        regex: Passed through unchanged to each column's ``replace``.

    Returns:
        Whatever ``self.apply`` produces from the per-column replacements.
    """
    value_is_per_column = utils.is_dict_like(value)

    def _replace_in_column(column):
        if not value_is_per_column:
            return column.replace(to_replace=to_replace, value=value, regex=regex)
        if column.name in value:
            return column.replace(
                to_replace=to_replace, value=value[column.name], regex=regex
            )
        # No replacement value was supplied for this column.
        return column

    return self.apply(_replace_in_column)
1579+
15651580
def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame:
15661581
window = bigframes.core.WindowSpec(preceding=limit, following=0)
15671582
return self._apply_window_op(agg_ops.LastNonNullOp(), window)

bigframes/dtypes.py

+48
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
"""Mappings for Pandas dtypes supported by BigQuery DataFrames package"""
1616

17+
import datetime
1718
import textwrap
1819
import typing
1920
from typing import Any, Dict, Iterable, Literal, Tuple, Union
@@ -437,3 +438,50 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict:
437438
gcb3p_pandas_helpers.bq_to_arrow_data_type(field)
438439
)
439440
return dtypes
441+
442+
443+
def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
    """Report whether ``scalar`` can be stored as ``dtype`` without loss.

    Args:
        scalar: The Python/pandas scalar to classify.
        dtype: The column dtype the scalar would be stored in.

    Returns:
        True when the scalar's type corresponds to ``dtype`` (``None`` is
        accepted for every dtype), otherwise False.
    """
    if scalar is None:
        return True

    type_api = pd.api.types
    # Checked in this order: bool, float, integer.
    scalar_checks = (
        (type_api.is_bool_dtype, type_api.is_bool),
        (type_api.is_float_dtype, type_api.is_float),
        (type_api.is_integer_dtype, type_api.is_integer),
    )
    for dtype_matches, scalar_matches in scalar_checks:
        if dtype_matches(dtype):
            return scalar_matches(scalar)

    if isinstance(dtype, pd.StringDtype):
        return isinstance(scalar, str)
    if isinstance(dtype, pd.ArrowDtype):
        # Arrow-backed dtypes are classified by their underlying pyarrow type.
        return is_patype(scalar, dtype.pyarrow_dtype)
    return False
459+
460+
461+
def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
    """Report whether ``scalar``'s Python type corresponds to ``pa_type``.

    Only ``time64("us")``, ``date32()``, ``timestamp("us")`` and
    ``timestamp("us", tz="UTC")`` are recognised; every other pyarrow type
    yields False.

    Args:
        scalar: The value to classify.
        pa_type: The pyarrow type to compare against.

    Returns:
        True when the scalar matches the pyarrow type, otherwise False.
    """
    if pa_type == pa.time64("us"):
        return isinstance(scalar, datetime.time)
    if pa_type == pa.date32():
        # NOTE(review): datetime.datetime is a subclass of datetime.date, so
        # datetimes are accepted here as well - confirm that is intended.
        return isinstance(scalar, datetime.date)

    # The remaining recognised types are timestamps, which only datetime-like
    # scalars can match.
    if not isinstance(scalar, (datetime.datetime, pd.Timestamp)):
        return False
    if pa_type == pa.timestamp("us"):
        # Naive timestamp column: the scalar must carry no timezone.
        return not scalar.tzinfo
    if pa_type == pa.timestamp("us", tz="UTC"):
        return scalar.tzinfo == datetime.timezone.utc
    return False
478+
479+
480+
def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool:
    """Whether scalar can be compared to items of dtype (possibly after coercion).

    Args:
        scalar: The value that would be compared against column items.
        dtype: The dtype of the column.

    Returns:
        True when ``is_dtype`` accepts the pair, or when ``dtype`` is numeric
        and ``scalar`` is a number; otherwise False.
    """
    if is_dtype(scalar, dtype):
        return True
    # Numbers of a different numeric kind than the column are still comparable.
    return pd.api.types.is_numeric_dtype(dtype) and pd.api.types.is_number(scalar)

bigframes/operations/__init__.py

+14
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,20 @@ def _as_ibis(self, x: ibis_types.Value):
523523
return bigframes.dtypes.cast_ibis_value(x, self.to_type)
524524

525525

526+
class MapOp(UnaryOp):
    """Unary op that rewrites values according to ``(match, replacement)`` pairs.

    Values equal to a pair's first element are replaced by its second element;
    values that match no pair pass through unchanged.
    """

    def __init__(
        self,
        mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...],
    ):
        """Store the ``(match, replacement)`` pairs to apply."""
        self._mappings = mappings

    def _as_ibis(self, x: ibis_types.Value):
        """Build the ibis expression that applies the mappings to ``x``."""
        # A CASE expression needs at least one WHEN branch; with nothing to
        # map, the op is the identity.
        if not self._mappings:
            return x
        case = ibis.case()
        # One WHEN branch per pair, added in tuple order.
        for match, replacement in self._mappings:
            case = case.when(x == match, replacement)
        return case.else_(x).end()
538+
539+
526540
class FindOp(UnaryOp):
527541
def __init__(self, sub, start, end):
528542
self._sub = sub

bigframes/series.py

+56-31
Original file line numberDiff line numberDiff line change
@@ -442,42 +442,67 @@ def replace(
442442
self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False
443443
):
444444
if regex:
445-
if not (isinstance(to_replace, str) and isinstance(value, str)):
446-
raise NotImplementedError(
447-
f"replace regex mode only supports strings for 'to_replace' and 'value'. {constants.FEEDBACK_LINK}"
448-
)
449-
block, result_col = self._block.apply_unary_op(
450-
self._value_column,
451-
ops.ReplaceRegexOp(to_replace, value),
452-
result_label=self.name,
453-
)
454-
return Series(block.select_column(result_col))
445+
# No-op unless to_replace and series dtype are both string type
446+
if not isinstance(to_replace, str) or not isinstance(
447+
self.dtype, pandas.StringDtype
448+
):
449+
return self
450+
return self._regex_replace(to_replace, value)
455451
elif utils.is_dict_like(to_replace):
456-
raise NotImplementedError(
457-
f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}"
458-
)
452+
return self._mapping_replace(to_replace) # type: ignore
459453
elif utils.is_list_like(to_replace):
460-
block, cond = self._block.apply_unary_op(
461-
self._value_column, ops.IsInOp(to_replace)
462-
)
463-
block, result_col = block.apply_binary_op(
464-
cond,
465-
self._value_column,
466-
ops.partial_arg1(ops.where_op, value),
467-
result_label=self.name,
468-
)
469-
return Series(block.select_column(result_col))
454+
replace_list = to_replace
470455
else: # Scalar
471-
block, cond = self._block.apply_unary_op(
472-
self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace)
456+
replace_list = [to_replace]
457+
replace_list = [
458+
i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype)
459+
]
460+
return self._simple_replace(replace_list, value) if replace_list else self
461+
462+
def _regex_replace(self, to_replace: str, value: str):
463+
if not bigframes.dtypes.is_dtype(value, self.dtype):
464+
raise NotImplementedError(
465+
f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
473466
)
474-
block, result_col = block.apply_binary_op(
475-
cond,
476-
self._value_column,
477-
ops.partial_arg1(ops.where_op, value),
478-
result_label=self.name,
467+
block, result_col = self._block.apply_unary_op(
468+
self._value_column,
469+
ops.ReplaceRegexOp(to_replace, value),
470+
result_label=self.name,
471+
)
472+
return Series(block.select_column(result_col))
473+
474+
def _simple_replace(self, to_replace_list: typing.Sequence, value):
475+
if not bigframes.dtypes.is_dtype(value, self.dtype):
476+
raise NotImplementedError(
477+
f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
479478
)
480-
return Series(block.select_column(result_col))
479+
480+
block, cond = self._block.apply_unary_op(
481+
self._value_column, ops.IsInOp(to_replace_list)
482+
)
483+
block, result_col = block.apply_binary_op(
484+
cond,
485+
self._value_column,
486+
ops.partial_arg1(ops.where_op, value),
487+
result_label=self.name,
488+
)
489+
return Series(block.select_column(result_col))
490+
491+
def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]):
    """Replace values according to a ``{to_replace: value}`` mapping.

    Args:
        mapping: Maps each value to look for to its replacement.

    Returns:
        A Series with the replacements applied, keeping this series' name,
        or ``self`` when no key of ``mapping`` is comparable with this
        series' dtype.

    Raises:
        NotImplementedError: If a replacement value for a comparable key
            cannot be represented by this series' dtype.
    """
    tuples = []
    for key, value in mapping.items():
        # Keys that cannot be compared with the column can never match.
        if not bigframes.dtypes.is_comparable(key, self.dtype):
            continue
        if not bigframes.dtypes.is_dtype(value, self.dtype):
            raise NotImplementedError(
                f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
            )
        tuples.append((key, value))

    # Nothing can match: skip building an op with no mappings.
    if not tuples:
        return self

    block, result = self._block.apply_unary_op(
        self._value_column,
        ops.MapOp(tuple(tuples)),
        # Keep the series name on the result column.
        result_label=self.name,
    )
    return Series(block.select_column(result))
481506

482507
def interpolate(self, method: str = "linear") -> Series:
483508
if method == "pad":

tests/system/small/test_dataframe.py

+44
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,50 @@ def test_df_fillna(scalars_dfs):
839839
pandas.testing.assert_frame_equal(bf_result, pd_result)
840840

841841

842+
def test_df_replace_scalar_scalar(scalars_dfs):
    """DataFrame.replace with a scalar target and scalar value agrees with pandas."""
    bf_df, pd_df = scalars_dfs
    target, replacement = "Hello, World!", "Howdy, Planet!"

    bf_result = bf_df.replace(target, replacement).to_pandas()
    pd_result = pd_df.replace(target, replacement)

    pd.testing.assert_frame_equal(pd_result, bf_result)
851+
852+
853+
def test_df_replace_regex_scalar(scalars_dfs):
    """DataFrame.replace with a regex pattern and scalar value agrees with pandas."""
    bf_df, pd_df = scalars_dfs
    pattern, replacement = "^H.l", "Howdy, Planet!"

    bf_result = bf_df.replace(pattern, replacement, regex=True).to_pandas()
    pd_result = pd_df.replace(pattern, replacement, regex=True)

    pd.testing.assert_frame_equal(pd_result, bf_result)
862+
863+
864+
def test_df_replace_list_scalar(scalars_dfs):
    """DataFrame.replace with a list of targets and scalar value agrees with pandas."""
    bf_df, pd_df = scalars_dfs
    targets = ["Hello, World!", "T"]
    replacement = "Howdy, Planet!"

    bf_result = bf_df.replace(targets, replacement).to_pandas()
    pd_result = pd_df.replace(targets, replacement)

    pd.testing.assert_frame_equal(pd_result, bf_result)
873+
874+
875+
def test_df_replace_value_dict(scalars_dfs):
    """DataFrame.replace with per-column replacement values agrees with pandas."""
    bf_df, pd_df = scalars_dfs
    per_column_values = {"int64_col": 100, "int64_too": 200}

    bf_result = bf_df.replace(1, per_column_values).to_pandas()
    pd_result = pd_df.replace(1, per_column_values)

    pd.testing.assert_frame_equal(pd_result, bf_result)
884+
885+
842886
def test_df_ffill(scalars_dfs):
843887
scalars_df, scalars_pandas_df = scalars_dfs
844888
bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas()

third_party/bigframes_vendored/pandas/core/frame.py

+88
Original file line numberDiff line numberDiff line change
@@ -4356,6 +4356,94 @@ def fillna(self, value):
43564356
"""
43574357
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
43584358

4359+
def replace(
4360+
self,
4361+
to_replace,
4362+
value=None,
4363+
*,
4364+
regex=False,
4365+
):
4366+
"""
4367+
Replace values given in `to_replace` with `value`.
4368+
4369+
Values of the Series/DataFrame are replaced with other values dynamically.
4370+
This differs from updating with ``.loc`` or ``.iloc``, which require
4371+
you to specify a location to update with some value.
4372+
4373+
**Examples:**
4374+
4375+
>>> import bigframes.pandas as bpd
4376+
>>> bpd.options.display.progress_bar = None
4377+
4378+
>>> df = bpd.DataFrame({
4379+
... 'int_col': [1, 1, 2, 3],
4380+
... 'string_col': ["a", "b", "c", "b"],
4381+
... })
4382+
4383+
Using scalar `to_replace` and `value`:
4384+
4385+
>>> df.replace("b", "e")
4386+
int_col string_col
4387+
0 1 a
4388+
1 1 e
4389+
2 2 c
4390+
3 3 e
4391+
<BLANKLINE>
4392+
[4 rows x 2 columns]
4393+
4394+
Using dictionary:
4395+
4396+
>>> df.replace({"a": "e", 2: 5})
4397+
int_col string_col
4398+
0 1 e
4399+
1 1 b
4400+
2 5 c
4401+
3 3 b
4402+
<BLANKLINE>
4403+
[4 rows x 2 columns]
4404+
4405+
Using regex:
4406+
4407+
>>> df.replace("[ab]", "e", regex=True)
4408+
int_col string_col
4409+
0 1 e
4410+
1 1 e
4411+
2 2 c
4412+
3 3 e
4413+
<BLANKLINE>
4414+
[4 rows x 2 columns]
4415+
4416+
4417+
Args:
4418+
to_replace (str, regex, list, dict, int, float or None):
4419+
How to find the values that will be replaced.
4420+
numeric: numeric values equal to `to_replace` will be replaced with `value`
4421+
str: string exactly matching `to_replace` will be replaced with `value`
4422+
regex: regexes matching `to_replace` will be replaced with `value`
4423+
list of str, regex, or numeric:
4424+
First, if `to_replace` and `value` are both lists, they **must** be the same length.
4425+
Second, if ``regex=True`` then all of the strings in **both**
4426+
lists will be interpreted as regexes; otherwise they will match
4427+
directly. This doesn't matter much for `value` since there
4428+
are only a few possible substitution regexes you can use.
4429+
str, regex and numeric rules apply as above.
4430+
4431+
value (scalar, default None):
4432+
Value to replace any values matching `to_replace` with.
4433+
For a DataFrame a dict of values can be used to specify which
4434+
value to use for each column (columns not in the dict will not be
4435+
modified). Regular expressions, strings and lists or dicts of such
4436+
objects are also allowed.
4437+
regex (bool, default False):
4438+
Whether to interpret `to_replace` and/or `value` as regular
4439+
expressions. If this is ``True`` then `to_replace` *must* be a
4440+
string.
4441+
4442+
Returns:
4443+
Series/DataFrame: Object after replacement.
4444+
"""
4445+
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
4446+
43594447
@property
43604448
def iloc(self):
43614449
"""Purely integer-location based indexing for selection by position."""

0 commit comments

Comments
 (0)