Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Restore string to date/time type coercion #565

Merged
merged 3 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,7 @@ def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:


def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
"""Get the supertype of the two types."""
if dtype1 == dtype2:
return dtype1
# Implicit conversion currently only supported for numeric types
Expand All @@ -664,12 +665,26 @@ def lcd_type(dtype1: Dtype, dtype2: Dtype) -> Dtype:
return hierarchy[lcd_index]


def lcd_etype(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType:
if etype1 is None:
def coerce_to_common(etype1: ExpressionType, etype2: ExpressionType) -> ExpressionType:
"""Coerce types to a common type or throw a TypeError"""
if etype1 is not None and etype2 is not None:
common_supertype = lcd_type(etype1, etype2)
if common_supertype is not None:
return common_supertype
if can_coerce(etype1, etype2):
return etype2
if etype2 is None:
if can_coerce(etype2, etype1):
return etype1
return lcd_type_or_throw(etype1, etype2)
raise TypeError(f"Cannot coerce {etype1} and {etype2} to a common type.")


def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool:
if source_type is None:
return True # None can be coerced to any supported type
else:
return (source_type == STRING_DTYPE) and (
target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE)
)


def lcd_type_or_throw(dtype1: Dtype, dtype2: Dtype) -> Dtype:
Expand Down
24 changes: 9 additions & 15 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,16 +548,10 @@ def output_type(self, *input_types):


# Binary Ops
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COMMON_SUPERTYPE)
cliplower_op = create_binary_op(
name="clip_lower", type_signature=op_typing.COMMON_SUPERTYPE
)
clipupper_op = create_binary_op(
name="clip_upper", type_signature=op_typing.COMMON_SUPERTYPE
)
coalesce_op = create_binary_op(
name="coalesce", type_signature=op_typing.COMMON_SUPERTYPE
)
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
cliplower_op = create_binary_op(name="clip_lower", type_signature=op_typing.COERCE)
clipupper_op = create_binary_op(name="clip_upper", type_signature=op_typing.COERCE)
coalesce_op = create_binary_op(name="coalesce", type_signature=op_typing.COERCE)


## Math Ops
Expand All @@ -575,7 +569,7 @@ def output_type(self, *input_types):
right_type is None or dtypes.is_numeric(right_type)
):
# Numeric addition
return dtypes.lcd_etype(left_type, right_type)
return dtypes.coerce_to_common(left_type, right_type)
# TODO: Add temporal addition once delta types supported
raise TypeError(f"Cannot add dtypes {left_type} and {right_type}")

Expand All @@ -592,7 +586,7 @@ def output_type(self, *input_types):
right_type is None or dtypes.is_numeric(right_type)
):
# Numeric subtraction
return dtypes.lcd_etype(left_type, right_type)
return dtypes.coerce_to_common(left_type, right_type)
# TODO: Add temporal addition once delta types supported
raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}")

Expand Down Expand Up @@ -652,7 +646,7 @@ class WhereOp(TernaryOp):
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
if input_types[1] != dtypes.BOOL_DTYPE:
raise TypeError("where condition must be a boolean")
return dtypes.lcd_etype(input_types[0], input_types[2])
return dtypes.coerce_to_common(input_types[0], input_types[2])


where_op = WhereOp()
Expand All @@ -663,8 +657,8 @@ class ClipOp(TernaryOp):
name: typing.ClassVar[str] = "clip"

def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
return dtypes.lcd_etype(
input_types[0], dtypes.lcd_etype(input_types[1], input_types[2])
return dtypes.coerce_to_common(
input_types[0], dtypes.coerce_to_common(input_types[1], input_types[2])
)


Expand Down
22 changes: 15 additions & 7 deletions bigframes/operations/type.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def output_type(
raise TypeError(f"Type {left_type} is not numeric")
if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type):
raise TypeError(f"Type {right_type} is not numeric")
return bigframes.dtypes.lcd_etype(left_type, right_type)
return bigframes.dtypes.coerce_to_common(left_type, right_type)


@dataclasses.dataclass
Expand All @@ -132,21 +132,29 @@ def output_type(
raise TypeError(f"Type {left_type} is not numeric")
if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type):
raise TypeError(f"Type {right_type} is not numeric")
lcd_type = bigframes.dtypes.lcd_etype(left_type, right_type)
lcd_type = bigframes.dtypes.coerce_to_common(left_type, right_type)
if lcd_type == bigframes.dtypes.INT_DTYPE:
# Real numeric ops produce floats on int input
return bigframes.dtypes.FLOAT_DTYPE
return lcd_type


@dataclasses.dataclass
class Supertype(BinaryTypeSignature):
"""Type signature for functions that return a the supertype of its inputs. Currently BigFrames just supports upcasting numerics."""
class CoerceCommon(BinaryTypeSignature):
"""Attempt to coerce inputs to a compatible type."""

def output_type(
self, left_type: ExpressionType, right_type: ExpressionType
) -> ExpressionType:
return bigframes.dtypes.lcd_etype(left_type, right_type)
try:
return bigframes.dtypes.coerce_to_common(left_type, right_type)
except TypeError:
pass
if bigframes.dtypes.can_coerce(left_type, right_type):
return right_type
if bigframes.dtypes.can_coerce(right_type, left_type):
return left_type
raise TypeError(f"Cannot coerce {left_type} and {right_type} to a common type.")


@dataclasses.dataclass
Expand All @@ -156,7 +164,7 @@ class Comparison(BinaryTypeSignature):
def output_type(
self, left_type: ExpressionType, right_type: ExpressionType
) -> ExpressionType:
common_type = bigframes.dtypes.lcd_etype(left_type, right_type)
common_type = CoerceCommon().output_type(left_type, right_type)
if not bigframes.dtypes.is_comparable(common_type):
raise TypeError(f"Types {left_type} and {right_type} are not comparable")
return bigframes.dtypes.BOOL_DTYPE
Expand Down Expand Up @@ -188,7 +196,7 @@ def output_type(
BINARY_NUMERIC = BinaryNumeric()
BINARY_REAL_NUMERIC = BinaryRealNumeric()
COMPARISON = Comparison()
COMMON_SUPERTYPE = Supertype()
COERCE = CoerceCommon()
LOGICAL = Logical()
STRING_TRANSFORM = TypePreserving(
bigframes.dtypes.is_string_like, description="numeric"
Expand Down
64 changes: 64 additions & 0 deletions tests/system/small/operations/test_datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime

import pandas as pd
import pytest

Expand Down Expand Up @@ -303,3 +305,65 @@ def test_dt_floor(scalars_dfs, col_name, freq):
pd_result.astype(scalars_df[col_name].dtype), # floor preserves type
bf_result,
)


def test_dt_compare_coerce_str_datetime(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_series: bigframes.series.Series = scalars_df["datetime_col"]
bf_result = (bf_series >= "2024-01-01").to_pandas()

pd_result = scalars_pandas_df["datetime_col"] >= pd.to_datetime("2024-01-01")

# pandas produces pyarrow bool dtype
assert_series_equal(pd_result, bf_result, check_dtype=False)


def test_dt_clip_datetime_literals(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_series: bigframes.series.Series = scalars_df["date_col"]
bf_result = bf_series.clip(
datetime.date(2020, 1, 1), datetime.date(2024, 1, 1)
).to_pandas()

pd_result = scalars_pandas_df["date_col"].clip(
datetime.date(2020, 1, 1), datetime.date(2024, 1, 1)
)

assert_series_equal(
pd_result,
bf_result,
)


def test_dt_clip_coerce_str_date(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_series: bigframes.series.Series = scalars_df["date_col"]
bf_result = bf_series.clip("2020-01-01", "2024-01-01").to_pandas()

# Pandas can't coerce with pyarrow types so convert first
pd_result = scalars_pandas_df["date_col"].clip(
datetime.date(2020, 1, 1), datetime.date(2024, 1, 1)
)

assert_series_equal(
pd_result,
bf_result,
)


def test_dt_clip_coerce_str_timestamp(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_series: bigframes.series.Series = scalars_df["timestamp_col"]
bf_result = bf_series.clip(
"2020-01-01T20:03:50Z", "2024-01-01T20:03:50Z"
).to_pandas()

pd_result = scalars_pandas_df["timestamp_col"].clip(
pd.to_datetime("2020-01-01T20:03:50Z", utc=True),
pd.to_datetime("2024-01-01T20:03:50Z", utc=True),
)

assert_series_equal(
pd_result,
bf_result,
)