Skip to content

Commit fde339b

Browse files
junyazhangGarrettWuTrevorBergeron
authored
feat: support datetime related casting in (Series|DataFrame|Index).astype (#442)
* feat: support datetime related casting in (Series|DataFrame|Index).astype * chore: add deferred exec code samples (#439) * chore: add deferred exec code samples * fix tests * fix tests * feat: add DataFrame.pipe() method (#421) * addressed comments --------- Co-authored-by: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Co-authored-by: TrevorBergeron <tbergeron@google.com>
1 parent 95f5a6e commit fde339b

File tree

3 files changed

+183
-22
lines changed

3 files changed

+183
-22
lines changed

bigframes/core/compile/scalar_op_compiler.py

+46-13
Original file line numberDiff line numberDiff line change
@@ -634,11 +634,56 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
634634
return struct_value[name].name(name)
635635

636636

637+
def numeric_to_datatime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
    """Turn a numeric Ibis expression into a UTC timestamp expression.

    Args:
        x: Integer or floating-point Ibis expression to convert.
        unit: Unit of ``x``; must be a key of ``UNIT_TO_US_CONVERSION_FACTORS``.

    Returns:
        ``x`` multiplied by the conversion factor for ``unit``, cast to int64,
        passed through ``to_timestamp(unit="us")`` and cast to a timestamp
        with the UTC timezone.

    Raises:
        TypeError: If ``x`` is neither an integer nor a floating expression.
        ValueError: If ``unit`` has no entry in ``UNIT_TO_US_CONVERSION_FACTORS``.
    """
    numeric_kinds = (ibis_types.IntegerValue, ibis_types.FloatingValue)
    if not isinstance(x, numeric_kinds):
        raise TypeError("Non-numerical types are not supposed to reach this function.")

    if unit not in UNIT_TO_US_CONVERSION_FACTORS:
        raise ValueError(f"Cannot convert input with unit '{unit}'.")

    # Rescale to microseconds, then force an integer count of microseconds.
    microseconds = (x * UNIT_TO_US_CONVERSION_FACTORS[unit]).cast(ibis_dtypes.int64)

    # Note: casting straight to a timezone-less timestamp does not work, so
    # the value is cast to a UTC timestamp first. This seems to sidestep a
    # potential bug in Ibis's cast function and lets callers cast the result
    # to a timezone-less timestamp afterwards. Further investigation is
    # needed to confirm this behavior.
    utc_timestamp = ibis_dtypes.Timestamp(timezone="UTC")
    return microseconds.to_timestamp(unit="us").cast(utc_timestamp)
656+
657+
637658
@scalar_op_compiler.register_unary_op(ops.AsTypeOp, pass_op=True)
def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
    """Compile an ``AsTypeOp`` into an Ibis expression of the requested type.

    Date/time <-> int64 conversions are handled here; every other
    combination is delegated to ``bigframes.dtypes.cast_ibis_value``.

    Args:
        x: Ibis expression to cast.
        op: The astype operation; ``op.to_type`` is the target bigframes dtype.

    Returns:
        An Ibis expression of the Ibis dtype corresponding to ``op.to_type``.
    """
    to_type = bigframes.dtypes.bigframes_dtype_to_ibis_dtype(op.to_type)
    if isinstance(x, ibis_types.NullScalar):
        return ibis_types.null().cast(to_type)

    source_type = x.type()
    utc_timestamp = ibis_dtypes.Timestamp(timezone="UTC")

    if to_type == ibis_dtypes.int64:
        if source_type == ibis_dtypes.timestamp:
            # When casting DATETIME column into INT column, we need to
            # convert the column into TIMESTAMP first.
            as_utc = x.cast(utc_timestamp)
            return bigframes.dtypes.cast_ibis_value(as_utc, to_type)

        if source_type == ibis_dtypes.time:
            # The conversion unit is set to "us" (microseconds) for consistency
            # with pandas converting time64[us][pyarrow] to int64[pyarrow].
            return x.delta(ibis.time("00:00:00"), part="microsecond")

    if source_type == ibis_dtypes.int64:
        # The conversion unit is set to "us" (microseconds) for consistency
        # with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
        # timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
        from_micros = numeric_to_datatime(x, "us")
        if to_type == ibis_dtypes.timestamp:
            return from_micros.cast(ibis_dtypes.Timestamp())
        if to_type == utc_timestamp:
            return from_micros
        if to_type == ibis_dtypes.time:
            return from_micros.time()

    # No special case applied: use the generic cast.
    return bigframes.dtypes.cast_ibis_value(x, to_type)
643688

644689

@@ -677,19 +722,7 @@ def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
677722
# The default unit is set to "ns" (nanoseconds) for consistency
678723
# with pandas, where "ns" is the default unit for datetime operations.
679724
unit = op.unit or "ns"
680-
if unit not in UNIT_TO_US_CONVERSION_FACTORS:
681-
raise ValueError(f"Cannot convert input with unit '{unit}'.")
682-
x_converted = x * UNIT_TO_US_CONVERSION_FACTORS[unit]
683-
x_converted = x_converted.cast(ibis_dtypes.int64)
684-
685-
# Note: Due to an issue where casting directly to a timestamp
686-
# without a timezone does not work, we first cast to UTC. This
687-
# approach appears to bypass a potential bug in Ibis's cast function,
688-
# allowing for subsequent casting to a timestamp type without timezone
689-
# information. Further investigation is needed to confirm this behavior.
690-
x = x_converted.to_timestamp(unit="us").cast(
691-
ibis_dtypes.Timestamp(timezone="UTC")
692-
)
725+
x = numeric_to_datatime(x, unit)
693726

694727
return x.cast(ibis_dtypes.Timestamp(timezone="UTC" if op.utc else None))
695728

bigframes/dtypes.py

+39-9
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
"boolean",
6161
"Float64",
6262
"Int64",
63+
"int64[pyarrow]",
6364
"string",
6465
"string[pyarrow]",
6566
"timestamp[us, tz=UTC][pyarrow]",
@@ -173,6 +174,9 @@
173174
# "string" and "string[pyarrow]" are accepted
174175
BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow")
175176

177+
# special case - both "Int64" and "int64[pyarrow]" are accepted
178+
BIGFRAMES_STRING_TO_BIGFRAMES["int64[pyarrow]"] = pd.Int64Dtype()
179+
176180
# For the purposes of dataframe.memory_usage
177181
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
178182
DTYPE_BYTE_SIZES = {
@@ -310,11 +314,12 @@ def bigframes_dtype_to_ibis_dtype(
310314
textwrap.dedent(
311315
f"""
312316
Unexpected data type {bigframes_dtype}. The following
313-
str dtypes are supppted: 'boolean','Float64','Int64', 'string',
314-
'string[pyarrow]','timestamp[us, tz=UTC][pyarrow]',
315-
'timestamp[us][pyarrow]','date32[day][pyarrow]',
316-
'time64[us][pyarrow]'. The following pandas.ExtensionDtype are
317-
supported: pandas.BooleanDtype(), pandas.Float64Dtype(),
317+
str dtypes are supppted: 'boolean','Float64','Int64',
318+
'int64[pyarrow]','string','string[pyarrow]',
319+
'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
320+
'date32[day][pyarrow]','time64[us][pyarrow]'.
321+
The following pandas.ExtensionDtype are supported:
322+
pandas.BooleanDtype(), pandas.Float64Dtype(),
318323
pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
319324
pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
320325
pd.ArrowDtype(pa.timestamp("us")),
@@ -434,6 +439,9 @@ def cast_ibis_value(
434439
ibis_dtypes.string,
435440
ibis_dtypes.Decimal(precision=38, scale=9),
436441
ibis_dtypes.Decimal(precision=76, scale=38),
442+
ibis_dtypes.time,
443+
ibis_dtypes.timestamp,
444+
ibis_dtypes.Timestamp(timezone="UTC"),
437445
),
438446
ibis_dtypes.float64: (
439447
ibis_dtypes.string,
@@ -447,8 +455,15 @@ def cast_ibis_value(
447455
ibis_dtypes.Decimal(precision=38, scale=9),
448456
ibis_dtypes.Decimal(precision=76, scale=38),
449457
ibis_dtypes.binary,
458+
ibis_dtypes.date,
459+
ibis_dtypes.timestamp,
460+
ibis_dtypes.Timestamp(timezone="UTC"),
461+
),
462+
ibis_dtypes.date: (
463+
ibis_dtypes.string,
464+
ibis_dtypes.timestamp,
465+
ibis_dtypes.Timestamp(timezone="UTC"),
450466
),
451-
ibis_dtypes.date: (ibis_dtypes.string,),
452467
ibis_dtypes.Decimal(precision=38, scale=9): (
453468
ibis_dtypes.float64,
454469
ibis_dtypes.Decimal(precision=76, scale=38),
@@ -457,9 +472,24 @@ def cast_ibis_value(
457472
ibis_dtypes.float64,
458473
ibis_dtypes.Decimal(precision=38, scale=9),
459474
),
460-
ibis_dtypes.time: (),
461-
ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),),
462-
ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,),
475+
ibis_dtypes.time: (
476+
ibis_dtypes.int64,
477+
ibis_dtypes.string,
478+
),
479+
ibis_dtypes.timestamp: (
480+
ibis_dtypes.date,
481+
ibis_dtypes.int64,
482+
ibis_dtypes.string,
483+
ibis_dtypes.time,
484+
ibis_dtypes.Timestamp(timezone="UTC"),
485+
),
486+
ibis_dtypes.Timestamp(timezone="UTC"): (
487+
ibis_dtypes.date,
488+
ibis_dtypes.int64,
489+
ibis_dtypes.string,
490+
ibis_dtypes.time,
491+
ibis_dtypes.timestamp,
492+
),
463493
ibis_dtypes.binary: (ibis_dtypes.string,),
464494
}
465495

tests/system/small/test_series.py

+98
Original file line numberDiff line numberDiff line change
@@ -2625,6 +2625,9 @@ def foo(x):
26252625
("int64_col", "boolean"),
26262626
("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))),
26272627
("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))),
2628+
("int64_col", pd.ArrowDtype(pa.timestamp("us"))),
2629+
("int64_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
2630+
("int64_col", "time64[us][pyarrow]"),
26282631
("bool_col", "Int64"),
26292632
("bool_col", "string[pyarrow]"),
26302633
("string_col", "binary[pyarrow]"),
@@ -2633,9 +2636,17 @@ def foo(x):
26332636
# raises a deprecation warning to use tz_localize/tz_convert instead,
26342637
# but BigQuery always stores values as UTC and doesn't have to deal
26352638
# with timezone conversions, so we'll allow it.
2639+
("timestamp_col", "date32[day][pyarrow]"),
2640+
("timestamp_col", "time64[us][pyarrow]"),
26362641
("timestamp_col", pd.ArrowDtype(pa.timestamp("us"))),
2642+
("datetime_col", "date32[day][pyarrow]"),
2643+
("datetime_col", "string[pyarrow]"),
2644+
("datetime_col", "time64[us][pyarrow]"),
26372645
("datetime_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
26382646
("date_col", "string[pyarrow]"),
2647+
("date_col", pd.ArrowDtype(pa.timestamp("us"))),
2648+
("date_col", pd.ArrowDtype(pa.timestamp("us", tz="UTC"))),
2649+
("time_col", "string[pyarrow]"),
26392650
# TODO(bmil): fix Ibis bug: BigQuery backend rounds to nearest int
26402651
# ("float64_col", "Int64"),
26412652
# TODO(bmil): decide whether to fix Ibis bug: BigQuery backend
@@ -2653,6 +2664,24 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type):
26532664
pd.testing.assert_series_equal(bf_result, pd_result)
26542665

26552666

2667+
@pytest.mark.parametrize(
    ("column", "to_type"),
    [
        (source_column, "int64[pyarrow]")
        for source_column in ("timestamp_col", "datetime_col", "time_col")
    ],
)
@skip_legacy_pandas
def test_date_time_astype_int(
    scalars_df_index, scalars_pandas_df_index, column, to_type
):
    """Casting date/time columns to int64 matches pandas and yields Int64."""
    actual = scalars_df_index[column].astype(to_type).to_pandas()
    expected = scalars_pandas_df_index[column].astype(to_type)

    # Values are compared without dtypes; the bigframes dtype is pinned
    # separately by the assertion below.
    pd.testing.assert_series_equal(actual, expected, check_dtype=False)
    assert actual.dtype == "Int64"
2683+
2684+
26562685
def test_string_astype_int():
26572686
pd_series = pd.Series(["4", "-7", "0", " -03"])
26582687
bf_series = series.Series(pd_series)
@@ -2676,6 +2705,75 @@ def test_string_astype_float():
26762705
pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False)
26772706

26782707

2708+
def test_string_astype_date():
    """Casting date strings to date32[day][pyarrow] matches pandas."""
    target_dtype = "date32[day][pyarrow]"
    date_strings = ["2014-08-15", "2215-08-15", "2016-02-29"]

    pd_series = pd.Series(date_strings).astype(pd.ArrowDtype(pa.string()))
    bf_series = series.Series(pd_series)

    expected = pd_series.astype(target_dtype)
    actual = bf_series.astype(target_dtype).to_pandas()

    pd.testing.assert_series_equal(actual, expected, check_index_type=False)
2719+
2720+
2721+
def test_string_astype_datetime():
    """Casting datetime strings to timestamp[us][pyarrow] matches pandas."""
    datetime_strings = [
        "2014-08-15 08:15:12",
        "2015-08-15 08:15:12.654754",
        "2016-02-29 00:00:00",
    ]
    pd_series = pd.Series(datetime_strings).astype(pd.ArrowDtype(pa.string()))
    bf_series = series.Series(pd_series)

    expected = pd_series.astype(pd.ArrowDtype(pa.timestamp("us")))
    actual = bf_series.astype(pd.ArrowDtype(pa.timestamp("us"))).to_pandas()

    pd.testing.assert_series_equal(actual, expected, check_index_type=False)
2732+
2733+
2734+
def test_string_astype_timestamp():
    """Casting offset-bearing strings to a UTC timestamp matches pandas."""
    offset_strings = [
        "2014-08-15 08:15:12+00:00",
        "2015-08-15 08:15:12.654754+05:00",
        "2016-02-29 00:00:00+08:00",
    ]
    pd_series = pd.Series(offset_strings).astype(pd.ArrowDtype(pa.string()))
    bf_series = series.Series(pd_series)

    expected = pd_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC")))
    actual = bf_series.astype(pd.ArrowDtype(pa.timestamp("us", tz="UTC"))).to_pandas()

    pd.testing.assert_series_equal(actual, expected, check_index_type=False)
2751+
2752+
2753+
def test_timestamp_astype_string():
    """Casting UTC timestamps to string gives UTC-normalized "+00" text."""
    offset_strings = [
        "2014-08-15 08:15:12+00:00",
        "2015-08-15 08:15:12.654754+05:00",
        "2016-02-29 00:00:00+08:00",
    ]
    bf_series = series.Series(offset_strings).astype(
        pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
    )

    # Same instants as the inputs, shifted to UTC and suffixed with "+00".
    utc_strings = [
        "2014-08-15 08:15:12+00",
        "2015-08-15 03:15:12.654754+00",
        "2016-02-28 16:00:00+00",
    ]
    expected_result = pd.Series(utc_strings)

    actual = bf_series.astype(pa.string()).to_pandas()

    # The expected series has the default pandas dtype, so dtypes are not
    # compared here; the bigframes dtype is pinned by the assertion below.
    pd.testing.assert_series_equal(
        actual, expected_result, check_index_type=False, check_dtype=False
    )
    assert actual.dtype == "string[pyarrow]"
2775+
2776+
26792777
@pytest.mark.parametrize(
26802778
"index",
26812779
[0, 5, -2],

0 commit comments

Comments
 (0)