15
15
"""Mappings for Pandas dtypes supported by BigQuery DataFrames package"""
16
16
17
17
import datetime
18
+ import decimal
18
19
import textwrap
19
20
import typing
20
21
from typing import Any , Dict , Iterable , Literal , Tuple , Union
30
31
31
32
import bigframes .constants as constants
32
33
import third_party .bigframes_vendored .google_cloud_bigquery ._pandas_helpers as gcb3p_pandas_helpers
34
+ import third_party .bigframes_vendored .ibis .expr .operations as vendored_ibis_ops
33
35
34
36
# Type hints for Pandas dtypes supported by BigQuery DataFrame
35
37
Dtype = Union [
40
42
pd .ArrowDtype ,
41
43
]
42
44
43
- # Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
44
- NUMERIC_BIGFRAMES_TYPES = [pd .BooleanDtype (), pd .Float64Dtype (), pd .Int64Dtype ()]
45
-
46
45
# On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable
47
46
UNORDERED_DTYPES = [gpd .array .GeometryDtype ()]
48
47
57
56
"timestamp[us][pyarrow]" ,
58
57
"date32[day][pyarrow]" ,
59
58
"time64[us][pyarrow]" ,
59
+ "decimal128(38, 9)[pyarrow]" ,
60
+ "decimal256(38, 9)[pyarrow]" ,
61
+ "binary[pyarrow]" ,
60
62
]
61
63
62
64
# Type hints for Ibis data types supported by BigQuery DataFrame
72
74
73
75
BOOL_BIGFRAMES_TYPES = [pd .BooleanDtype ()]
74
76
75
- # Several operations are restricted to these types.
76
- NUMERIC_BIGFRAMES_TYPES = [pd .BooleanDtype (), pd .Float64Dtype (), pd .Int64Dtype ()]
77
# Dtypes matching the pandas concept of a numeric type (such as when
# 'numeric_only' is specified in an operation).
# Pandas is inconsistent, so two definitions are provided, each used in
# different contexts.
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [pd.Float64Dtype(), pd.Int64Dtype()]
# The permissive list is the restrictive list plus boolean and the two
# fixed-precision decimal dtypes.
NUMERIC_BIGFRAMES_TYPES_PERMISSIVE = [
    *NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE,
    pd.BooleanDtype(),
    pd.ArrowDtype(pa.decimal128(38, 9)),
    pd.ArrowDtype(pa.decimal256(76, 38)),
]
77
88
78
89
# Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame
79
90
ReadOnlyIbisDtype = Union [
97
108
ibis_dtypes .Timestamp (timezone = "UTC" ),
98
109
pd .ArrowDtype (pa .timestamp ("us" , tz = "UTC" )),
99
110
),
111
+ (ibis_dtypes .binary , pd .ArrowDtype (pa .binary ())),
112
+ (
113
+ ibis_dtypes .Decimal (precision = 38 , scale = 9 , nullable = True ),
114
+ pd .ArrowDtype (pa .decimal128 (38 , 9 )),
115
+ ),
116
+ (
117
+ ibis_dtypes .Decimal (precision = 76 , scale = 38 , nullable = True ),
118
+ pd .ArrowDtype (pa .decimal256 (76 , 38 )),
119
+ ),
100
120
)
101
121
102
122
BIGFRAMES_TO_IBIS : Dict [Dtype , ibis_dtypes .DataType ] = {
112
132
ibis_dtypes .time : pa .time64 ("us" ),
113
133
ibis_dtypes .Timestamp (timezone = None ): pa .timestamp ("us" ),
114
134
ibis_dtypes .Timestamp (timezone = "UTC" ): pa .timestamp ("us" , tz = "UTC" ),
135
+ ibis_dtypes .binary : pa .binary (),
136
+ ibis_dtypes .Decimal (precision = 38 , scale = 9 , nullable = True ): pa .decimal128 (38 , 9 ),
137
+ ibis_dtypes .Decimal (precision = 76 , scale = 38 , nullable = True ): pa .decimal256 (76 , 38 ),
115
138
}
116
139
117
140
ARROW_TO_IBIS = {arrow : ibis for ibis , arrow in IBIS_TO_ARROW .items ()}
125
148
)
126
149
IBIS_TO_BIGFRAMES .update (
127
150
{
128
- ibis_dtypes .binary : np .dtype ("O" ),
129
- ibis_dtypes .json : np .dtype ("O" ),
130
- ibis_dtypes .Decimal (precision = 38 , scale = 9 , nullable = True ): np .dtype ("O" ),
131
- ibis_dtypes .Decimal (precision = 76 , scale = 38 , nullable = True ): np .dtype ("O" ),
132
151
ibis_dtypes .GeoSpatial (
133
152
geotype = "geography" , srid = 4326 , nullable = True
134
153
): gpd .array .GeometryDtype (),
@@ -178,7 +197,7 @@ def ibis_dtype_to_bigframes_dtype(
178
197
# our IO returns them as objects. Eventually, we should support them as
179
198
# ArrowDType (and update the IO accordingly)
180
199
if isinstance (ibis_dtype , ibis_dtypes .Array ):
181
- return np . dtype ( "O" )
200
+ return pd . ArrowDtype ( ibis_dtype_to_arrow_dtype ( ibis_dtype ) )
182
201
183
202
if isinstance (ibis_dtype , ibis_dtypes .Struct ):
184
203
return pd .ArrowDtype (ibis_dtype_to_arrow_dtype (ibis_dtype ))
@@ -200,7 +219,9 @@ def ibis_dtype_to_bigframes_dtype(
200
219
201
220
def ibis_dtype_to_arrow_dtype (ibis_dtype : ibis_dtypes .DataType ) -> pa .DataType :
202
221
if isinstance (ibis_dtype , ibis_dtypes .Array ):
203
- return pa .list_ (ibis_dtype_to_arrow_dtype (ibis_dtype .value_type ))
222
+ return pa .list_ (
223
+ ibis_dtype_to_arrow_dtype (ibis_dtype .value_type .copy (nullable = True ))
224
+ )
204
225
205
226
if isinstance (ibis_dtype , ibis_dtypes .Struct ):
206
227
return pa .struct (
@@ -224,21 +245,13 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
224
245
This is useful in cases where multiple types correspond to the same BigFrames dtype.
225
246
"""
226
247
ibis_type = value .type ()
248
+ name = value .get_name ()
249
+ if ibis_type .is_json ():
250
+ value = vendored_ibis_ops .ToJsonString (value ).to_expr ()
251
+ return value .name (name )
227
252
# Allow REQUIRED fields to be joined with NULLABLE fields.
228
253
nullable_type = ibis_type .copy (nullable = True )
229
- return value .cast (nullable_type ).name (value .get_name ())
230
-
231
-
232
- def ibis_table_to_canonical_types (table : ibis_types .Table ) -> ibis_types .Table :
233
- """Converts an Ibis table expression to canonical types.
234
-
235
- This is useful in cases where multiple types correspond to the same BigFrames dtype.
236
- """
237
- casted_columns = []
238
- for column_name in table .columns :
239
- column = typing .cast (ibis_types .Value , table [column_name ])
240
- casted_columns .append (ibis_value_to_canonical_type (column ))
241
- return table .select (* casted_columns )
254
+ return value .cast (nullable_type ).name (name )
242
255
243
256
244
257
def arrow_dtype_to_ibis_dtype (arrow_dtype : pa .DataType ) -> ibis_dtypes .DataType :
@@ -386,15 +399,35 @@ def cast_ibis_value(
386
399
ibis_dtypes .bool ,
387
400
ibis_dtypes .float64 ,
388
401
ibis_dtypes .string ,
402
+ ibis_dtypes .Decimal (precision = 38 , scale = 9 ),
403
+ ibis_dtypes .Decimal (precision = 76 , scale = 38 ),
404
+ ),
405
+ ibis_dtypes .float64 : (
406
+ ibis_dtypes .string ,
407
+ ibis_dtypes .int64 ,
408
+ ibis_dtypes .Decimal (precision = 38 , scale = 9 ),
409
+ ibis_dtypes .Decimal (precision = 76 , scale = 38 ),
410
+ ),
411
+ ibis_dtypes .string : (
412
+ ibis_dtypes .int64 ,
413
+ ibis_dtypes .float64 ,
414
+ ibis_dtypes .Decimal (precision = 38 , scale = 9 ),
415
+ ibis_dtypes .Decimal (precision = 76 , scale = 38 ),
416
+ ibis_dtypes .binary ,
389
417
),
390
- ibis_dtypes .float64 : (ibis_dtypes .string , ibis_dtypes .int64 ),
391
- ibis_dtypes .string : (ibis_dtypes .int64 , ibis_dtypes .float64 ),
392
418
ibis_dtypes .date : (ibis_dtypes .string ,),
393
- ibis_dtypes .Decimal (precision = 38 , scale = 9 ): (ibis_dtypes .float64 ,),
394
- ibis_dtypes .Decimal (precision = 76 , scale = 38 ): (ibis_dtypes .float64 ,),
419
+ ibis_dtypes .Decimal (precision = 38 , scale = 9 ): (
420
+ ibis_dtypes .float64 ,
421
+ ibis_dtypes .Decimal (precision = 76 , scale = 38 ),
422
+ ),
423
+ ibis_dtypes .Decimal (precision = 76 , scale = 38 ): (
424
+ ibis_dtypes .float64 ,
425
+ ibis_dtypes .Decimal (precision = 38 , scale = 9 ),
426
+ ),
395
427
ibis_dtypes .time : (),
396
428
ibis_dtypes .timestamp : (ibis_dtypes .Timestamp (timezone = "UTC" ),),
397
429
ibis_dtypes .Timestamp (timezone = "UTC" ): (ibis_dtypes .timestamp ,),
430
+ ibis_dtypes .binary : (ibis_dtypes .string ,),
398
431
}
399
432
400
433
value = ibis_value_to_canonical_type (value )
@@ -458,30 +491,62 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool:
458
491
return False
459
492
460
493
494
+ # string is binary
461
495
def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool:
    """Report whether a Python scalar is an instance of the given pyarrow type.

    Args:
        scalar: The Python value to inspect.
        pa_type: The pyarrow data type to test against.

    Returns:
        True when the scalar's Python type (and, for timestamps, its timezone
        information) matches ``pa_type``; False otherwise, including for any
        pyarrow type that is not handled here.
    """
    if pa_type == pa.time64("us"):
        return isinstance(scalar, datetime.time)

    if pa_type == pa.timestamp("us"):
        # Timezone-naive timestamp: the scalar must carry no tzinfo.
        datetime_like = isinstance(scalar, (datetime.datetime, pd.Timestamp))
        return datetime_like and not scalar.tzinfo

    if pa_type == pa.timestamp("us", tz="UTC"):
        # UTC timestamp: the scalar's tzinfo must compare equal to UTC.
        datetime_like = isinstance(scalar, (datetime.datetime, pd.Timestamp))
        return datetime_like and scalar.tzinfo == datetime.timezone.utc

    if pa_type == pa.date32():
        return isinstance(scalar, datetime.date)

    if pa_type == pa.binary():
        return isinstance(scalar, bytes)

    if pa_type == pa.decimal128(38, 9) or pa_type == pa.decimal256(76, 38):
        # decimal.Decimal is a superset, but ibis performs out-of-bounds and
        # loss-of-precision checks
        return isinstance(scalar, decimal.Decimal)

    return False
478
520
479
521
480
def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]:
    """Determine whether a scalar can be compared to items of ``dtype``.

    The comparison may require coercing one side to a wider type.

    Args:
        scalar: The Python value to be compared against a column.
        dtype: The dtype of the items the scalar would be compared with.

    Returns:
        The dtype that must be used for the comparison, or None when the
        scalar cannot be compared to items of ``dtype``.
    """
    if is_dtype(scalar, dtype):
        return dtype

    # Implicit conversion currently only supported for numeric types
    if pd.api.types.is_numeric_dtype(dtype):
        if pd.api.types.is_bool(scalar):
            return lcd_type(pd.BooleanDtype(), dtype)
        if pd.api.types.is_float(scalar):
            return lcd_type(pd.Float64Dtype(), dtype)
        if pd.api.types.is_integer(scalar):
            return lcd_type(pd.Int64Dtype(), dtype)
        if isinstance(scalar, decimal.Decimal):
            # TODO: Check context to see if can use NUMERIC instead of BIGNUMERIC
            # BIGNUMERIC is decimal256(76, 38). pyarrow's decimal128 allows a
            # precision of at most 38, so it cannot represent this type.
            return lcd_type(pd.ArrowDtype(pa.decimal256(76, 38)), dtype)

    return None
538
+
539
+
540
def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]:
    """Pick the wider of two numeric dtypes according to a fixed ordering.

    Args:
        dtype1: First dtype.
        dtype2: Second dtype.

    Returns:
        Whichever of the two dtypes appears later in the promotion order
        below, or None when either dtype is not part of that order.
    """
    # Implicit conversion currently only supported for numeric types.
    # Ordered from narrowest to widest.
    promotion_order: list[Dtype] = [
        pd.BooleanDtype(),
        pd.Int64Dtype(),
        pd.Float64Dtype(),
        pd.ArrowDtype(pa.decimal128(38, 9)),
        pd.ArrowDtype(pa.decimal256(76, 38)),
    ]
    both_known = dtype1 in promotion_order and dtype2 in promotion_order
    if not both_known:
        return None

    rank1 = promotion_order.index(dtype1)
    rank2 = promotion_order.index(dtype2)
    widest_rank = rank1 if rank1 >= rank2 else rank2
    return promotion_order[widest_rank]
0 commit comments