Skip to content

Commit 9aef5a3

Browse files
committed
address comments
1 parent e0a90eb commit 9aef5a3

File tree

5 files changed

+105
-72
lines changed

5 files changed

+105
-72
lines changed

bigframes/core/__init__.py

+7-11
Original file line numberDiff line numberDiff line change
@@ -402,17 +402,13 @@ def join(
402402
return ArrayValue(join_node)
403403

404404
def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
405-
column_ids = [
406-
column_id
407-
for column_id in column_ids
408-
if bigframes.dtypes.is_array_like(self.get_column_type(column_id))
409-
]
410-
if len(column_ids) == 0:
411-
return ArrayValue(self.node)
412-
else:
413-
return ArrayValue(
414-
nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
415-
)
405+
assert len(column_ids) > 0
406+
for column_id in column_ids:
407+
assert bigframes.dtypes.is_array_like(self.get_column_type(column_id))
408+
409+
return ArrayValue(
410+
nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
411+
)
416412

417413
def _uniform_sampling(self, fraction: float) -> ArrayValue:
418414
"""Sampling the table on given fraction.

bigframes/core/blocks.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -1167,11 +1167,21 @@ def explode(
11671167
column_ids: typing.Sequence[str],
11681168
ignore_index: Optional[bool],
11691169
) -> Block:
1170-
expr = self.expr.explode(column_ids)
1170+
column_ids = [
1171+
column_id
1172+
for column_id in column_ids
1173+
if bigframes.dtypes.is_array_like(self.expr.get_column_type(column_id))
1174+
]
1175+
if len(column_ids) == 0:
1176+
expr = self.expr
1177+
else:
1178+
expr = self.expr.explode(column_ids)
1179+
11711180
if ignore_index:
11721181
return Block(
11731182
expr.drop_columns(self.index_columns),
11741183
column_labels=self.column_labels,
1184+
# Passing no index columns makes the Block constructor create a default index.
11751185
index_columns=[],
11761186
)
11771187
else:

bigframes/core/compile/compiled.py

+46-51
Original file line numberDiff line numberDiff line change
@@ -505,40 +505,42 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
505505

506506
def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
507507
table = self._to_ibis_expr()
508-
other_columns = [
509-
column_id for column_id in self._column_names if column_id not in column_ids
510-
]
511508

512-
if len(column_ids) == 1:
513-
unnested_column = table[column_ids[0]].unnest().name(column_ids[0])
514-
table_w_unnest = table.select(
515-
unnested_column,
516-
*other_columns,
517-
)
518-
else:
519-
zip_array_id = bigframes.core.guid.generate_guid("zip_array_")
520-
zip_array = (
521-
table[column_ids[0]]
522-
.zip(*[table[column_id] for column_id in column_ids[1:]])
523-
.name(zip_array_id)
524-
)
525-
table_w_zip_array = table.select(
526-
zip_array,
527-
*self._column_names,
509+
# greatest(0, ...) keeps at least one offset per row, so an empty array still yields one row, whose value is null after unnesting.
510+
offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
511+
offset_array = (
512+
vendored_ibis_ops.GenerateArray(
513+
ibis.greatest(
514+
0,
515+
ibis.least(
516+
*[table[column_id].length() - 1 for column_id in column_ids]
517+
),
518+
)
528519
)
520+
.to_expr()
521+
.name(offset_array_id),
522+
)
523+
table_w_offset_array = table.select(
524+
offset_array,
525+
*self._column_names,
526+
)
529527

530-
unnest_array_id = bigframes.core.guid.generate_guid("unnest_array_")
531-
unnest_array = (
532-
table_w_zip_array[zip_array_id].unnest().name(unnest_array_id)
533-
)
534-
unnested_columns = [
535-
unnest_array[f"f{index+1}"].name(column_id)
536-
for index, column_id in zip(range(len(column_ids)), column_ids)
537-
]
538-
table_w_unnest = table_w_zip_array.select(
539-
*unnested_columns,
540-
*other_columns,
541-
)
528+
unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
529+
unnest_offset = (
530+
table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
531+
)
532+
table_w_offset = table_w_offset_array.select(
533+
unnest_offset,
534+
*self._column_names,
535+
)
536+
537+
unnested_columns = [
538+
table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
539+
if column_id in column_ids
540+
else table_w_offset[column_id]
541+
for column_id in self._column_names
542+
]
543+
table_w_unnest = table_w_offset.select(*unnested_columns)
542544

543545
columns = [table_w_unnest[column_name] for column_name in self._column_names]
544546
return UnorderedIR(
@@ -779,39 +781,32 @@ def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
779781
.to_expr()
780782
.name(offset_array_id),
781783
)
782-
table_w_offset = table.select(
784+
table_w_offset_array = table.select(
783785
offset_array,
784786
*self._column_names,
785787
*self._hidden_ordering_column_names,
786788
)
787789

788-
zip_array_id = bigframes.core.guid.generate_guid("zip_array_")
789-
zip_array = (
790-
table_w_offset[offset_array_id]
791-
.zip(*[table_w_offset[column_id] for column_id in column_ids])
792-
.name(zip_array_id)
790+
unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
791+
unnest_offset = (
792+
table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
793793
)
794-
table_w_zip_array = table_w_offset.select(
795-
zip_array,
794+
table_w_offset = table_w_offset_array.select(
795+
unnest_offset,
796796
*self._column_names,
797797
*self._hidden_ordering_column_names,
798798
)
799799

800-
unnest_array_id = bigframes.core.guid.generate_guid("unnest_array_")
801-
unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
802-
803-
unnest_array = table_w_zip_array[zip_array_id].unnest().name(unnest_array_id)
804800
unnested_columns = [
805-
unnest_array[f"f{index+2}"].name(column_id)
806-
for index, column_id in zip(range(len(column_ids)), column_ids)
801+
table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
802+
if column_id in column_ids
803+
else table_w_offset[column_id]
804+
for column_id in self._column_names
807805
]
808-
other_columns = [
809-
column_id for column_id in self._column_names if column_id not in column_ids
810-
]
811-
table_w_unnest = table_w_zip_array.select(
812-
unnest_array["f1"].name(unnest_offset_id),
806+
807+
table_w_unnest = table_w_offset.select(
808+
table_w_offset[unnest_offset_id],
813809
*unnested_columns,
814-
*other_columns,
815810
*self._hidden_ordering_column_names,
816811
)
817812

tests/system/small/test_dataframe.py

+27
Original file line numberDiff line numberDiff line change
@@ -4159,6 +4159,33 @@ def test_dataframe_explode(col_names, ignore_index):
41594159
)
41604160

41614161

4162+
@pytest.mark.parametrize(
4163+
("ignore_index", "ordered"),
4164+
[
4165+
pytest.param(True, True, id="ignore_index_ordered"),
4166+
pytest.param(True, False, id="ignore_index_unordered"),
4167+
pytest.param(False, True, id="include_index_ordered"),
4168+
],
4169+
)
4170+
def test_dataframe_explode_reserve_order(ignore_index, ordered):
4171+
data = {
4172+
"a": [np.random.randint(0, 10, 10) for _ in range(10)],
4173+
"b": [np.random.randint(0, 10, 10) for _ in range(10)],
4174+
}
4175+
df = bpd.DataFrame(data)
4176+
pd_df = pd.DataFrame(data)
4177+
4178+
res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered)
4179+
pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype(
4180+
pd.Int64Dtype()
4181+
)
4182+
pd.testing.assert_frame_equal(
4183+
res if ordered else res.sort_index(),
4184+
pd_res,
4185+
check_index_type=False,
4186+
)
4187+
4188+
41624189
@pytest.mark.parametrize(
41634190
("col_names"),
41644191
[

tests/system/small/test_series.py

+14-9
Original file line numberDiff line numberDiff line change
@@ -3421,9 +3421,10 @@ def foo(x: int, y: int, df):
34213421
@pytest.mark.parametrize(
34223422
("data"),
34233423
[
3424-
pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int"),
3424+
pytest.param([1, 2, 3], id="int"),
3425+
pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int_array"),
34253426
pytest.param(
3426-
[["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string"
3427+
[["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string_array"
34273428
),
34283429
pytest.param(
34293430
[
@@ -3433,7 +3434,7 @@ def foo(x: int, y: int, df):
34333434
{},
34343435
numpy.nan,
34353436
],
3436-
id="struct",
3437+
id="struct_array",
34373438
),
34383439
],
34393440
)
@@ -3472,19 +3473,23 @@ def test_series_explode_w_index(index, ignore_index):
34723473

34733474

34743475
@pytest.mark.parametrize(
3475-
("ignore_index"),
3476+
("ignore_index", "ordered"),
34763477
[
3477-
pytest.param(True, id="include_index"),
3478-
pytest.param(False, id="ignore_index"),
3478+
pytest.param(True, True, id="ignore_index_ordered"),
3479+
pytest.param(True, False, id="ignore_index_unordered"),
3480+
pytest.param(False, True, id="include_index_ordered"),
34793481
],
34803482
)
3481-
def test_series_explode_reserve_order(ignore_index):
3483+
def test_series_explode_reserve_order(ignore_index, ordered):
34823484
data = [numpy.random.randint(0, 10, 10) for _ in range(10)]
34833485
s = bigframes.pandas.Series(data)
34843486
pd_s = pd.Series(data)
3487+
3488+
res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered)
3489+
pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype())
34853490
pd.testing.assert_series_equal(
3486-
s.explode(ignore_index=ignore_index).to_pandas(),
3487-
pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()),
3491+
res if ordered else res.sort_index(),
3492+
pd_res,
34883493
check_index_type=False,
34893494
)
34903495

0 commit comments

Comments
 (0)