address comments

chelsea-lin · chelsea-lin · commit 9aef5a37e85e · 2024-04-03T21:02:24.000Z
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
@@ -402,17 +402,13 @@ def join(
         return ArrayValue(join_node)
 
     def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue:
-        column_ids = [
-            column_id
-            for column_id in column_ids
-            if bigframes.dtypes.is_array_like(self.get_column_type(column_id))
-        ]
-        if len(column_ids) == 0:
-            return ArrayValue(self.node)
-        else:
-            return ArrayValue(
-                nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
-            )
+        assert len(column_ids) > 0  # invariant: Block.explode skips empty lists
+        for column_id in column_ids:  # invariant: only array-like columns arrive
+            assert bigframes.dtypes.is_array_like(self.get_column_type(column_id))
+
+        return ArrayValue(  # wrap the child node in an ExplodeNode for these columns
+            nodes.ExplodeNode(child=self.node, column_ids=tuple(column_ids))
+        )
 
     def _uniform_sampling(self, fraction: float) -> ArrayValue:
         """Sampling the table on given fraction.
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -1167,11 +1167,21 @@ def explode(
         column_ids: typing.Sequence[str],
         ignore_index: Optional[bool],
     ) -> Block:
-        expr = self.expr.explode(column_ids)
+        column_ids = [
+            column_id
+            for column_id in column_ids
+            if bigframes.dtypes.is_array_like(self.expr.get_column_type(column_id))
+        ]
+        if len(column_ids) == 0:
+            expr = self.expr
+        else:
+            expr = self.expr.explode(column_ids)
+
         if ignore_index:
             return Block(
                 expr.drop_columns(self.index_columns),
                 column_labels=self.column_labels,
+                # Initiates default index creation using the block constructor.
                 index_columns=[],
             )
         else:
diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py
@@ -505,40 +505,42 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
 
     def explode(self, column_ids: typing.Sequence[str]) -> UnorderedIR:
         table = self._to_ibis_expr()
-        other_columns = [
-            column_id for column_id in self._column_names if column_id not in column_ids
-        ]
 
-        if len(column_ids) == 1:
-            unnested_column = table[column_ids[0]].unnest().name(column_ids[0])
-            table_w_unnest = table.select(
-                unnested_column,
-                *other_columns,
-            )
-        else:
-            zip_array_id = bigframes.core.guid.generate_guid("zip_array_")
-            zip_array = (
-                table[column_ids[0]]
-                .zip(*[table[column_id] for column_id in column_ids[1:]])
-                .name(zip_array_id)
-            )
-            table_w_zip_array = table.select(
-                zip_array,
-                *self._column_names,
+        # The offset array ensures null represents empty arrays after unnesting.
+        offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
+        offset_array = (
+            vendored_ibis_ops.GenerateArray(
+                ibis.greatest(
+                    0,
+                    ibis.least(
+                        *[table[column_id].length() - 1 for column_id in column_ids]
+                    ),
+                )
             )
+            .to_expr()
+            .name(offset_array_id),
+        )
+        table_w_offset_array = table.select(
+            offset_array,
+            *self._column_names,
+        )
 
-            unnest_array_id = bigframes.core.guid.generate_guid("unnest_array_")
-            unnest_array = (
-                table_w_zip_array[zip_array_id].unnest().name(unnest_array_id)
-            )
-            unnested_columns = [
-                unnest_array[f"f{index+1}"].name(column_id)
-                for index, column_id in zip(range(len(column_ids)), column_ids)
-            ]
-            table_w_unnest = table_w_zip_array.select(
-                *unnested_columns,
-                *other_columns,
-            )
+        unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
+        unnest_offset = (
+            table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
+        )
+        table_w_offset = table_w_offset_array.select(
+            unnest_offset,
+            *self._column_names,
+        )
+
+        unnested_columns = [
+            table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
+            if column_id in column_ids
+            else table_w_offset[column_id]
+            for column_id in self._column_names
+        ]
+        table_w_unnest = table_w_offset.select(*unnested_columns)
 
         columns = [table_w_unnest[column_name] for column_name in self._column_names]
         return UnorderedIR(
@@ -779,39 +781,32 @@ def explode(self, column_ids: typing.Sequence[str]) -> OrderedIR:
             .to_expr()
             .name(offset_array_id),
         )
-        table_w_offset = table.select(
+        table_w_offset_array = table.select(
             offset_array,
             *self._column_names,
             *self._hidden_ordering_column_names,
         )
 
-        zip_array_id = bigframes.core.guid.generate_guid("zip_array_")
-        zip_array = (
-            table_w_offset[offset_array_id]
-            .zip(*[table_w_offset[column_id] for column_id in column_ids])
-            .name(zip_array_id)
+        unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
+        unnest_offset = (
+            table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
         )
-        table_w_zip_array = table_w_offset.select(
-            zip_array,
+        table_w_offset = table_w_offset_array.select(
+            unnest_offset,
             *self._column_names,
             *self._hidden_ordering_column_names,
         )
 
-        unnest_array_id = bigframes.core.guid.generate_guid("unnest_array_")
-        unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_")
-
-        unnest_array = table_w_zip_array[zip_array_id].unnest().name(unnest_array_id)
         unnested_columns = [
-            unnest_array[f"f{index+2}"].name(column_id)
-            for index, column_id in zip(range(len(column_ids)), column_ids)
+            table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
+            if column_id in column_ids
+            else table_w_offset[column_id]
+            for column_id in self._column_names
         ]
-        other_columns = [
-            column_id for column_id in self._column_names if column_id not in column_ids
-        ]
-        table_w_unnest = table_w_zip_array.select(
-            unnest_array["f1"].name(unnest_offset_id),
+
+        table_w_unnest = table_w_offset.select(
+            table_w_offset[unnest_offset_id],
             *unnested_columns,
-            *other_columns,
             *self._hidden_ordering_column_names,
         )
 
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -4159,6 +4159,33 @@ def test_dataframe_explode(col_names, ignore_index):
     )
 
 
+@pytest.mark.parametrize(
+    ("ignore_index", "ordered"),
+    [
+        pytest.param(True, True, id="include_index_ordered"),
+        pytest.param(True, False, id="include_index_unordered"),
+        pytest.param(False, True, id="ignore_index_ordered"),
+    ],
+)
+def test_dataframe_explode_reserve_order(ignore_index, ordered):
+    """Exploding several columns together must match the pandas row order."""
+    data = {
+        "a": [np.random.randint(0, 10, 10) for _ in range(10)],
+        "b": [np.random.randint(0, 10, 10) for _ in range(10)],
+    }
+    df = bpd.DataFrame(data)
+    pd_df = pd.DataFrame(data)
+
+    res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered)
+    pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index)
+    pd_res = pd_res.astype(pd.Int64Dtype())
+    pd.testing.assert_frame_equal(
+        res if ordered else res.sort_index(),
+        pd_res,
+        check_index_type=False,
+    )
+
+
 @pytest.mark.parametrize(
     ("col_names"),
     [
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
@@ -3421,9 +3421,10 @@ def foo(x: int, y: int, df):
 @pytest.mark.parametrize(
     ("data"),
     [
-        pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int"),
+        pytest.param([1, 2, 3], id="int"),
+        pytest.param([[1, 2, 3], [], numpy.nan, [3, 4]], id="int_array"),
         pytest.param(
-            [["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string"
+            [["A", "AA", "AAA"], ["BB", "B"], numpy.nan, [], ["C"]], id="string_array"
         ),
         pytest.param(
             [
@@ -3433,7 +3434,7 @@ def foo(x: int, y: int, df):
                 {},
                 numpy.nan,
             ],
-            id="struct",
+            id="struct_array",
         ),
     ],
 )
@@ -3472,19 +3473,23 @@ def test_series_explode_w_index(index, ignore_index):
 
 
 @pytest.mark.parametrize(
-    ("ignore_index"),
+    ("ignore_index", "ordered"),
     [
-        pytest.param(True, id="include_index"),
-        pytest.param(False, id="ignore_index"),
+        pytest.param(True, True, id="include_index_ordered"),
+        pytest.param(True, False, id="include_index_unordered"),
+        pytest.param(False, True, id="ignore_index_ordered"),
     ],
 )
-def test_series_explode_reserve_order(ignore_index):
+def test_series_explode_reserve_order(ignore_index, ordered):
     data = [numpy.random.randint(0, 10, 10) for _ in range(10)]
     s = bigframes.pandas.Series(data)
     pd_s = pd.Series(data)
+
+    res = s.explode(ignore_index=ignore_index).to_pandas(ordered=ordered)
+    pd_res = pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype())
     pd.testing.assert_series_equal(
-        s.explode(ignore_index=ignore_index).to_pandas(),
-        pd_s.explode(ignore_index=ignore_index).astype(pd.Int64Dtype()),
+        res if ordered else res.sort_index(),
+        pd_res,
         check_index_type=False,
     )