Skip to content

Commit c2b91e2

Browse files
committedJul 11, 2019
try adding frame
1 parent 2b9e2e6 commit c2b91e2

File tree

5 files changed

+181
-10
lines changed

5 files changed

+181
-10
lines changed
 

‎doc/source/reference/frame.rst

+1
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ Reshaping, sorting, transposing
239239
DataFrame.unstack
240240
DataFrame.swapaxes
241241
DataFrame.melt
242+
DataFrame.explode
242243
DataFrame.squeeze
243244
DataFrame.to_xarray
244245
DataFrame.T

‎pandas/core/frame.py

+73-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import itertools
1616
import sys
1717
from textwrap import dedent
18-
from typing import FrozenSet, List, Optional, Set, Type, Union
18+
from typing import FrozenSet, Iterable, List, Optional, Set, Type, Union
1919
import warnings
2020

2121
import numpy as np
@@ -6252,6 +6252,78 @@ def stack(self, level=-1, dropna=True):
62526252
else:
62536253
return stack(self, level, dropna=dropna)
62546254

6255+
def explode(self, subset: Iterable) -> "DataFrame":
    """
    Create a new DataFrame, expanding list-like columns into rows.

    .. versionadded:: 0.25.0

    Parameters
    ----------
    subset : list-like
        Labels of the columns to explode. The labels must be unique.

    Returns
    -------
    DataFrame
        Frame in which every element of the list-likes in the ``subset``
        columns has its own row; index labels are repeated for these rows.

    Raises
    ------
    ValueError
        If ``subset`` is not list-like, if ``subset`` contains duplicate
        labels, or if the columns of the frame are not unique.

    See Also
    --------
    Series.str.split : Split string values on specified separator.
    Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to
        produce DataFrame.
    DataFrame.melt : Unpivot a DataFrame from wide format to long format.
    Series.explode : Explode a Series of list-likes to long format.

    Notes
    -----
    This routine will explode list-likes including lists, tuples, Series,
    and np.ndarray.
    The result dtype of the subset rows will be object.
    Scalars will be returned unchanged.
    Empty list-likes will result in a np.nan for that row.

    Each column in ``subset`` is exploded on its own and the pieces are
    joined back row by row, so exploding several columns yields, for each
    original row, every combination of their elements.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
    >>> df.explode(['A'])
         A  B
    0    1  1
    0    2  1
    0    3  1
    1  foo  1
    2  NaN  1
    3    3  1
    3    4  1
    """

    if not is_list_like(subset):
        raise ValueError("subset must be a list-like")
    # ``subset`` is traversed several times below; materialize it so that a
    # one-shot iterator is not exhausted by the uniqueness check.
    subset = list(subset)
    if not Index(subset).is_unique:
        raise ValueError("subset must be unique")
    if not self.columns.is_unique:
        raise ValueError("columns must be unique")

    # Join on row positions instead of the original labels: if the index
    # already holds duplicate labels, joining on them would pair every
    # exploded value with every row sharing that label.
    positional = self.reset_index(drop=True)
    pieces = [positional.drop(subset, axis=1)]
    pieces.extend(positional[label].explode() for label in subset)

    # recursive merge
    from pandas.core.reshape.merge import merge

    def merger(left, right):
        return merge(left, right, left_index=True, right_index=True)

    result = functools.reduce(merger, pieces)

    # The merged index holds the originating row position of every output
    # row; translate those positions back into the original labels.
    positions = np.asarray(result.index, dtype=np.intp)
    result.index = self.index.take(positions)
    return result.reindex(columns=self.columns, copy=False)
6326+
62556327
def unstack(self, level=-1, fill_value=None):
62566328
"""
62576329
Pivot a level of the (necessarily hierarchical) index labels, returning

‎pandas/core/series.py

+1
Original file line numberDiff line numberDiff line change
@@ -3654,6 +3654,7 @@ def explode(self) -> "Series":
36543654
Series.str.split : Split string values on specified separator.
36553655
Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
36563656
DataFrame.melt : Unpivot a DataFrame from wide format to long format
3657+
DataFrame.explode : Explode a DataFrame from list-like columns to long format.
36573658
36583659
Notes
36593660
-----

‎pandas/tests/frame/test_explode.py

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import numpy as np
2+
import pytest
3+
4+
import pandas as pd
5+
from pandas.util import testing as tm
6+
7+
8+
def test_error():
    """Exploding with duplicated column labels and subset raises ValueError."""
    nested = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    frame = pd.DataFrame({"A": nested, "B": 1})
    # Rename both columns to "A" so the labels are no longer unique.
    frame.columns = list("AA")
    with pytest.raises(ValueError):
        frame.explode(subset=list("AA"))
15+
16+
17+
def test_basic():
    """Explode one list-like column; the scalar column is carried along."""
    nested = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    frame = pd.DataFrame({"A": nested, "B": 1})

    result = frame.explode(subset=["A"])

    # NaN and the empty list each become a single NaN row.
    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )
    expected = pd.DataFrame({"A": exploded, "B": 1})
    tm.assert_frame_equal(result, expected)
31+
32+
33+
def test_all_columns():
    """Explode with every column named in ``subset``, including a scalar one."""
    nested = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    frame = pd.DataFrame({"A": nested, "B": 1})

    result = frame.explode(subset=["A", "B"])

    # Column "B" holds only scalars, so its values are expected unchanged.
    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )
    expected = pd.DataFrame({"A": exploded, "B": 1})
    tm.assert_frame_equal(result, expected)
47+
48+
49+
def test_multiple_columns():
    """Explode two list-like columns at once."""
    labels = list("abcd")
    col_a = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=labels)
    col_b = pd.Series([[0, 1, 2], np.nan, np.nan, 3], index=labels)
    frame = pd.DataFrame({"A": col_a, "B": col_b})

    result = frame.explode(subset=["A", "B"])

    # Row "a" has three elements in each column, giving nine combinations.
    expected_values = {
        "A": [0, 0, 0, 1, 1, 1, 2, 2, 2, np.nan, np.nan, 3, 4],
        "B": [0, 1, 2, 0, 1, 2, 0, 1, 2, np.nan, np.nan, 3, 3],
    }
    expected = pd.DataFrame(
        expected_values, dtype=object, index=list("aaaaaaaaabcdd")
    )
    tm.assert_frame_equal(result, expected)
66+
67+
68+
def test_usecase():
    """Check the use cases referenced as gh-10511 and gh-8517."""
    # gh-10511: explode a single column of a frame indexed by column "C"
    rows = [[11, range(5), 10], [22, range(3), 20]]
    frame = pd.DataFrame(rows, columns=["A", "B", "C"]).set_index("C")
    result = frame.explode(["B"])

    expected_values = {
        "A": [11, 11, 11, 11, 11, 22, 22, 22],
        "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
        "C": [10, 10, 10, 10, 10, 20, 20, 20],
    }
    expected = pd.DataFrame(expected_values, columns=list("ABC")).set_index("C")
    tm.assert_frame_equal(result, expected)

    # gh-8517: split a text column on spaces, then explode the pieces
    names = ["dt", "name", "text"]
    frame = pd.DataFrame(
        [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
        columns=names,
    )
    tokens = frame.text.str.split(" ")
    result = frame.assign(text=tokens).explode(["text"])

    expected_rows = [
        ["2014-01-01", "Alice", "A"],
        ["2014-01-01", "Alice", "B"],
        ["2014-01-02", "Bob", "C"],
        ["2014-01-02", "Bob", "D"],
    ]
    expected = pd.DataFrame(expected_rows, columns=names, index=[0, 0, 1, 1])
    tm.assert_frame_equal(result, expected)

‎pandas/tests/series/test_explode.py

+3-9
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,10 @@
66

77

88
def test_basic():
9-
s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)],
10-
index=list('abcd'),
11-
name="foo")
9+
s = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"), name="foo")
1210
result = s.explode()
1311
expected = pd.Series(
14-
[0, 1, 2, np.nan, np.nan, 3, 4],
15-
index=list('aaabcdd'),
16-
dtype=object,
17-
name="foo",
12+
[0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object, name="foo"
1813
)
1914
tm.assert_series_equal(result, expected)
2015

@@ -43,8 +38,7 @@ def test_empty():
4338
def test_nested_lists():
4439
s = pd.Series([[[1, 2, 3]], [1, 2], 1])
4540
result = s.explode()
46-
expected = pd.Series([[1, 2, 3], 1, 2, 1],
47-
index=[0, 1, 1, 2])
41+
expected = pd.Series([[1, 2, 3], 1, 2, 1], index=[0, 1, 1, 2])
4842
tm.assert_series_equal(result, expected)
4943

5044

0 commit comments

Comments
 (0)