Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add Series method to explode a list-like column #27267

Merged
merged 34 commits into from
Jul 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
ed949ae
[ENH] Add DataFrame method to explode a list-like column (GH #16538)
changhiskhan Dec 20, 2018
959ed4c
move to Series
jreback Jul 6, 2019
1e4b4bd
handle generic list-like
jreback Jul 6, 2019
ef9eae8
lint on asv
jreback Jul 6, 2019
790f06e
move is_list_like to cython and share impl
jreback Jul 6, 2019
d8e4801
moar docs
jreback Jul 6, 2019
f055b48
test larger sides to avoid a segfault
jreback Jul 6, 2019
bd854ca
fix ref
jreback Jul 6, 2019
774d285
typos
jreback Jul 6, 2019
854a2af
benchmarks wrong
jreback Jul 6, 2019
c4d1bd3
add inversion
jreback Jul 6, 2019
01edb45
add usecase
jreback Jul 6, 2019
af05813
cimport is_list_like
jreback Jul 6, 2019
84e1996
use cimports
jreback Jul 6, 2019
040ba2e
doc-string
jreback Jul 6, 2019
93b1fd6
docs & lint
jreback Jul 6, 2019
fdafb0c
isort
jreback Jul 6, 2019
b47e68d
clean object check & update doc-strings
jreback Jul 7, 2019
9856732
lint
jreback Jul 8, 2019
0fa9c0d
test for nested
jreback Jul 9, 2019
c426627
better test
jreback Jul 9, 2019
9d20b61
try adding frame
jreback Jul 9, 2019
5512e29
test for nested EA
jreback Jul 11, 2019
dc17ef6
lint
jreback Jul 11, 2019
94f319e
remove multi subset support
jreback Jul 11, 2019
b9d8a42
update docs
jreback Jul 11, 2019
df0fccf
doc-string
jreback Jul 11, 2019
720e309
add test for MI
jreback Jul 11, 2019
26b91c4
lint and docs
jreback Jul 11, 2019
d51aa47
ordering
jreback Jul 11, 2019
f9f82fc
moar lint
jreback Jul 11, 2019
5c4635f
multi-index column support
jreback Jul 17, 2019
7fc2159
32-bit compat
jreback Jul 17, 2019
4e7755b
moar 32-bit compat
jreback Jul 17, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
.PHONY : develop build clean clean_pyc doc lint-diff black

all: develop

clean:
-python setup.py clean

Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
pass


class DoesStringLookLikeDatetime(object):
class DoesStringLookLikeDatetime:

params = (["2Q2005", "0.0", "10000"],)
param_names = ["value"]
Expand All @@ -23,7 +23,7 @@ def time_check_datetimes(self, value):
_does_string_look_like_datetime(obj)


class ConcatDateCols(object):
class ConcatDateCols:

params = ([1234567890, "AAAA"], [1, 2])
param_names = ["value", "dim"]
Expand Down
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,4 +240,17 @@ def time_qcut_datetime(self, bins):
pd.qcut(self.datetime_series, bins)


class Explode:
    """ASV benchmark for ``Series.explode`` on a column of ragged arrays."""

    param_names = ["n_rows", "max_list_length"]
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        # One row per n_rows, each holding an integer array whose length
        # is drawn uniformly from [0, max_list_length).
        lengths = (np.random.randint(max_list_length) for _ in range(n_rows))
        self.series = pd.Series([np.arange(k) for k in lengths])

    def time_explode(self, n_rows, max_list_length):
        self.series.explode()


from .pandas_vb_common import setup # noqa: F401
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def time_series_datetimeindex_repr(self):
getattr(self.s, "a", None)


class All(object):
class All:

params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
param_names = ["N", "case"]
Expand All @@ -232,7 +232,7 @@ def time_all(self, N, case):
self.s.all()


class Any(object):
class Any:

params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
param_names = ["N", "case"]
Expand All @@ -245,7 +245,7 @@ def time_any(self, N, case):
self.s.any()


class NanOps(object):
class NanOps:

params = [
[
Expand Down
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def time_format_YYYYMMDD(self):
to_datetime(self.stringsD, format="%Y%m%d")


class ToDatetimeCacheSmallCount(object):
class ToDatetimeCacheSmallCount:

params = ([True, False], [50, 500, 5000, 100000])
param_names = ["cache", "count"]
Expand Down
2 changes: 1 addition & 1 deletion ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for python2 new-style classes and for empty parentheses' ; echo $MSG
invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas scripts
invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas asv_bench/benchmarks scripts
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ Reshaping, sorting, transposing
DataFrame.unstack
DataFrame.swapaxes
DataFrame.melt
DataFrame.explode
DataFrame.squeeze
DataFrame.to_xarray
DataFrame.T
Expand Down
2 changes: 1 addition & 1 deletion doc/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ Reshaping, sorting
Series.sort_index
Series.swaplevel
Series.unstack
Series.explode
Series.searchsorted
Series.ravel
Series.repeat
Expand Down Expand Up @@ -590,4 +591,3 @@ Sparse

SparseSeries.to_coo
SparseSeries.from_coo

50 changes: 50 additions & 0 deletions doc/source/user_guide/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -801,3 +801,53 @@ Note to subdivide over multiple columns we can pass in a list to the

df.pivot_table(
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])

.. _reshaping.explode:

Exploding a list-like column
----------------------------

.. versionadded:: 0.25.0

Sometimes the values in a column are list-like.

.. ipython:: python

keys = ['panda1', 'panda2', 'panda3']
values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
df = pd.DataFrame({'keys': keys, 'values': values})
df

We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row:

.. ipython:: python

df['values'].explode()

You can also explode the column in the ``DataFrame``.

.. ipython:: python

df.explode('values')

:meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other option for empty lists would be to have no entry in the result?


.. ipython:: python

s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']])
s
s.explode()

Here is a typical use case. You have comma-separated strings in a column and want to expand this.

.. ipython:: python

df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
{'var1': 'd,e,f', 'var2': 2}])
df

Creating a long form DataFrame is now straightforward using explode and chained operations

.. ipython:: python

df.assign(var1=df.var1.str.split(',')).explode('var1')
22 changes: 22 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,28 @@ The repr now looks like this:
json_normalize(data, max_level=1)


.. _whatsnew_0250.enhancements.explode:

Series.explode to split list-like values to rows
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:class:`Series` and :class:`DataFrame` have gained the :meth:`Series.explode` and :meth:`DataFrame.explode` methods to transform list-likes to individual rows. See :ref:`section on Exploding list-like column <reshaping.explode>` in docs for more information (:issue:`16538`, :issue:`10511`)


Here is a typical use case. You have comma-separated strings in a column.

.. ipython:: python

df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
{'var1': 'd,e,f', 'var2': 2}])
df

Creating a long form ``DataFrame`` is now straightforward using chained operations

.. ipython:: python

df.assign(var1=df.var1.str.split(',')).explode('var1')

.. _whatsnew_0250.enhancements.other:

Other enhancements
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/lib.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cdef bint c_is_list_like(object, bint)
55 changes: 55 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import abc
from decimal import Decimal
from fractions import Fraction
from numbers import Number
Expand Down Expand Up @@ -886,6 +887,60 @@ def is_period(val: object) -> bool:
return util.is_period_object(val)


def is_list_like(obj: object, allow_sets: bool = True):
    """
    Check if the object is list-like.

    Objects that are considered list-like are for example Python
    lists, tuples, sets, NumPy arrays, and Pandas Series.

    Strings and datetime objects, however, are not considered list-like.

    Parameters
    ----------
    obj : The object to check
    allow_sets : boolean, default True
        If this parameter is False, sets will not be considered list-like

        .. versionadded:: 0.24.0

    Returns
    -------
    is_list_like : bool
        Whether `obj` has list-like properties.

    Examples
    --------
    >>> is_list_like([1, 2, 3])
    True
    >>> is_list_like({1, 2, 3})
    True
    >>> is_list_like(datetime(2017, 1, 1))
    False
    >>> is_list_like("foo")
    False
    >>> is_list_like(1)
    False
    >>> is_list_like(np.array([2]))
    True
    >>> is_list_like(np.array(2))
    False
    """
    # Thin Python-visible wrapper; the actual check lives in the
    # C-level helper so other Cython modules can share it cheaply.
    return c_is_list_like(obj, allow_sets)


# Shared C-level implementation of ``is_list_like``.  Declared in
# lib.pxd so other Cython modules (e.g. reshape.pyx) can cimport and
# call it without Python-call overhead.
cdef inline bint c_is_list_like(object obj, bint allow_sets):
    return (
        isinstance(obj, abc.Iterable)
        # we do not count strings/unicode/bytes as list-like
        and not isinstance(obj, (str, bytes))
        # exclude zero-dimensional numpy arrays, effectively scalars
        and not (util.is_array(obj) and obj.ndim == 0)
        # exclude sets if allow_sets is False
        and not (allow_sets is False and isinstance(obj, abc.Set))
    )


_TYPE_MAP = {
'categorical': 'categorical',
'category': 'categorical',
Expand Down
63 changes: 61 additions & 2 deletions pandas/_libs/reshape.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ import cython
from cython import Py_ssize_t

from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float32_t, float64_t)

uint32_t, uint64_t, float32_t, float64_t, ndarray)
cimport numpy as cnp
import numpy as np
from pandas._libs.lib cimport c_is_list_like
cnp.import_array()

ctypedef fused reshape_t:
uint8_t
Expand Down Expand Up @@ -91,3 +94,59 @@ unstack_int64 = unstack["int64_t"]
unstack_float32 = unstack["float32_t"]
unstack_float64 = unstack["float64_t"]
unstack_object = unstack["object"]


@cython.wraparound(False)
@cython.boundscheck(False)
def explode(ndarray[object] values):
    """
    Transform an array of list-likes to long (exploded) form.

    Each list-like element is expanded to one output position per item;
    non-list-like (scalar) entries are preserved as-is; an empty
    list-like is replaced by a single NaN marker, so every input element
    contributes at least one output position.  Sets are treated as
    scalars here (``allow_sets=False`` in the list-like check), so they
    are preserved rather than exploded.

    Parameters
    ----------
    values : object ndarray
        1-D object array whose elements may be list-likes or scalars.

    Returns
    -------
    tuple(values, counts)
        values : object ndarray of the flattened items.
        counts : int64 ndarray giving the number of output positions
            produced by each input element (callers use this to repeat
            the original index).
    """
    cdef:
        Py_ssize_t i, j, count, n
        object v
        ndarray[object] result
        ndarray[int64_t] counts

    # find the resulting len
    # First pass: size the output exactly so ``result`` can be
    # allocated once, without growing/appending.
    n = len(values)
    counts = np.zeros(n, dtype='int64')
    for i in range(n):
        v = values[i]
        if c_is_list_like(v, False):
            if len(v):
                counts[i] += len(v)
            else:
                # empty list-like, use a nan marker
                counts[i] += 1
        else:
            counts[i] += 1

    # Second pass: fill the preallocated output in input order.
    result = np.empty(counts.sum(), dtype='object')
    count = 0
    for i in range(n):
        v = values[i]

        if c_is_list_like(v, False):
            if len(v):
                for j in range(len(v)):
                    result[count] = v[j]
                    count += 1
            else:
                # empty list-like, use a nan marker
                result[count] = np.nan
                count += 1
        else:
            # replace with the existing scalar
            result[count] = v
            count += 1
    return result, counts
Loading