Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add Series method to explode a list-like column #27267

Merged
merged 34 commits into from
Jul 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
ed949ae
[ENH] Add DataFrame method to explode a list-like column (GH #16538)
changhiskhan Dec 20, 2018
959ed4c
move to Series
jreback Jul 6, 2019
1e4b4bd
handle generic list-like
jreback Jul 6, 2019
ef9eae8
lint on asv
jreback Jul 6, 2019
790f06e
move is_list_like to cython and share impl
jreback Jul 6, 2019
d8e4801
moar docs
jreback Jul 6, 2019
f055b48
test larger sides to avoid a segfault
jreback Jul 6, 2019
bd854ca
fix ref
jreback Jul 6, 2019
774d285
typos
jreback Jul 6, 2019
854a2af
benchmarks wrong
jreback Jul 6, 2019
c4d1bd3
add inversion
jreback Jul 6, 2019
01edb45
add usecase
jreback Jul 6, 2019
af05813
cimport is_list_like
jreback Jul 6, 2019
84e1996
use cimports
jreback Jul 6, 2019
040ba2e
doc-string
jreback Jul 6, 2019
93b1fd6
docs & lint
jreback Jul 6, 2019
fdafb0c
isort
jreback Jul 6, 2019
b47e68d
clean object check & update doc-strings
jreback Jul 7, 2019
9856732
lint
jreback Jul 8, 2019
0fa9c0d
test for nested
jreback Jul 9, 2019
c426627
better test
jreback Jul 9, 2019
9d20b61
try adding frame
jreback Jul 9, 2019
5512e29
test for nested EA
jreback Jul 11, 2019
dc17ef6
lint
jreback Jul 11, 2019
94f319e
remove multi subset support
jreback Jul 11, 2019
b9d8a42
update docs
jreback Jul 11, 2019
df0fccf
doc-string
jreback Jul 11, 2019
720e309
add test for MI
jreback Jul 11, 2019
26b91c4
lint and docs
jreback Jul 11, 2019
d51aa47
ordering
jreback Jul 11, 2019
f9f82fc
moar lint
jreback Jul 11, 2019
5c4635f
multi-index column support
jreback Jul 17, 2019
7fc2159
32-bit compat
jreback Jul 17, 2019
4e7755b
moar 32-bit compat
jreback Jul 17, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
.PHONY : develop build clean clean_pyc doc lint-diff black

all: develop

clean:
-python setup.py clean

Expand Down
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
pass


class DoesStringLookLikeDatetime(object):
class DoesStringLookLikeDatetime:

params = (["2Q2005", "0.0", "10000"],)
param_names = ["value"]
Expand All @@ -23,7 +23,7 @@ def time_check_datetimes(self, value):
_does_string_look_like_datetime(obj)


class ConcatDateCols(object):
class ConcatDateCols:

params = ([1234567890, "AAAA"], [1, 2])
param_names = ["value", "dim"]
Expand Down
13 changes: 13 additions & 0 deletions asv_bench/benchmarks/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,4 +240,17 @@ def time_qcut_datetime(self, bins):
pd.qcut(self.datetime_series, bins)


class Explode:
    """ASV benchmark for ``Series.explode`` on a column of ragged arrays."""

    param_names = ["n_rows", "max_list_length"]
    params = [[100, 1000, 10000], [3, 5, 10]]

    def setup(self, n_rows, max_list_length):
        # One row per n_rows, each holding an integer array whose length
        # is drawn uniformly from [0, max_list_length).
        lengths = (np.random.randint(max_list_length) for _ in range(n_rows))
        self.series = pd.Series([np.arange(k) for k in lengths])

    def time_explode(self, n_rows, max_list_length):
        self.series.explode()


from .pandas_vb_common import setup # noqa: F401
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def time_series_datetimeindex_repr(self):
getattr(self.s, "a", None)


class All(object):
class All:

params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
param_names = ["N", "case"]
Expand All @@ -232,7 +232,7 @@ def time_all(self, N, case):
self.s.all()


class Any(object):
class Any:

params = [[10 ** 3, 10 ** 6], ["fast", "slow"]]
param_names = ["N", "case"]
Expand All @@ -245,7 +245,7 @@ def time_any(self, N, case):
self.s.any()


class NanOps(object):
class NanOps:

params = [
[
Expand Down
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def time_format_YYYYMMDD(self):
to_datetime(self.stringsD, format="%Y%m%d")


class ToDatetimeCacheSmallCount(object):
class ToDatetimeCacheSmallCount:

params = ([True, False], [50, 500, 5000, 100000])
param_names = ["cache", "count"]
Expand Down
2 changes: 1 addition & 1 deletion ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for python2 new-style classes and for empty parentheses' ; echo $MSG
invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas scripts
invgrep -R --include="*.py" --include="*.pyx" -E "class\s\S*\((object)?\):" pandas asv_bench/benchmarks scripts
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ Reshaping, sorting, transposing
DataFrame.unstack
DataFrame.swapaxes
DataFrame.melt
DataFrame.explode
DataFrame.squeeze
DataFrame.to_xarray
DataFrame.T
Expand Down
2 changes: 1 addition & 1 deletion doc/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ Reshaping, sorting
Series.sort_index
Series.swaplevel
Series.unstack
Series.explode
Series.searchsorted
Series.ravel
Series.repeat
Expand Down Expand Up @@ -590,4 +591,3 @@ Sparse

SparseSeries.to_coo
SparseSeries.from_coo

50 changes: 50 additions & 0 deletions doc/source/user_guide/reshaping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -801,3 +801,53 @@ Note to subdivide over multiple columns we can pass in a list to the

df.pivot_table(
values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean'])

.. _reshaping.explode:

Exploding a list-like column
----------------------------

.. versionadded:: 0.25.0

Sometimes the values in a column are list-like.

.. ipython:: python

keys = ['panda1', 'panda2', 'panda3']
values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']]
df = pd.DataFrame({'keys': keys, 'values': values})
df

We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row:

.. ipython:: python

df['values'].explode()

You can also explode the column in the ``DataFrame``.

.. ipython:: python

df.explode('values')

:meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other option for empty lists would be to have no entry in the result?


.. ipython:: python

s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']])
s
s.explode()

Here is a typical use case. You have comma-separated strings in a column and want to expand this.

.. ipython:: python

df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
{'var1': 'd,e,f', 'var2': 2}])
df

Creating a long form DataFrame is now straightforward using explode and chained operations

.. ipython:: python

df.assign(var1=df.var1.str.split(',')).explode('var1')
22 changes: 22 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,28 @@ The repr now looks like this:
json_normalize(data, max_level=1)


.. _whatsnew_0250.enhancements.explode:

Series.explode to split list-like values to rows
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

:class:`Series` and :class:`DataFrame` have gained the :meth:`Series.explode` and :meth:`DataFrame.explode` methods to transform list-likes to individual rows. See :ref:`section on Exploding list-like column <reshaping.explode>` in docs for more information (:issue:`16538`, :issue:`10511`)


Here is a typical use case. You have comma-separated strings in a column.

.. ipython:: python

df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1},
{'var1': 'd,e,f', 'var2': 2}])
df

Creating a long form ``DataFrame`` is now straightforward using chained operations

.. ipython:: python

df.assign(var1=df.var1.str.split(',')).explode('var1')

.. _whatsnew_0250.enhancements.other:

Other enhancements
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/lib.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cdef bint c_is_list_like(object, bint)
55 changes: 55 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import abc
from decimal import Decimal
from fractions import Fraction
from numbers import Number
Expand Down Expand Up @@ -886,6 +887,60 @@ def is_period(val: object) -> bool:
return util.is_period_object(val)


def is_list_like(obj: object, allow_sets: bool = True):
    """
    Check if the object is list-like.

    Objects that are considered list-like are for example Python
    lists, tuples, sets, NumPy arrays, and Pandas Series.

    Strings and datetime objects, however, are not considered list-like.

    Parameters
    ----------
    obj : The object to check
    allow_sets : boolean, default True
        If this parameter is False, sets will not be considered list-like

        .. versionadded:: 0.24.0

    Returns
    -------
    is_list_like : bool
        Whether `obj` has list-like properties.

    Examples
    --------
    >>> is_list_like([1, 2, 3])
    True
    >>> is_list_like({1, 2, 3})
    True
    >>> is_list_like(datetime(2017, 1, 1))
    False
    >>> is_list_like("foo")
    False
    >>> is_list_like(1)
    False
    >>> is_list_like(np.array([2]))
    True
    >>> is_list_like(np.array(2))
    False
    """
    # Thin Python-visible wrapper; the actual check lives in the
    # C-level helper so other Cython modules can share it cheaply.
    return c_is_list_like(obj, allow_sets)


# Shared C-level implementation of ``is_list_like``.  Declared in
# lib.pxd so other Cython modules (e.g. reshape.pyx) can cimport and
# call it without Python-call overhead.
cdef inline bint c_is_list_like(object obj, bint allow_sets):
    return (
        isinstance(obj, abc.Iterable)
        # we do not count strings/unicode/bytes as list-like
        and not isinstance(obj, (str, bytes))
        # exclude zero-dimensional numpy arrays, effectively scalars
        and not (util.is_array(obj) and obj.ndim == 0)
        # exclude sets if allow_sets is False
        and not (allow_sets is False and isinstance(obj, abc.Set))
    )


_TYPE_MAP = {
'categorical': 'categorical',
'category': 'categorical',
Expand Down
63 changes: 61 additions & 2 deletions pandas/_libs/reshape.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ import cython
from cython import Py_ssize_t

from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
uint32_t, uint64_t, float32_t, float64_t)

uint32_t, uint64_t, float32_t, float64_t, ndarray)
cimport numpy as cnp
import numpy as np
from pandas._libs.lib cimport c_is_list_like
cnp.import_array()

ctypedef fused reshape_t:
uint8_t
Expand Down Expand Up @@ -91,3 +94,59 @@ unstack_int64 = unstack["int64_t"]
unstack_float32 = unstack["float32_t"]
unstack_float64 = unstack["float64_t"]
unstack_object = unstack["object"]


@cython.wraparound(False)
@cython.boundscheck(False)
def explode(ndarray[object] values):
    """
    Transform an array of list-likes to long (exploded) form.

    Each list-like element is expanded to one output position per item;
    non-list-like (scalar) entries are preserved as-is; an empty
    list-like is replaced by a single NaN marker, so every input element
    contributes at least one output position.  Sets are treated as
    scalars here (``allow_sets=False`` in the list-like check), so they
    are preserved rather than exploded.

    Parameters
    ----------
    values : object ndarray
        1-D object array whose elements may be list-likes or scalars.

    Returns
    -------
    tuple(values, counts)
        values : object ndarray of the flattened items.
        counts : int64 ndarray giving the number of output positions
            produced by each input element (callers use this to repeat
            the original index).
    """
    cdef:
        Py_ssize_t i, j, count, n
        object v
        ndarray[object] result
        ndarray[int64_t] counts

    # find the resulting len
    # First pass: size the output exactly so ``result`` can be
    # allocated once, without growing/appending.
    n = len(values)
    counts = np.zeros(n, dtype='int64')
    for i in range(n):
        v = values[i]
        if c_is_list_like(v, False):
            if len(v):
                counts[i] += len(v)
            else:
                # empty list-like, use a nan marker
                counts[i] += 1
        else:
            counts[i] += 1

    # Second pass: fill the preallocated output in input order.
    result = np.empty(counts.sum(), dtype='object')
    count = 0
    for i in range(n):
        v = values[i]

        if c_is_list_like(v, False):
            if len(v):
                for j in range(len(v)):
                    result[count] = v[j]
                    count += 1
            else:
                # empty list-like, use a nan marker
                result[count] = np.nan
                count += 1
        else:
            # replace with the existing scalar
            result[count] = v
            count += 1
    return result, counts
Loading