Skip to content

Commit 2ca6fd8

Browse files
author
Tom Augspurger
committedSep 1, 2014
Merge pull request #8140 from TomAugspurger/df-dummies
ENH: let get_dummies take a DataFrame
2 parents 5ef3cc3 + f6a8a6d commit 2ca6fd8

File tree

4 files changed

+253
-8
lines changed

4 files changed

+253
-8
lines changed
 

‎doc/source/reshaping.rst

+43
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,49 @@ This function is often used along with discretization functions like ``cut``:
480480
481481
See also :func:`Series.str.get_dummies <pandas.core.strings.StringMethods.get_dummies>`.
482482

483+
.. versionadded:: 0.15.0
484+
485+
:func:`get_dummies` also accepts a DataFrame. By default all categorical
486+
variables (categorical in the statistical sense,
487+
those with `object` or `category` dtype) are encoded as dummy variables.
488+
489+
490+
.. ipython:: python
491+
492+
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
493+
'C': [1, 2, 3]})
494+
pd.get_dummies(df)
495+
496+
All non-object columns are included untouched in the output.
497+
498+
You can control the columns that are encoded with the ``columns`` keyword.
499+
500+
.. ipython:: python
501+
502+
pd.get_dummies(df, columns=['A'])
503+
504+
Notice that the ``B`` column is still included in the output, it just hasn't
505+
been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't
506+
want to include it in the output.
507+
508+
As with the Series version, you can pass values for the ``prefix`` and
509+
``prefix_sep``. By default the column name is used as the prefix, and '_' as
510+
the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways:
511+
512+
- string: Use the same value for ``prefix`` or ``prefix_sep`` for each column
513+
to be encoded
514+
- list: Must be the same length as the number of columns being encoded.
515+
- dict: Mapping column name to prefix
516+
517+
.. ipython:: python
518+
519+
simple = pd.get_dummies(df, prefix='new_prefix')
520+
simple
521+
from_list = pd.get_dummies(df, prefix=['from_A', 'from_B'])
522+
from_list
523+
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
524+
from_dict
525+
483526
Factorizing values
484527
------------------
485528

‎doc/source/v0.15.0.txt

+11
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,9 @@ There are no prior version deprecations that are taking effect as of 0.15.0.
418418
Deprecations
419419
~~~~~~~~~~~~
420420

421+
The ``convert_dummies`` method has been deprecated in favor of
422+
``get_dummies`` (:issue:`8140`)
423+
421424
.. _whatsnew_0150.knownissues:
422425

423426
Known Issues
@@ -469,7 +472,15 @@ Enhancements
469472

470473

471474

475+
- The ``get_dummies`` method can now be used on DataFrames. By default only
476+
categorical columns are encoded as 0's and 1's, while other columns are
477+
left untouched.
478+
479+
.. ipython:: python
472480

481+
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'],
482+
'C': [1, 2, 3]})
483+
pd.get_dummies(df)
473484

474485

475486

‎pandas/core/reshape.py

+83-6
Original file line numberDiff line numberDiff line change
@@ -979,27 +979,42 @@ def convert_dummies(data, cat_variables, prefix_sep='_'):
979979
-------
980980
dummies : DataFrame
981981
"""
982+
import warnings
983+
984+
warnings.warn("'convert_dummies' is deprecated and will be removed "
985+
"in a future release. Use 'get_dummies' instead.",
986+
FutureWarning)
987+
982988
result = data.drop(cat_variables, axis=1)
983989
for variable in cat_variables:
984-
dummies = get_dummies(data[variable], prefix=variable,
985-
prefix_sep=prefix_sep)
990+
dummies = _get_dummies_1d(data[variable], prefix=variable,
991+
prefix_sep=prefix_sep)
986992
result = result.join(dummies)
987993
return result
988994

989995

990-
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
996+
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
997+
columns=None):
991998
"""
992999
Convert categorical variable into dummy/indicator variables
9931000
9941001
Parameters
9951002
----------
996-
data : array-like or Series
997-
prefix : string, default None
1003+
data : array-like, Series, or DataFrame
1004+
prefix : string, list of strings, or dict of strings, default None
9981005
String to append DataFrame column names
1006+
Pass a list with length equal to the number of columns
1007+
when calling get_dummies on a DataFrame. Alternatively, `prefix`
1008+
can be a dictionary mapping column names to prefixes.
9991009
prefix_sep : string, default '_'
1000-
If appending prefix, separator/delimiter to use
1010+
If appending prefix, separator/delimiter to use. Or pass a
1011+
list or dictionary as with `prefix`.
10011012
dummy_na : bool, default False
10021013
Add a column to indicate NaNs, if False NaNs are ignored.
1014+
columns : list-like, default None
1015+
Column names in the DataFrame to be encoded.
1016+
If `columns` is None then all the columns with
1017+
`object` or `category` dtype will be converted.
10031018
10041019
Returns
10051020
-------
@@ -1031,9 +1046,71 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
10311046
1 0 1 0
10321047
2 0 0 1
10331048
1049+
>>> df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
1050+
... 'C': [1, 2, 3]})
1051+
1052+
>>> get_dummies(df, prefix=['col1', 'col2'])
1053+
C col1_a col1_b col2_a col2_b col2_c
1054+
0 1 1 0 0 1 0
1055+
1 2 0 1 1 0 0
1056+
2 3 1 0 0 0 1
1057+
10341058
See also ``Series.str.get_dummies``.
10351059
10361060
"""
1061+
from pandas.tools.merge import concat
1062+
from itertools import cycle
1063+
1064+
if isinstance(data, DataFrame):
1065+
# determine columns being encoded
1066+
1067+
if columns is None:
1068+
columns_to_encode = data.select_dtypes(include=['object',
1069+
'category']).columns
1070+
else:
1071+
columns_to_encode = columns
1072+
1073+
# validate prefixes and separator to avoid silently dropping cols
1074+
def check_len(item, name):
1075+
length_msg = ("Length of '{0}' ({1}) did "
1076+
"not match the length of the columns "
1077+
"being encoded ({2}).")
1078+
1079+
if com.is_list_like(item):
1080+
if not len(item) == len(columns_to_encode):
1081+
raise ValueError(length_msg.format(name, len(item),
1082+
len(columns_to_encode)))
1083+
1084+
check_len(prefix, 'prefix')
1085+
check_len(prefix_sep, 'prefix_sep')
1086+
if isinstance(prefix, compat.string_types):
1087+
prefix = cycle([prefix])
1088+
if isinstance(prefix, dict):
1089+
prefix = [prefix[col] for col in columns_to_encode]
1090+
1091+
if prefix is None:
1092+
prefix = columns_to_encode
1093+
1094+
# validate separators
1095+
if isinstance(prefix_sep, compat.string_types):
1096+
prefix_sep = cycle([prefix_sep])
1097+
elif isinstance(prefix_sep, dict):
1098+
prefix_sep = [prefix_sep[col] for col in columns_to_encode]
1099+
1100+
result = data.drop(columns_to_encode, axis=1)
1101+
with_dummies = [result]
1102+
for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep):
1103+
1104+
dummy = _get_dummies_1d(data[col], prefix=pre,
1105+
prefix_sep=sep, dummy_na=dummy_na)
1106+
with_dummies.append(dummy)
1107+
result = concat(with_dummies, axis=1)
1108+
else:
1109+
result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na)
1110+
return result
1111+
1112+
1113+
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
10371114
# Series avoids inconsistent NaN handling
10381115
cat = Categorical.from_array(Series(data))
10391116
levels = cat.levels

‎pandas/tests/test_reshape.py

+116-2
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ def test_multiindex(self):
149149

150150

151151
class TestGetDummies(tm.TestCase):
152+
153+
def setUp(self):
154+
self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
155+
'C': [1, 2, 3]})
156+
152157
def test_basic(self):
153158
s_list = list('abc')
154159
s_series = Series(s_list)
@@ -209,6 +214,114 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values
209214
u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}})
210215
assert_frame_equal(res, exp)
211216

217+
def test_dataframe_dummies_all_obj(self):
218+
df = self.df[['A', 'B']]
219+
result = get_dummies(df)
220+
expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0],
221+
'B_b': [1., 1, 0], 'B_c': [0., 0, 1]})
222+
assert_frame_equal(result, expected)
223+
224+
def test_dataframe_dummies_mix_default(self):
225+
df = self.df
226+
result = get_dummies(df)
227+
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
228+
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
229+
'B_c': [0., 0, 1]})
230+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
231+
assert_frame_equal(result, expected)
232+
233+
def test_dataframe_dummies_prefix_list(self):
234+
prefixes = ['from_A', 'from_B']
235+
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
236+
'C': [1, 2, 3]})
237+
result = get_dummies(df, prefix=prefixes)
238+
expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1],
239+
'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0],
240+
'from_B_c': [0., 0, 1]})
241+
expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b',
242+
'from_B_c']]
243+
assert_frame_equal(result, expected)
244+
245+
def test_datafrmae_dummies_prefix_str(self):
246+
# not that you should do this...
247+
df = self.df
248+
result = get_dummies(df, prefix='bad')
249+
expected = DataFrame([[1, 1., 0., 1., 0.],
250+
[2, 0., 1., 1., 0.],
251+
[3, 1., 0., 0., 1.]],
252+
columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'])
253+
assert_frame_equal(result, expected)
254+
255+
def test_dataframe_dummies_subset(self):
256+
df = self.df
257+
result = get_dummies(df, prefix=['from_A'],
258+
columns=['A'])
259+
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
260+
'B': ['b', 'b', 'c'], 'C': [1, 2, 3]})
261+
assert_frame_equal(result, expected)
262+
263+
def test_dataframe_dummies_prefix_sep(self):
264+
df = self.df
265+
result = get_dummies(df, prefix_sep='..')
266+
expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1],
267+
'A..b': [0., 1, 0], 'B..b': [1., 1, 0],
268+
'B..c': [0., 0, 1]})
269+
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
270+
assert_frame_equal(result, expected)
271+
272+
result = get_dummies(df, prefix_sep=['..', '__'])
273+
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
274+
assert_frame_equal(result, expected)
275+
276+
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'})
277+
assert_frame_equal(result, expected)
278+
279+
def test_dataframe_dummies_prefix_bad_length(self):
280+
with tm.assertRaises(ValueError):
281+
get_dummies(self.df, prefix=['too few'])
282+
283+
def test_dataframe_dummies_prefix_sep_bad_length(self):
284+
with tm.assertRaises(ValueError):
285+
get_dummies(self.df, prefix_sep=['bad'])
286+
287+
def test_dataframe_dummies_prefix_dict(self):
288+
prefixes = {'A': 'from_A', 'B': 'from_B'}
289+
df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
290+
'C': [1, 2, 3]})
291+
result = get_dummies(df, prefix=prefixes)
292+
expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
293+
'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1],
294+
'C': [1, 2, 3]})
295+
assert_frame_equal(result, expected)
296+
297+
def test_dataframe_dummies_with_na(self):
298+
df = self.df
299+
df.loc[3, :] = [np.nan, np.nan, np.nan]
300+
result = get_dummies(df, dummy_na=True)
301+
expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0],
302+
'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1], 'B_b': [1., 1, 0, 0],
303+
'B_c': [0., 0, 1, 0], 'B_nan': [0., 0, 0, 1]})
304+
expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c',
305+
'B_nan']]
306+
assert_frame_equal(result, expected)
307+
308+
result = get_dummies(df, dummy_na=False)
309+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
310+
assert_frame_equal(result, expected)
311+
312+
def test_dataframe_dummies_with_categorical(self):
313+
df = self.df
314+
df['cat'] = pd.Categorical(['x', 'y', 'y'])
315+
result = get_dummies(df)
316+
expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
317+
'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
318+
'B_c': [0., 0, 1], 'cat_x': [1., 0, 0],
319+
'cat_y': [0., 1, 1]})
320+
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c',
321+
'cat_x', 'cat_y']]
322+
assert_frame_equal(result, expected)
323+
324+
212325
class TestConvertDummies(tm.TestCase):
213326
def test_convert_dummies(self):
214327
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
@@ -218,8 +331,9 @@ def test_convert_dummies(self):
218331
'C': np.random.randn(8),
219332
'D': np.random.randn(8)})
220333

221-
result = convert_dummies(df, ['A', 'B'])
222-
result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.')
334+
with tm.assert_produces_warning(FutureWarning):
335+
result = convert_dummies(df, ['A', 'B'])
336+
result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.')
223337

224338
expected = DataFrame({'A_foo': [1, 0, 1, 0, 1, 0, 1, 1],
225339
'A_bar': [0, 1, 0, 1, 0, 1, 0, 0],

0 commit comments

Comments
 (0)