Skip to content

Commit a0b00b8

Browse files
realeadnickleus27
authored and committed
[BUG] don't mangle null-objects in value_counts (pandas-dev#42743)
1 parent 085acb6 commit a0b00b8

File tree

6 files changed

+67
-18
lines changed

6 files changed

+67
-18
lines changed

asv_bench/benchmarks/series_methods.py

+24
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,18 @@ def time_value_counts(self, N, dtype):
152152
self.s.value_counts()
153153

154154

155+
class ValueCountsObjectDropNAFalse:
156+
157+
params = [10 ** 3, 10 ** 4, 10 ** 5]
158+
param_names = ["N"]
159+
160+
def setup(self, N):
161+
self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object")
162+
163+
def time_value_counts(self, N):
164+
self.s.value_counts(dropna=False)
165+
166+
155167
class Mode:
156168

157169
params = [[10 ** 3, 10 ** 4, 10 ** 5], ["int", "uint", "float", "object"]]
@@ -164,6 +176,18 @@ def time_mode(self, N, dtype):
164176
self.s.mode()
165177

166178

179+
class ModeObjectDropNAFalse:
180+
181+
params = [10 ** 3, 10 ** 4, 10 ** 5]
182+
param_names = ["N"]
183+
184+
def setup(self, N):
185+
self.s = Series(np.random.randint(0, N, size=10 * N)).astype("object")
186+
187+
def time_mode(self, N):
188+
self.s.mode(dropna=False)
189+
190+
167191
class Dir:
168192
def setup(self):
169193
self.s = Series(index=tm.makeStringIndex(10000))

doc/source/whatsnew/v1.4.0.rst

+32
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,38 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is
240240

241241
*New behavior*:
242242

243+
.. ipython:: python
244+
245+
res
246+
247+
.. _whatsnew_140.notable_bug_fixes.value_counts_and_mode_do_not_coerse_to_nan:
248+
249+
Null-values are no longer coerced to NaN-value in value_counts and mode
250+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
251+
252+
:meth:`Series.value_counts` and :meth:`Series.mode` no longer coerce ``None``, ``NaT`` and other null-values to a NaN-value for ``np.object``-dtype. This behavior is now consistent with ``unique``, ``isin`` and others (:issue:`42688`).
253+
254+
.. ipython:: python
255+
256+
s = pd.Series([True, None, pd.NaT, None, pd.NaT, None])
257+
res = s.value_counts(dropna=False)
258+
259+
Previously, all null-values were replaced by a NaN-value.
260+
261+
*Previous behavior*:
262+
263+
.. code-block:: ipython
264+
265+
In [3]: res
266+
Out[3]:
267+
NaN 5
268+
True 1
269+
dtype: int64
270+
271+
Now null-values are no longer mangled.
272+
273+
*New behavior*:
274+
243275
.. ipython:: python
244276
245277
res

pandas/_libs/hashtable_func_helper.pxi.in

+2-7
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ dtypes = [('Complex128', 'complex128', 'complex128',
3131
@cython.wraparound(False)
3232
@cython.boundscheck(False)
3333
{{if dtype == 'object'}}
34-
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, navalue=np.NaN):
34+
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna):
3535
{{else}}
3636
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
3737
{{endif}}
@@ -42,7 +42,6 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
4242

4343
# Don't use Py_ssize_t, since table.n_buckets is unsigned
4444
khiter_t k
45-
bint is_null
4645

4746
{{c_type}} val
4847

@@ -61,11 +60,7 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna):
6160

6261
for i in range(n):
6362
val = values[i]
64-
is_null = checknull(val)
65-
if not is_null or not dropna:
66-
# all nas become the same representative:
67-
if is_null:
68-
val = navalue
63+
if not dropna or not checknull(val):
6964
k = kh_get_{{ttype}}(table, <PyObject*>val)
7065
if k != table.n_buckets:
7166
table.vals[k] += 1

pandas/tests/base/test_value_counts.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -281,5 +281,5 @@ def test_value_counts_with_nan(dropna, index_or_series):
281281
if dropna is True:
282282
expected = Series([1], index=[True])
283283
else:
284-
expected = Series([2, 1], index=[pd.NA, True])
284+
expected = Series([1, 1, 1], index=[True, pd.NA, np.nan])
285285
tm.assert_series_equal(res, expected)

pandas/tests/indexing/test_indexing.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -786,12 +786,12 @@ def test_no_reference_cycle(self):
786786
del df
787787
assert wr() is None
788788

789-
def test_label_indexing_on_nan(self):
789+
def test_label_indexing_on_nan(self, nulls_fixture):
790790
# GH 32431
791-
df = Series([1, "{1,2}", 1, None])
791+
df = Series([1, "{1,2}", 1, nulls_fixture])
792792
vc = df.value_counts(dropna=False)
793-
result1 = vc.loc[np.nan]
794-
result2 = vc[np.nan]
793+
result1 = vc.loc[nulls_fixture]
794+
result2 = vc[nulls_fixture]
795795

796796
expected = 1
797797
assert result1 == expected

pandas/tests/libs/test_hashtable.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -453,13 +453,11 @@ def test_mode_stable(self, dtype, writable):
453453

454454

455455
def test_modes_with_nans():
456-
# GH39007
457-
values = np.array([True, pd.NA, np.nan], dtype=np.object_)
458-
# pd.Na and np.nan will have the same representative: np.nan
459-
# thus we have 2 nans and 1 True
456+
# GH42688, nans aren't mangled
457+
nulls = [pd.NA, np.nan, pd.NaT, None]
458+
values = np.array([True] + nulls * 2, dtype=np.object_)
460459
modes = ht.mode(values, False)
461-
assert modes.size == 1
462-
assert np.isnan(modes[0])
460+
assert modes.size == len(nulls)
463461

464462

465463
def test_unique_label_indices_intp(writable):

0 commit comments

Comments
 (0)