Skip to content

Commit 0904a55

Browse files
committed
BUG: Coerce to numeric despite uint64 conflict
Closes gh-17007. Closes gh-17125.
1 parent 9f0ee53 commit 0904a55

File tree

4 files changed

+67
-53
lines changed

4 files changed

+67
-53
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -827,6 +827,7 @@ Conversion
827827
- Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`)
828828
- Bug in ``Timedelta`` construction and arithmetic that would not propagate the ``Overflow`` exception (:issue:`17367`)
829829
- Bug in :meth:`~DataFrame.astype` converting to object dtype when passed extension type classes (``DatetimeTZDtype``, ``CategoricalDtype``) rather than instances. Now a ``TypeError`` is raised when a class is passed (:issue:`17780`).
830+
- Bug in :meth:`to_numeric` in which elements were not always being coerced to numeric when ``errors='coerce'`` (:issue:`17007`, :issue:`17125`)
830831

831832
Indexing
832833
^^^^^^^^

pandas/_libs/src/inference.pyx

+12-15
Original file line numberDiff line numberDiff line change
@@ -165,20 +165,8 @@ cdef class Seen(object):
165165
two conflict cases was also detected. However, we are
166166
trying to force conversion to a numeric dtype.
167167
"""
168-
if self.uint_ and (self.null_ or self.sint_):
169-
if not self.coerce_numeric:
170-
return True
171-
172-
if self.null_:
173-
msg = ("uint64 array detected, and such an "
174-
"array cannot contain NaN.")
175-
else: # self.sint_ = 1
176-
msg = ("uint64 and negative values detected. "
177-
"Cannot safely return a numeric array "
178-
"without truncating data.")
179-
180-
raise ValueError(msg)
181-
return False
168+
return (self.uint_ and (self.null_ or self.sint_)
169+
and not self.coerce_numeric)
182170

183171
cdef inline saw_null(self):
184172
"""
@@ -1103,10 +1091,17 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
11031091
seen.saw_int(val)
11041092

11051093
if val >= 0:
1106-
uints[i] = val
1094+
if val <= oUINT64_MAX:
1095+
uints[i] = val
1096+
else:
1097+
seen.float_ = True
11071098

11081099
if val <= oINT64_MAX:
11091100
ints[i] = val
1101+
1102+
if seen.sint_ and seen.uint_:
1103+
seen.float_ = True
1104+
11101105
elif util.is_bool_object(val):
11111106
floats[i] = uints[i] = ints[i] = bools[i] = val
11121107
seen.bool_ = True
@@ -1154,6 +1149,8 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
11541149
uints[i] = as_int
11551150
if as_int <= oINT64_MAX:
11561151
ints[i] = as_int
1152+
1153+
seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
11571154
else:
11581155
seen.float_ = True
11591156
except (TypeError, ValueError) as e:

pandas/tests/dtypes/test_inference.py

+37-38
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@
3939
from pandas.util import testing as tm
4040

4141

42+
@pytest.fixture(params=[True, False], ids=lambda val: str(val))
43+
def coerce(request):
44+
return request.param
45+
46+
4247
def test_is_sequence():
4348
is_seq = inference.is_sequence
4449
assert (is_seq((1, 2)))
@@ -340,44 +345,38 @@ def test_convert_numeric_uint64(self):
340345
exp = np.array([2**63], dtype=np.uint64)
341346
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
342347

343-
def test_convert_numeric_uint64_nan(self):
344-
msg = 'uint64 array detected'
345-
cases = [(np.array([2**63, np.nan], dtype=object), set()),
346-
(np.array([str(2**63), np.nan], dtype=object), set()),
347-
(np.array([np.nan, 2**63], dtype=object), set()),
348-
(np.array([np.nan, str(2**63)], dtype=object), set()),
349-
(np.array([2**63, 2**63 + 1], dtype=object), set([2**63])),
350-
(np.array([str(2**63), str(2**63 + 1)],
351-
dtype=object), set([2**63]))]
352-
353-
for coerce in (True, False):
354-
for arr, na_values in cases:
355-
if coerce:
356-
with tm.assert_raises_regex(ValueError, msg):
357-
lib.maybe_convert_numeric(arr, na_values,
358-
coerce_numeric=coerce)
359-
else:
360-
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
361-
arr, na_values), arr)
362-
363-
def test_convert_numeric_int64_uint64(self):
364-
msg = 'uint64 and negative values detected'
365-
cases = [np.array([2**63, -1], dtype=object),
366-
np.array([str(2**63), -1], dtype=object),
367-
np.array([str(2**63), str(-1)], dtype=object),
368-
np.array([-1, 2**63], dtype=object),
369-
np.array([-1, str(2**63)], dtype=object),
370-
np.array([str(-1), str(2**63)], dtype=object)]
371-
372-
for coerce in (True, False):
373-
for case in cases:
374-
if coerce:
375-
with tm.assert_raises_regex(ValueError, msg):
376-
lib.maybe_convert_numeric(case, set(),
377-
coerce_numeric=coerce)
378-
else:
379-
tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
380-
case, set()), case)
348+
@pytest.mark.parametrize("arr,na_values", [
349+
(np.array([2**63, np.nan], dtype=object), set()),
350+
(np.array([str(2**63), np.nan], dtype=object), set()),
351+
(np.array([np.nan, 2**63], dtype=object), set()),
352+
(np.array([np.nan, str(2**63)], dtype=object), set())])
353+
def test_convert_numeric_uint64_nan(self, coerce, arr, na_values):
354+
expected = arr.astype(float) if coerce else arr.copy()
355+
result = lib.maybe_convert_numeric(arr, na_values,
356+
coerce_numeric=coerce)
357+
tm.assert_almost_equal(result, expected)
358+
359+
def test_convert_numeric_uint64_nan_values(self, coerce):
360+
arr = np.array([2**63, 2**63 + 1], dtype=object)
361+
na_values = set([2**63])
362+
363+
expected = (np.array([np.nan, 2**63 + 1], dtype=float)
364+
if coerce else arr.copy())
365+
result = lib.maybe_convert_numeric(arr, na_values,
366+
coerce_numeric=coerce)
367+
tm.assert_almost_equal(result, expected)
368+
369+
@pytest.mark.parametrize("case", [
370+
np.array([2**63, -1], dtype=object),
371+
np.array([str(2**63), -1], dtype=object),
372+
np.array([str(2**63), str(-1)], dtype=object),
373+
np.array([-1, 2**63], dtype=object),
374+
np.array([-1, str(2**63)], dtype=object),
375+
np.array([str(-1), str(2**63)], dtype=object)])
376+
def test_convert_numeric_int64_uint64(self, case, coerce):
377+
expected = case.astype(float) if coerce else case.copy()
378+
result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce)
379+
tm.assert_almost_equal(result, expected)
381380

382381
def test_maybe_convert_objects_uint64(self):
383382
# see gh-4471

pandas/tests/tools/test_numeric.py

+17
Original file line numberDiff line numberDiff line change
@@ -381,3 +381,20 @@ def test_downcast_limits(self):
381381
for dtype, downcast, min_max in dtype_downcast_min_max:
382382
series = pd.to_numeric(pd.Series(min_max), downcast=downcast)
383383
assert series.dtype == dtype
384+
385+
def test_coerce_uint64_conflict(self):
386+
# see gh-17007 and gh-17125
387+
#
388+
# Still returns float despite the uint64-nan conflict,
389+
# which would normally force the casting to object.
390+
df = pd.DataFrame({"a": [200, 300, "", "NaN", 30000000000000000000]})
391+
expected = pd.Series([200, 300, np.nan, np.nan,
392+
30000000000000000000], dtype=float, name="a")
393+
result = to_numeric(df["a"], errors="coerce")
394+
tm.assert_series_equal(expected, result)
395+
396+
s = pd.Series(["12345678901234567890", "1234567890", "ITEM"])
397+
expected = pd.Series([12345678901234567890,
398+
1234567890, np.nan], dtype=float)
399+
result = to_numeric(s, errors="coerce")
400+
tm.assert_series_equal(expected, result)

0 commit comments

Comments
 (0)