BUG: Coerce to numeric despite uint64 conflict

gfyoung · gfyoung · commit 0904a559d43a · 2017-10-08T22:58:45.000-07:00
Closes gh-17007. Closes gh-17125.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -827,6 +827,7 @@ Conversion
 - Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`)
 - Bug in ``Timedelta`` construction and arithmetic that would not propagate the ``Overflow`` exception (:issue:`17367`)
 - Bug in :meth:`~DataFrame.astype` converting to object dtype when passed extension type classes (`DatetimeTZDtype``, ``CategoricalDtype``) rather than instances. Now a ``TypeError`` is raised when a class is passed (:issue:`17780`).
+- Bug in :func:`to_numeric` in which elements were not always being coerced to numeric when ``errors='coerce'`` (:issue:`17007`, :issue:`17125`)
 
 Indexing
 ^^^^^^^^
diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx
@@ -165,20 +165,8 @@ cdef class Seen(object):
                      two conflict cases was also detected. However, we are
                      trying to force conversion to a numeric dtype.
         """
-        if self.uint_ and (self.null_ or self.sint_):
-            if not self.coerce_numeric:
-                return True
-
-            if self.null_:
-                msg = ("uint64 array detected, and such an "
-                       "array cannot contain NaN.")
-            else:  # self.sint_ = 1
-                msg = ("uint64 and negative values detected. "
-                       "Cannot safely return a numeric array "
-                       "without truncating data.")
-
-            raise ValueError(msg)
-        return False
+        return (self.uint_ and (self.null_ or self.sint_)
+                and not self.coerce_numeric)
 
     cdef inline saw_null(self):
         """
@@ -1103,10 +1091,17 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
             seen.saw_int(val)
 
             if val >= 0:
-                uints[i] = val
+                if val <= oUINT64_MAX:
+                    uints[i] = val
+                else:
+                    seen.float_ = True
 
             if val <= oINT64_MAX:
                 ints[i] = val
+
+            if seen.sint_ and seen.uint_:
+                seen.float_ = True
+
         elif util.is_bool_object(val):
             floats[i] = uints[i] = ints[i] = bools[i] = val
             seen.bool_ = True
@@ -1154,6 +1149,8 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
                             uints[i] = as_int
                         if as_int <= oINT64_MAX:
                             ints[i] = as_int
+
+                    seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
                 else:
                     seen.float_ = True
             except (TypeError, ValueError) as e:
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -39,6 +39,11 @@
 from pandas.util import testing as tm
 
 
+@pytest.fixture(params=[True, False], ids=lambda val: str(val))
+def coerce(request):
+    return request.param
+
+
 def test_is_sequence():
     is_seq = inference.is_sequence
     assert (is_seq((1, 2)))
@@ -340,44 +345,38 @@ def test_convert_numeric_uint64(self):
         exp = np.array([2**63], dtype=np.uint64)
         tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set()), exp)
 
-    def test_convert_numeric_uint64_nan(self):
-        msg = 'uint64 array detected'
-        cases = [(np.array([2**63, np.nan], dtype=object), set()),
-                 (np.array([str(2**63), np.nan], dtype=object), set()),
-                 (np.array([np.nan, 2**63], dtype=object), set()),
-                 (np.array([np.nan, str(2**63)], dtype=object), set()),
-                 (np.array([2**63, 2**63 + 1], dtype=object), set([2**63])),
-                 (np.array([str(2**63), str(2**63 + 1)],
-                           dtype=object), set([2**63]))]
-
-        for coerce in (True, False):
-            for arr, na_values in cases:
-                if coerce:
-                    with tm.assert_raises_regex(ValueError, msg):
-                        lib.maybe_convert_numeric(arr, na_values,
-                                                  coerce_numeric=coerce)
-                else:
-                    tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
-                        arr, na_values), arr)
-
-    def test_convert_numeric_int64_uint64(self):
-        msg = 'uint64 and negative values detected'
-        cases = [np.array([2**63, -1], dtype=object),
-                 np.array([str(2**63), -1], dtype=object),
-                 np.array([str(2**63), str(-1)], dtype=object),
-                 np.array([-1, 2**63], dtype=object),
-                 np.array([-1, str(2**63)], dtype=object),
-                 np.array([str(-1), str(2**63)], dtype=object)]
-
-        for coerce in (True, False):
-            for case in cases:
-                if coerce:
-                    with tm.assert_raises_regex(ValueError, msg):
-                        lib.maybe_convert_numeric(case, set(),
-                                                  coerce_numeric=coerce)
-                else:
-                    tm.assert_numpy_array_equal(lib.maybe_convert_numeric(
-                        case, set()), case)
+    @pytest.mark.parametrize("arr,na_values", [
+        (np.array([2**63, np.nan], dtype=object), set()),
+        (np.array([str(2**63), np.nan], dtype=object), set()),
+        (np.array([np.nan, 2**63], dtype=object), set()),
+        (np.array([np.nan, str(2**63)], dtype=object), set())])
+    def test_convert_numeric_uint64_nan(self, coerce, arr, na_values):
+        expected = arr.astype(float) if coerce else arr.copy()
+        result = lib.maybe_convert_numeric(arr, na_values,
+                                           coerce_numeric=coerce)
+        tm.assert_almost_equal(result, expected)
+
+    def test_convert_numeric_uint64_nan_values(self, coerce):
+        arr = np.array([2**63, 2**63 + 1], dtype=object)
+        na_values = set([2**63])
+
+        expected = (np.array([np.nan, 2**63 + 1], dtype=float)
+                    if coerce else arr.copy())
+        result = lib.maybe_convert_numeric(arr, na_values,
+                                           coerce_numeric=coerce)
+        tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize("case", [
+        np.array([2**63, -1], dtype=object),
+        np.array([str(2**63), -1], dtype=object),
+        np.array([str(2**63), str(-1)], dtype=object),
+        np.array([-1, 2**63], dtype=object),
+        np.array([-1, str(2**63)], dtype=object),
+        np.array([str(-1), str(2**63)], dtype=object)])
+    def test_convert_numeric_int64_uint64(self, case, coerce):
+        expected = case.astype(float) if coerce else case.copy()
+        result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce)
+        tm.assert_almost_equal(result, expected)
 
     def test_maybe_convert_objects_uint64(self):
         # see gh-4471
diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py
@@ -381,3 +381,20 @@ def test_downcast_limits(self):
         for dtype, downcast, min_max in dtype_downcast_min_max:
             series = pd.to_numeric(pd.Series(min_max), downcast=downcast)
             assert series.dtype == dtype
+
+    def test_coerce_uint64_conflict(self):
+        # see gh-17007 and gh-17125
+        #
+        # Still returns float despite the uint64-nan conflict,
+        # which would normally force the casting to object.
+        df = pd.DataFrame({"a": [200, 300, "", "NaN", 30000000000000000000]})
+        expected = pd.Series([200, 300, np.nan, np.nan,
+                              30000000000000000000], dtype=float, name="a")
+        result = to_numeric(df["a"], errors="coerce")
+        tm.assert_series_equal(expected, result)
+
+        s = pd.Series(["12345678901234567890", "1234567890", "ITEM"])
+        expected = pd.Series([12345678901234567890,
+                              1234567890, np.nan], dtype=float)
+        result = to_numeric(s, errors="coerce")
+        tm.assert_series_equal(expected, result)