Skip to content

Commit aacd05a

Browse files
committed
BUG: Better handle larger numbers in to_numeric
* Warn about lossiness when passing really large numbers that exceed (u)int64 ranges. * Coerce negative numbers to float when requested instead of crashing and returning object. * Consistently parse numbers as integers / floats, even if we know that the resulting container has to be float. This is to ensure consistent error behavior when input numbers are too large. Closes gh-24910.
1 parent 95f8dca commit aacd05a

File tree

4 files changed

+145
-15
lines changed

4 files changed

+145
-15
lines changed

doc/source/whatsnew/v0.25.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ Timezones
103103
Numeric
104104
^^^^^^^
105105

106+
- Bug in :meth:`to_numeric` in which large negative numbers were being improperly handled (:issue:`24910`)
107+
- Bug in :meth:`to_numeric` in which numbers were being coerced to float, even though ``errors`` was not ``coerce`` (:issue:`24910`)
106108
-
107109
-
108110
-

pandas/_libs/lib.pyx

+15-10
Original file line numberDiff line numberDiff line change
@@ -1828,7 +1828,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
18281828
except (ValueError, OverflowError, TypeError):
18291829
pass
18301830

1831-
# otherwise, iterate and do full infererence
1831+
# Otherwise, iterate and do full inference.
18321832
cdef:
18331833
int status, maybe_int
18341834
Py_ssize_t i, n = values.size
@@ -1865,10 +1865,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
18651865
else:
18661866
seen.float_ = True
18671867

1868-
if val <= oINT64_MAX:
1868+
if oINT64_MIN <= val <= oINT64_MAX:
18691869
ints[i] = val
18701870

1871-
if seen.sint_ and seen.uint_:
1871+
if val < oINT64_MIN or (seen.sint_ and seen.uint_):
18721872
seen.float_ = True
18731873

18741874
elif util.is_bool_object(val):
@@ -1910,23 +1910,28 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
19101910
else:
19111911
seen.saw_int(as_int)
19121912

1913-
if not (seen.float_ or as_int in na_values):
1913+
if as_int not in na_values:
19141914
if as_int < oINT64_MIN or as_int > oUINT64_MAX:
1915-
raise ValueError('Integer out of range.')
1915+
if seen.coerce_numeric:
1916+
seen.float_ = True
1917+
else:
1918+
raise ValueError("Integer out of range.")
1919+
else:
1920+
if as_int >= 0:
1921+
uints[i] = as_int
19161922

1917-
if as_int >= 0:
1918-
uints[i] = as_int
1919-
if as_int <= oINT64_MAX:
1920-
ints[i] = as_int
1923+
if as_int <= oINT64_MAX:
1924+
ints[i] = as_int
19211925

19221926
seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
19231927
else:
19241928
seen.float_ = True
19251929
except (TypeError, ValueError) as e:
19261930
if not seen.coerce_numeric:
1927-
raise type(e)(str(e) + ' at position {pos}'.format(pos=i))
1931+
raise type(e)(str(e) + " at position {pos}".format(pos=i))
19281932
elif "uint64" in str(e): # Exception from check functions.
19291933
raise
1934+
19301935
seen.saw_null()
19311936
floats[i] = NaN
19321937

pandas/core/tools/numeric.py

+8
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@ def to_numeric(arg, errors='raise', downcast=None):
1919
depending on the data supplied. Use the `downcast` parameter
2020
to obtain other dtypes.
2121
22+
Please note that precision loss may occur if really large numbers
23+
are passed in. Due to the internal limitations of `ndarray`, if
24+
numbers smaller than `-9223372036854775808` or larger than
25+
`18446744073709551615` are passed in, it is very likely they
26+
will be converted to float so that they can be stored in an `ndarray`.
27+
These warnings apply similarly to `Series` since it internally
28+
leverages `ndarray`.
29+
2230
Parameters
2331
----------
2432
arg : scalar, list, tuple, 1-d array, or Series

pandas/tests/tools/test_numeric.py

+120-5
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,11 @@ def test_all_nan():
172172
tm.assert_series_equal(result, expected)
173173

174174

175-
@pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"])
175+
@pytest.fixture(params=[None, "ignore", "raise", "coerce"])
176+
def errors(request):
177+
return request.param
178+
179+
176180
def test_type_check(errors):
177181
# see gh-11776
178182
df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
@@ -183,11 +187,122 @@ def test_type_check(errors):
183187
to_numeric(df, **kwargs)
184188

185189

186-
@pytest.mark.parametrize("val", [
187-
1, 1.1, "1", "1.1", -1.5, "-1.5"
190+
@pytest.fixture(params=[True, False])
191+
def signed(request):
192+
return request.param
193+
194+
195+
@pytest.fixture(params=[lambda x: x, str])
196+
def transform(request):
197+
return request.param
198+
199+
200+
@pytest.mark.parametrize("val", [1, 1.1, 20001])
201+
def test_scalar(val, signed, transform):
202+
val = -val if signed else val
203+
assert to_numeric(transform(val)) == float(val)
204+
205+
206+
@pytest.fixture(params=[
207+
47393996303418497800,
208+
100000000000000000000
188209
])
189-
def test_scalar(val):
190-
assert to_numeric(val) == float(val)
210+
def large_val(request):
211+
return request.param
212+
213+
214+
def test_really_large_scalar(large_val, signed, transform, errors):
215+
# see gh-24910
216+
kwargs = dict(errors=errors) if errors is not None else dict()
217+
val = -large_val if signed else large_val
218+
219+
val = transform(val)
220+
val_is_string = isinstance(val, str)
221+
222+
if val_is_string and errors in (None, "raise"):
223+
msg = "Integer out of range. at position 0"
224+
with pytest.raises(ValueError, match=msg):
225+
to_numeric(val, **kwargs)
226+
else:
227+
expected = float(val) if (errors == "coerce" and
228+
val_is_string) else val
229+
assert tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
230+
231+
232+
@pytest.fixture(params=[True, False])
233+
def multiple_elts(request):
234+
return request.param
235+
236+
237+
def test_really_large_in_arr(large_val, signed, transform,
238+
multiple_elts, errors):
239+
# see gh-24910
240+
kwargs = dict(errors=errors) if errors is not None else dict()
241+
val = -large_val if signed else large_val
242+
val = transform(val)
243+
244+
extra_elt = "string"
245+
arr = [val] + multiple_elts * [extra_elt]
246+
247+
val_is_string = isinstance(val, str)
248+
coercing = errors == "coerce"
249+
250+
if errors in (None, "raise") and (val_is_string or multiple_elts):
251+
if val_is_string:
252+
msg = "Integer out of range. at position 0"
253+
else:
254+
msg = 'Unable to parse string "string" at position 1'
255+
256+
with pytest.raises(ValueError, match=msg):
257+
to_numeric(arr, **kwargs)
258+
else:
259+
result = to_numeric(arr, **kwargs)
260+
261+
exp_val = float(val) if (coercing and val_is_string) else val
262+
expected = [exp_val]
263+
264+
if multiple_elts:
265+
if coercing:
266+
expected.append(np.nan)
267+
exp_dtype = float
268+
else:
269+
expected.append(extra_elt)
270+
exp_dtype = object
271+
else:
272+
exp_dtype = float if isinstance(exp_val, (int, float)) else object
273+
274+
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
275+
276+
277+
def test_really_large_in_arr_consistent(large_val, signed,
278+
multiple_elts, errors):
279+
# see gh-24910
280+
#
281+
# Even if we discover that we have to hold float, that does not mean
282+
# we should be lenient on subsequent elements that fail to be integer.
283+
kwargs = dict(errors=errors) if errors is not None else dict()
284+
arr = [str(-large_val if signed else large_val)]
285+
286+
if multiple_elts:
287+
arr.insert(0, large_val)
288+
289+
if errors in (None, "raise"):
290+
index = int(multiple_elts)
291+
msg = "Integer out of range. at position {index}".format(index=index)
292+
293+
with pytest.raises(ValueError, match=msg):
294+
to_numeric(arr, **kwargs)
295+
else:
296+
result = to_numeric(arr, **kwargs)
297+
298+
if errors == "coerce":
299+
expected = [float(i) for i in arr]
300+
exp_dtype = float
301+
else:
302+
expected = arr
303+
exp_dtype = object
304+
305+
tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
191306

192307

193308
@pytest.mark.parametrize("errors,checker", [

0 commit comments

Comments
 (0)