Skip to content

Commit 17d7ddb

Browse files
gfyoung authored and jreback committed
BUG: Convert uint64 in maybe_convert_numeric
Add handling for `uint64` elements in an array with the following behavior specifications: 1) If `uint64` and `NaN` are both detected, the original input will be returned if `coerce_numeric` is `False`. Otherwise, an `Exception` is raised. 2) If `uint64` and negative numbers are both detected, the original input will be returned if `coerce_numeric` is `False`. Otherwise, an `Exception` is raised. Closes pandas-dev#14982. Partial fix for pandas-dev#14983. Author: gfyoung <gfyoung17@gmail.com> Closes pandas-dev#15005 from gfyoung/maybe-convert-numeric-uint64 and squashes the following commits: c3bd28a [gfyoung] BUG: Convert uint64 in maybe_convert_numeric
1 parent 533e7de commit 17d7ddb

File tree

5 files changed

+265
-24
lines changed

5 files changed

+265
-24
lines changed

asv_bench/benchmarks/inference.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -95,4 +95,23 @@ def setup(self, dtype, downcast):
9595
self.data = self.data_dict[dtype]
9696

9797
def time_downcast(self, dtype, downcast):
98-
pd.to_numeric(self.data, downcast=downcast)
98+
pd.to_numeric(self.data, downcast=downcast)
99+
100+
101+
class MaybeConvertNumeric(object):
102+
103+
def setup(self):
104+
n = 1000000
105+
arr = np.repeat([2**63], n)
106+
arr = arr + np.arange(n).astype('uint64')
107+
arr = np.array([arr[i] if i%2 == 0 else
108+
str(arr[i]) for i in range(n)],
109+
dtype=object)
110+
111+
arr[-1] = -1
112+
self.data = arr
113+
self.na_values = set()
114+
115+
def time_convert(self):
116+
pd.lib.maybe_convert_numeric(self.data, self.na_values,
117+
coerce_numeric=False)

doc/source/whatsnew/v0.20.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -319,5 +319,5 @@ Bug Fixes
319319

320320

321321
- Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`)
322-
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`)
322+
- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`)
323323
- Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)

pandas/io/tests/parser/common.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -944,26 +944,39 @@ def test_int64_overflow(self):
944944
00013007854817840017963235
945945
00013007854817840018860166"""
946946

947+
# 13007854817840016671868 > UINT64_MAX, so this
948+
# will overflow and return object as the dtype.
947949
result = self.read_csv(StringIO(data))
948950
self.assertTrue(result['ID'].dtype == object)
949951

950-
self.assertRaises(OverflowError, self.read_csv,
951-
StringIO(data), converters={'ID': np.int64})
952+
# 13007854817840016671868 > UINT64_MAX, so attempts
953+
# to cast to either int64 or uint64 will result in
954+
# an OverflowError being raised.
955+
for conv in (np.int64, np.uint64):
956+
self.assertRaises(OverflowError, self.read_csv,
957+
StringIO(data), converters={'ID': conv})
952958

953-
# Just inside int64 range: parse as integer
959+
# These numbers fall right inside the int64 range,
960+
# so they should be parsed as integer.
954961
i_max = np.iinfo(np.int64).max
955962
i_min = np.iinfo(np.int64).min
963+
956964
for x in [i_max, i_min]:
957965
result = self.read_csv(StringIO(str(x)), header=None)
958966
expected = DataFrame([x])
959967
tm.assert_frame_equal(result, expected)
960968

961-
# Just outside int64 range: parse as string
969+
# These numbers fall just outside the int64 range,
970+
# so they should be parsed as string.
962971
too_big = i_max + 1
963972
too_small = i_min - 1
973+
964974
for x in [too_big, too_small]:
965975
result = self.read_csv(StringIO(str(x)), header=None)
966-
expected = DataFrame([str(x)])
976+
if self.engine == 'python' and x == too_big:
977+
expected = DataFrame([x])
978+
else:
979+
expected = DataFrame([str(x)])
967980
tm.assert_frame_equal(result, expected)
968981

969982
def test_empty_with_nrows_chunksize(self):

pandas/src/inference.pyx

+173-17
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@ from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX,
1313

1414
# core.common import for fast inference checks
1515

16-
npy_int64_max = np.iinfo(np.int64).max
17-
18-
1916
cpdef bint is_float(object obj):
2017
return util.is_float_object(obj)
2118

@@ -629,21 +626,100 @@ cdef extern from "parse_helper.h":
629626

630627
cdef int64_t iINT64_MAX = <int64_t> INT64_MAX
631628
cdef int64_t iINT64_MIN = <int64_t> INT64_MIN
629+
cdef uint64_t iUINT64_MAX = <uint64_t> UINT64_MAX
630+
631+
632+
cdef inline bint _check_uint64_nan(bint seen_uint, bint seen_null,
633+
bint coerce_numeric) except -1:
634+
"""
635+
Check whether we have encountered uint64 when handling a NaN element.
636+
637+
If uint64 has been encountered, we cannot safely cast to float64 due
638+
to truncation problems (this would occur if we return a numeric array
639+
containing a NaN element).
640+
641+
Returns
642+
-------
643+
return_values : bool
644+
Whether or not we should return the original input array to avoid
645+
data truncation.
646+
"""
647+
if seen_null and seen_uint:
648+
if not coerce_numeric:
649+
return True
650+
else:
651+
raise ValueError("uint64 array detected, and such an "
652+
"array cannot contain NaN.")
653+
654+
return False
632655

633656

634-
def maybe_convert_numeric(object[:] values, set na_values,
657+
cdef inline bint _check_uint64_int64_conflict(bint seen_sint, bint seen_uint,
658+
bint coerce_numeric) except -1:
659+
"""
660+
Check whether we have encountered both int64 and uint64 elements.
661+
662+
If both have been encountered, we cannot safely cast to an integer
663+
dtype since none is large enough to hold both types of elements.
664+
665+
Returns
666+
-------
667+
return_values : bool
668+
Whether or not we should return the original input array to avoid
669+
data truncation.
670+
"""
671+
if seen_sint and seen_uint:
672+
if not coerce_numeric:
673+
return True
674+
else:
675+
raise ValueError("uint64 and negative values detected. "
676+
"Cannot safely return a numeric array "
677+
"without truncating data.")
678+
679+
return False
680+
681+
682+
def maybe_convert_numeric(ndarray[object] values, set na_values,
635683
bint convert_empty=True, bint coerce_numeric=False):
636684
"""
637-
Type inference function-- convert strings to numeric (potentially) and
638-
convert to proper dtype array
685+
Convert object array to a numeric array if possible.
686+
687+
Parameters
688+
----------
689+
values : ndarray
690+
Array of object elements to convert.
691+
na_values : set
692+
Set of values that should be interpreted as NaN.
693+
convert_empty : bool, default True
694+
If an empty array-like object is encountered, whether to interpret
695+
that element as NaN or not. If set to False, a ValueError will be
696+
raised if such an element is encountered and 'coerce_numeric' is False.
697+
coerce_numeric : bool, default False
698+
If initial attempts to convert to numeric have failed, whether to
699+
force conversion to numeric via alternative methods or by setting the
700+
element to NaN. Otherwise, an Exception will be raised when such an
701+
element is encountered.
702+
703+
This boolean also has an impact on how conversion behaves when a
704+
numeric array has no suitable numerical dtype to return (i.e. uint64,
705+
int32, uint8). If set to False, the original object array will be
706+
returned. Otherwise, a ValueError will be raised.
707+
708+
Returns
709+
-------
710+
numeric_array : array of converted object values to numerical ones
639711
"""
640712
cdef:
641713
int status, maybe_int
642714
Py_ssize_t i, n = values.size
643715
ndarray[float64_t] floats = np.empty(n, dtype='f8')
644716
ndarray[complex128_t] complexes = np.empty(n, dtype='c16')
645717
ndarray[int64_t] ints = np.empty(n, dtype='i8')
718+
ndarray[uint64_t] uints = np.empty(n, dtype='u8')
646719
ndarray[uint8_t] bools = np.empty(n, dtype='u1')
720+
bint seen_null = False
721+
bint seen_uint = False
722+
bint seen_sint = False
647723
bint seen_float = False
648724
bint seen_complex = False
649725
bint seen_int = False
@@ -655,22 +731,60 @@ def maybe_convert_numeric(object[:] values, set na_values,
655731
val = values[i]
656732

657733
if val.__hash__ is not None and val in na_values:
734+
seen_null = True
735+
if _check_uint64_nan(seen_uint, seen_null,
736+
coerce_numeric):
737+
return values
738+
658739
floats[i] = complexes[i] = nan
659740
seen_float = True
660741
elif util.is_float_object(val):
742+
if val != val:
743+
seen_null = True
744+
if _check_uint64_nan(seen_uint, seen_null,
745+
coerce_numeric):
746+
return values
747+
661748
floats[i] = complexes[i] = val
662749
seen_float = True
663750
elif util.is_integer_object(val):
664-
floats[i] = ints[i] = val
751+
floats[i] = complexes[i] = val
752+
as_int = int(val)
665753
seen_int = True
754+
755+
seen_uint = seen_uint or (as_int > iINT64_MAX)
756+
seen_sint = seen_sint or (as_int < 0)
757+
758+
if (_check_uint64_nan(seen_uint, seen_null, coerce_numeric) or
759+
_check_uint64_int64_conflict(seen_sint, seen_uint,
760+
coerce_numeric)):
761+
return values
762+
763+
if seen_uint:
764+
uints[i] = as_int
765+
elif seen_sint:
766+
ints[i] = as_int
767+
else:
768+
uints[i] = as_int
769+
ints[i] = as_int
666770
elif util.is_bool_object(val):
667-
floats[i] = ints[i] = bools[i] = val
771+
floats[i] = uints[i] = ints[i] = bools[i] = val
668772
seen_bool = True
669773
elif val is None:
774+
seen_null = True
775+
if _check_uint64_nan(seen_uint, seen_null,
776+
coerce_numeric):
777+
return values
778+
670779
floats[i] = complexes[i] = nan
671780
seen_float = True
672781
elif hasattr(val, '__len__') and len(val) == 0:
673782
if convert_empty or coerce_numeric:
783+
seen_null = True
784+
if _check_uint64_nan(seen_uint, seen_null,
785+
coerce_numeric):
786+
return values
787+
674788
floats[i] = complexes[i] = nan
675789
seen_float = True
676790
else:
@@ -686,24 +800,61 @@ def maybe_convert_numeric(object[:] values, set na_values,
686800
status = floatify(val, &fval, &maybe_int)
687801

688802
if fval in na_values:
803+
seen_null = True
804+
if _check_uint64_nan(seen_uint, seen_null,
805+
coerce_numeric):
806+
return values
807+
689808
floats[i] = complexes[i] = nan
690809
seen_float = True
691810
else:
811+
if fval != fval:
812+
seen_null = True
813+
if _check_uint64_nan(seen_uint, seen_null,
814+
coerce_numeric):
815+
return values
816+
692817
floats[i] = fval
693818

694-
if not seen_float:
695-
if maybe_int:
696-
as_int = int(val)
819+
if maybe_int:
820+
as_int = int(val)
697821

698-
if as_int <= iINT64_MAX and as_int >= iINT64_MIN:
822+
if as_int in na_values:
823+
seen_float = True
824+
seen_null = True
825+
else:
826+
seen_uint = seen_uint or (as_int > iINT64_MAX)
827+
seen_sint = seen_sint or (as_int < 0)
828+
seen_int = True
829+
830+
if (_check_uint64_nan(seen_uint, seen_null,
831+
coerce_numeric) or
832+
_check_uint64_int64_conflict(seen_sint, seen_uint,
833+
coerce_numeric)):
834+
return values
835+
836+
if not (seen_float or as_int in na_values):
837+
if as_int < iINT64_MIN or as_int > iUINT64_MAX:
838+
raise ValueError('Integer out of range.')
839+
840+
if seen_uint:
841+
uints[i] = as_int
842+
elif seen_sint:
699843
ints[i] = as_int
700844
else:
701-
raise ValueError('integer out of range')
702-
else:
703-
seen_float = True
845+
uints[i] = as_int
846+
ints[i] = as_int
847+
else:
848+
seen_float = True
704849
except (TypeError, ValueError) as e:
705850
if not coerce_numeric:
706851
raise type(e)(str(e) + ' at position {}'.format(i))
852+
elif "uint64" in str(e): # Exception from check functions.
853+
raise
854+
seen_null = True
855+
if _check_uint64_nan(seen_uint, seen_null,
856+
coerce_numeric):
857+
return values
707858

708859
floats[i] = nan
709860
seen_float = True
@@ -713,9 +864,14 @@ def maybe_convert_numeric(object[:] values, set na_values,
713864
elif seen_float:
714865
return floats
715866
elif seen_int:
716-
return ints
867+
if seen_uint:
868+
return uints
869+
else:
870+
return ints
717871
elif seen_bool:
718872
return bools.view(np.bool_)
873+
elif seen_uint:
874+
return uints
719875
return ints
720876

721877

@@ -810,7 +966,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
810966
floats[i] = <float64_t> val
811967
complexes[i] = <double complex> val
812968
if not seen_null:
813-
seen_uint = seen_uint or (int(val) > npy_int64_max)
969+
seen_uint = seen_uint or (int(val) > iINT64_MAX)
814970
seen_sint = seen_sint or (val < 0)
815971

816972
if seen_uint and seen_sint:

0 commit comments

Comments
 (0)