-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: Fix overflow bugs in date_Range #24255
Changes from all commits
db4da3b
87acff1
67f98f1
1bf5642
9460f31
e825a08
9af17d4
676d6e0
40c2f5b
018ccca
03a90c4
43d3d83
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Helper functions to generate range-like data for DatetimeArray | ||
(and possibly TimedeltaArray/PeriodArray) | ||
""" | ||
|
||
import numpy as np | ||
|
||
from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp | ||
|
||
from pandas.tseries.offsets import Tick, generate_range | ||
|
||
|
||
def generate_regular_range(start, end, periods, freq): | ||
""" | ||
Generate a range of dates with the spans between dates described by | ||
the given `freq` DateOffset. | ||
|
||
Parameters | ||
---------- | ||
start : Timestamp or None | ||
first point of produced date range | ||
end : Timestamp or None | ||
last point of produced date range | ||
periods : int | ||
number of periods in produced date range | ||
freq : DateOffset | ||
describes space between dates in produced date range | ||
|
||
Returns | ||
------- | ||
ndarray[np.int64] representing nanosecond unix timestamps | ||
""" | ||
if isinstance(freq, Tick): | ||
stride = freq.nanos | ||
if periods is None: | ||
b = Timestamp(start).value | ||
# cannot just use e = Timestamp(end) + 1 because arange breaks when | ||
# stride is too large, see GH10887 | ||
e = (b + (Timestamp(end).value - b) // stride * stride + | ||
stride // 2 + 1) | ||
# end.tz == start.tz by this point due to _generate implementation | ||
tz = start.tz | ||
elif start is not None: | ||
b = Timestamp(start).value | ||
e = _generate_range_overflow_safe(b, periods, stride, side='start') | ||
tz = start.tz | ||
elif end is not None: | ||
e = Timestamp(end).value + stride | ||
b = _generate_range_overflow_safe(e, periods, stride, side='end') | ||
tz = end.tz | ||
else: | ||
raise ValueError("at least 'start' or 'end' should be specified " | ||
"if a 'period' is given.") | ||
|
||
with np.errstate(over="raise"): | ||
# If the range is sufficiently large, np.arange may overflow | ||
# and incorrectly return an empty array if not caught. | ||
try: | ||
values = np.arange(b, e, stride, dtype=np.int64) | ||
except FloatingPointError: | ||
xdr = [b] | ||
while xdr[-1] != e: | ||
xdr.append(xdr[-1] + stride) | ||
values = np.array(xdr[:-1], dtype=np.int64) | ||
|
||
else: | ||
tz = None | ||
# start and end should have the same timezone by this point | ||
if start is not None: | ||
tz = start.tz | ||
elif end is not None: | ||
tz = end.tz | ||
|
||
xdr = generate_range(start=start, end=end, | ||
periods=periods, offset=freq) | ||
|
||
values = np.array([x.value for x in xdr], dtype=np.int64) | ||
|
||
return values, tz | ||
|
||
|
||
def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): | ||
""" | ||
Calculate the second endpoint for passing to np.arange, checking | ||
to avoid an integer overflow. Catch OverflowError and re-raise | ||
as OutOfBoundsDatetime. | ||
|
||
Parameters | ||
---------- | ||
endpoint : int | ||
nanosecond timestamp of the known endpoint of the desired range | ||
periods : int | ||
number of periods in the desired range | ||
stride : int | ||
nanoseconds between periods in the desired range | ||
side : {'start', 'end'} | ||
which end of the range `endpoint` refers to | ||
|
||
Returns | ||
------- | ||
other_end : int | ||
|
||
Raises | ||
------ | ||
OutOfBoundsDatetime | ||
""" | ||
# GH#14187 raise instead of incorrectly wrapping around | ||
assert side in ['start', 'end'] | ||
|
||
i64max = np.uint64(np.iinfo(np.int64).max) | ||
msg = ('Cannot generate range with {side}={endpoint} and ' | ||
'periods={periods}' | ||
.format(side=side, endpoint=endpoint, periods=periods)) | ||
|
||
with np.errstate(over="raise"): | ||
# if periods * strides cannot be multiplied within the *uint64* bounds, | ||
# we cannot salvage the operation by recursing, so raise | ||
try: | ||
addend = np.uint64(periods) * np.uint64(np.abs(stride)) | ||
except FloatingPointError: | ||
raise OutOfBoundsDatetime(msg) | ||
|
||
if np.abs(addend) <= i64max: | ||
# relatively easy case without casting concerns | ||
return _generate_range_overflow_safe_signed( | ||
endpoint, periods, stride, side) | ||
|
||
elif ((endpoint > 0 and side == 'start' and stride > 0) or | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. isn't this case already hendled in _generate_range_overflow_safe_signed? l169 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No. The check on 169 is specific to the case where periods * stride doesn't overflow int64 bounds. The check here on 127 is for the case where it does. That said, this check could be removed. It is a fast-path to fail early. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. k make sure that all of these paths are hit and tested - make as simple as possible There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just double-checked: every branch in here is hit in the tests |
||
(endpoint < 0 and side == 'end' and stride > 0)): | ||
# no chance of not-overflowing | ||
raise OutOfBoundsDatetime(msg) | ||
|
||
elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): | ||
# in _generate_regular_range we added `stride` thereby overflowing | ||
# the bounds. Adjust to fix this. | ||
return _generate_range_overflow_safe(endpoint - stride, | ||
periods - 1, stride, side) | ||
|
||
# split into smaller pieces | ||
mid_periods = periods // 2 | ||
remaining = periods - mid_periods | ||
assert 0 < remaining < periods, (remaining, periods, endpoint, stride) | ||
|
||
midpoint = _generate_range_overflow_safe(endpoint, mid_periods, | ||
stride, side) | ||
return _generate_range_overflow_safe(midpoint, remaining, stride, side) | ||
|
||
|
||
def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): | ||
""" | ||
A special case for _generate_range_overflow_safe where `periods * stride` | ||
can be calculated without overflowing int64 bounds. | ||
""" | ||
assert side in ['start', 'end'] | ||
if side == 'end': | ||
stride *= -1 | ||
|
||
with np.errstate(over="raise"): | ||
addend = np.int64(periods) * np.int64(stride) | ||
try: | ||
# easy case with no overflows | ||
return np.int64(endpoint) + addend | ||
except (FloatingPointError, OverflowError): | ||
# with endpoint negative and addend positive we risk | ||
# FloatingPointError; with reversed signed we risk OverflowError | ||
pass | ||
|
||
# if stride and endpoint had opposite signs, then endpoint + addend | ||
# should never overflow. so they must have the same signs | ||
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0) | ||
|
||
if stride > 0: | ||
# watch out for very special case in which we just slightly | ||
# exceed implementation bounds, but when passing the result to | ||
# np.arange will get a result slightly within the bounds | ||
assert endpoint >= 0 | ||
result = np.uint64(endpoint) + np.uint64(addend) | ||
i64max = np.uint64(np.iinfo(np.int64).max) | ||
assert result > i64max | ||
if result <= i64max + np.uint64(stride): | ||
return result | ||
|
||
raise OutOfBoundsDatetime('Cannot generate range with ' | ||
'{side}={endpoint} and ' | ||
'periods={periods}' | ||
.format(side=side, endpoint=endpoint, | ||
periods=periods)) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -88,6 +88,67 @@ def test_date_range_nat(self): | |
with pytest.raises(ValueError, match=msg): | ||
date_range(start=pd.NaT, end='2016-01-01', freq='D') | ||
|
||
def test_date_range_multiplication_overflow(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so 2 test cases fully exercise all of this code??? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good call, I'm tracking down untested cases now. |
||
# GH#24255 | ||
# check that overflows in calculating `addend = periods * stride` | ||
# are caught | ||
with tm.assert_produces_warning(None): | ||
# we should _not_ be seeing a overflow RuntimeWarning | ||
dti = date_range(start='1677-09-22', periods=213503, freq='D') | ||
|
||
assert dti[0] == Timestamp('1677-09-22') | ||
assert len(dti) == 213503 | ||
|
||
msg = "Cannot generate range with" | ||
with pytest.raises(OutOfBoundsDatetime, match=msg): | ||
date_range('1969-05-04', periods=200000000, freq='30000D') | ||
|
||
def test_date_range_unsigned_overflow_handling(self): | ||
# GH#24255 | ||
# case where `addend = periods * stride` overflows int64 bounds | ||
# but not uint64 bounds | ||
dti = date_range(start='1677-09-22', end='2262-04-11', freq='D') | ||
|
||
dti2 = date_range(start=dti[0], periods=len(dti), freq='D') | ||
assert dti2.equals(dti) | ||
|
||
dti3 = date_range(end=dti[-1], periods=len(dti), freq='D') | ||
assert dti3.equals(dti) | ||
|
||
def test_date_range_int64_overflow_non_recoverable(self): | ||
# GH#24255 | ||
# case with start later than 1970-01-01, overflow int64 but not uint64 | ||
msg = "Cannot generate range with" | ||
with pytest.raises(OutOfBoundsDatetime, match=msg): | ||
date_range(start='1970-02-01', periods=106752 * 24, freq='H') | ||
|
||
# case with end before 1970-01-01, overflow int64 but not uint64 | ||
with pytest.raises(OutOfBoundsDatetime, match=msg): | ||
date_range(end='1969-11-14', periods=106752 * 24, freq='H') | ||
|
||
def test_date_range_int64_overflow_stride_endpoint_different_signs(self): | ||
# cases where stride * periods overflow int64 and stride/endpoint | ||
# have different signs | ||
start = Timestamp('2262-02-23') | ||
end = Timestamp('1969-11-14') | ||
|
||
expected = date_range(start=start, end=end, freq='-1H') | ||
assert expected[0] == start | ||
assert expected[-1] == end | ||
|
||
dti = date_range(end=end, periods=len(expected), freq='-1H') | ||
tm.assert_index_equal(dti, expected) | ||
|
||
start2 = Timestamp('1970-02-01') | ||
end2 = Timestamp('1677-10-22') | ||
|
||
expected2 = date_range(start=start2, end=end2, freq='-1H') | ||
assert expected2[0] == start2 | ||
assert expected2[-1] == end2 | ||
|
||
dti2 = date_range(start=start2, periods=len(expected2), freq='-1H') | ||
tm.assert_index_equal(dti2, expected2) | ||
|
||
def test_date_range_out_of_bounds(self): | ||
# GH#14187 | ||
with pytest.raises(OutOfBoundsDatetime): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
comment on why you need this