Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix overflow bugs in date_Range #24255

Merged
merged 12 commits into from
Dec 23, 2018
188 changes: 188 additions & 0 deletions pandas/core/arrays/_ranges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# -*- coding: utf-8 -*-
"""
Helper functions to generate range-like data for DatetimeArray
(and possibly TimedeltaArray/PeriodArray)
"""

import numpy as np

from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp

from pandas.tseries.offsets import Tick, generate_range


def generate_regular_range(start, end, periods, freq):
"""
Generate a range of dates with the spans between dates described by
the given `freq` DateOffset.

Parameters
----------
start : Timestamp or None
first point of produced date range
end : Timestamp or None
last point of produced date range
periods : int
number of periods in produced date range
freq : DateOffset
describes space between dates in produced date range

Returns
-------
ndarray[np.int64] representing nanosecond unix timestamps
"""
if isinstance(freq, Tick):
stride = freq.nanos
if periods is None:
b = Timestamp(start).value
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = (b + (Timestamp(end).value - b) // stride * stride +
stride // 2 + 1)
# end.tz == start.tz by this point due to _generate implementation
tz = start.tz
elif start is not None:
b = Timestamp(start).value
e = _generate_range_overflow_safe(b, periods, stride, side='start')
tz = start.tz
elif end is not None:
e = Timestamp(end).value + stride
b = _generate_range_overflow_safe(e, periods, stride, side='end')
tz = end.tz
else:
raise ValueError("at least 'start' or 'end' should be specified "
"if a 'period' is given.")

with np.errstate(over="raise"):
# If the range is sufficiently large, np.arange may overflow
# and incorrectly return an empty array if not caught.
try:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment on why you need this

values = np.arange(b, e, stride, dtype=np.int64)
except FloatingPointError:
xdr = [b]
while xdr[-1] != e:
xdr.append(xdr[-1] + stride)
values = np.array(xdr[:-1], dtype=np.int64)

else:
tz = None
# start and end should have the same timezone by this point
if start is not None:
tz = start.tz
elif end is not None:
tz = end.tz

xdr = generate_range(start=start, end=end,
periods=periods, offset=freq)

values = np.array([x.value for x in xdr], dtype=np.int64)

return values, tz


def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
as OutOfBoundsDatetime.

Parameters
----------
endpoint : int
nanosecond timestamp of the known endpoint of the desired range
periods : int
number of periods in the desired range
stride : int
nanoseconds between periods in the desired range
side : {'start', 'end'}
which end of the range `endpoint` refers to

Returns
-------
other_end : int

Raises
------
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ['start', 'end']

i64max = np.uint64(np.iinfo(np.int64).max)
msg = ('Cannot generate range with {side}={endpoint} and '
'periods={periods}'
.format(side=side, endpoint=endpoint, periods=periods))

with np.errstate(over="raise"):
# if periods * strides cannot be multiplied within the *uint64* bounds,
# we cannot salvage the operation by recursing, so raise
try:
addend = np.uint64(periods) * np.uint64(np.abs(stride))
except FloatingPointError:
raise OutOfBoundsDatetime(msg)

if np.abs(addend) <= i64max:
# relatively easy case without casting concerns
return _generate_range_overflow_safe_signed(
endpoint, periods, stride, side)

elif ((endpoint > 0 and side == 'start' and stride > 0) or
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't this case already hendled in _generate_range_overflow_safe_signed? l169

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. The check on 169 is specific to the case where periods * stride doesn't overflow int64 bounds. The check here on 127 is for the case where it does.

That said, this check could be removed. It is a fast-path to fail early.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

k make sure that all of these paths are hit and tested - make as simple as possible

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just double-checked: every branch in here is hit in the tests

(endpoint < 0 and side == 'end' and stride > 0)):
# no chance of not-overflowing
raise OutOfBoundsDatetime(msg)

elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max):
# in _generate_regular_range we added `stride` thereby overflowing
# the bounds. Adjust to fix this.
return _generate_range_overflow_safe(endpoint - stride,
periods - 1, stride, side)

# split into smaller pieces
mid_periods = periods // 2
remaining = periods - mid_periods
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)

midpoint = _generate_range_overflow_safe(endpoint, mid_periods,
stride, side)
return _generate_range_overflow_safe(midpoint, remaining, stride, side)


def _generate_range_overflow_safe_signed(endpoint, periods, stride, side):
"""
A special case for _generate_range_overflow_safe where `periods * stride`
can be calculated without overflowing int64 bounds.
"""
assert side in ['start', 'end']
if side == 'end':
stride *= -1

with np.errstate(over="raise"):
addend = np.int64(periods) * np.int64(stride)
try:
# easy case with no overflows
return np.int64(endpoint) + addend
except (FloatingPointError, OverflowError):
# with endpoint negative and addend positive we risk
# FloatingPointError; with reversed signed we risk OverflowError
pass

# if stride and endpoint had opposite signs, then endpoint + addend
# should never overflow. so they must have the same signs
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)

if stride > 0:
# watch out for very special case in which we just slightly
# exceed implementation bounds, but when passing the result to
# np.arange will get a result slightly within the bounds
assert endpoint >= 0
result = np.uint64(endpoint) + np.uint64(addend)
i64max = np.uint64(np.iinfo(np.int64).max)
assert result > i64max
if result <= i64max + np.uint64(stride):
return result

raise OutOfBoundsDatetime('Cannot generate range with '
'{side}={endpoint} and '
'periods={periods}'
.format(side=side, endpoint=endpoint,
periods=periods))
107 changes: 4 additions & 103 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@
from pandas.core import ops
from pandas.core.algorithms import checked_add_with_arr
from pandas.core.arrays import datetimelike as dtl
from pandas.core.arrays._ranges import generate_regular_range
import pandas.core.common as com

from pandas.tseries.frequencies import get_period_alias, to_offset
from pandas.tseries.offsets import Day, Tick, generate_range
from pandas.tseries.offsets import Day, Tick

_midnight = time(0, 0)

Expand Down Expand Up @@ -306,7 +307,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
if end is not None:
end = end.tz_localize(None)
# TODO: consider re-implementing _cached_range; GH#17914
index = _generate_regular_range(cls, start, end, periods, freq)
values, _tz = generate_regular_range(start, end, periods, freq)
index = cls._simple_new(values, freq=freq, tz=_tz)

if tz is not None and index.tz is None:
arr = conversion.tz_localize_to_utc(
Expand Down Expand Up @@ -1715,107 +1717,6 @@ def maybe_convert_dtype(data, copy):
return data, copy


def _generate_regular_range(cls, start, end, periods, freq):
"""
Generate a range of dates with the spans between dates described by
the given `freq` DateOffset.

Parameters
----------
cls : class
start : Timestamp or None
first point of produced date range
end : Timestamp or None
last point of produced date range
periods : int
number of periods in produced date range
freq : DateOffset
describes space between dates in produced date range

Returns
-------
ndarray[np.int64] representing nanosecond unix timestamps

"""
if isinstance(freq, Tick):
stride = freq.nanos
if periods is None:
b = Timestamp(start).value
# cannot just use e = Timestamp(end) + 1 because arange breaks when
# stride is too large, see GH10887
e = (b + (Timestamp(end).value - b) // stride * stride +
stride // 2 + 1)
# end.tz == start.tz by this point due to _generate implementation
tz = start.tz
elif start is not None:
b = Timestamp(start).value
e = _generate_range_overflow_safe(b, periods, stride, side='start')
tz = start.tz
elif end is not None:
e = Timestamp(end).value + stride
b = _generate_range_overflow_safe(e, periods, stride, side='end')
tz = end.tz
else:
raise ValueError("at least 'start' or 'end' should be specified "
"if a 'period' is given.")

values = np.arange(b, e, stride, dtype=np.int64)

else:
tz = None
# start and end should have the same timezone by this point
if start is not None:
tz = start.tz
elif end is not None:
tz = end.tz

xdr = generate_range(start=start, end=end,
periods=periods, offset=freq)

values = np.array([x.value for x in xdr], dtype=np.int64)

data = cls._simple_new(values, freq=freq, tz=tz)
return data


def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):
"""
Calculate the second endpoint for passing to np.arange, checking
to avoid an integer overflow. Catch OverflowError and re-raise
as OutOfBoundsDatetime.

Parameters
----------
endpoint : int
periods : int
stride : int
side : {'start', 'end'}

Returns
-------
other_end : int

Raises
------
OutOfBoundsDatetime
"""
# GH#14187 raise instead of incorrectly wrapping around
assert side in ['start', 'end']
if side == 'end':
stride *= -1

try:
other_end = checked_add_with_arr(np.int64(endpoint),
np.int64(periods) * stride)
except OverflowError:
raise tslib.OutOfBoundsDatetime('Cannot generate range with '
'{side}={endpoint} and '
'periods={periods}'
.format(side=side, endpoint=endpoint,
periods=periods))
return other_end


# -------------------------------------------------------------------
# Validation and Inference

Expand Down
61 changes: 61 additions & 0 deletions pandas/tests/indexes/datetimes/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,67 @@ def test_date_range_nat(self):
with pytest.raises(ValueError, match=msg):
date_range(start=pd.NaT, end='2016-01-01', freq='D')

def test_date_range_multiplication_overflow(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so 2 test cases fully exercise all of this code???

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call, I'm tracking down untested cases now.

# GH#24255
# check that overflows in calculating `addend = periods * stride`
# are caught
with tm.assert_produces_warning(None):
# we should _not_ be seeing a overflow RuntimeWarning
dti = date_range(start='1677-09-22', periods=213503, freq='D')

assert dti[0] == Timestamp('1677-09-22')
assert len(dti) == 213503

msg = "Cannot generate range with"
with pytest.raises(OutOfBoundsDatetime, match=msg):
date_range('1969-05-04', periods=200000000, freq='30000D')

def test_date_range_unsigned_overflow_handling(self):
# GH#24255
# case where `addend = periods * stride` overflows int64 bounds
# but not uint64 bounds
dti = date_range(start='1677-09-22', end='2262-04-11', freq='D')

dti2 = date_range(start=dti[0], periods=len(dti), freq='D')
assert dti2.equals(dti)

dti3 = date_range(end=dti[-1], periods=len(dti), freq='D')
assert dti3.equals(dti)

def test_date_range_int64_overflow_non_recoverable(self):
# GH#24255
# case with start later than 1970-01-01, overflow int64 but not uint64
msg = "Cannot generate range with"
with pytest.raises(OutOfBoundsDatetime, match=msg):
date_range(start='1970-02-01', periods=106752 * 24, freq='H')

# case with end before 1970-01-01, overflow int64 but not uint64
with pytest.raises(OutOfBoundsDatetime, match=msg):
date_range(end='1969-11-14', periods=106752 * 24, freq='H')

def test_date_range_int64_overflow_stride_endpoint_different_signs(self):
# cases where stride * periods overflow int64 and stride/endpoint
# have different signs
start = Timestamp('2262-02-23')
end = Timestamp('1969-11-14')

expected = date_range(start=start, end=end, freq='-1H')
assert expected[0] == start
assert expected[-1] == end

dti = date_range(end=end, periods=len(expected), freq='-1H')
tm.assert_index_equal(dti, expected)

start2 = Timestamp('1970-02-01')
end2 = Timestamp('1677-10-22')

expected2 = date_range(start=start2, end=end2, freq='-1H')
assert expected2[0] == start2
assert expected2[-1] == end2

dti2 = date_range(start=start2, periods=len(expected2), freq='-1H')
tm.assert_index_equal(dti2, expected2)

def test_date_range_out_of_bounds(self):
# GH#14187
with pytest.raises(OutOfBoundsDatetime):
Expand Down