Skip to content

Commit 1cd077a

Browse files
jbrockmendel authored and jreback committed
BUG: Fix overflow bugs in date_Range (#24255)
1 parent 0c593ae commit 1cd077a

File tree

3 files changed

+253
-103
lines changed

3 files changed

+253
-103
lines changed

pandas/core/arrays/_ranges.py

+188
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Helper functions to generate range-like data for DatetimeArray
4+
(and possibly TimedeltaArray/PeriodArray)
5+
"""
6+
7+
import numpy as np
8+
9+
from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp
10+
11+
from pandas.tseries.offsets import Tick, generate_range
12+
13+
14+
def generate_regular_range(start, end, periods, freq):
    """
    Generate a range of dates with the spans between dates described by
    the given `freq` DateOffset.

    Parameters
    ----------
    start : Timestamp or None
        first point of produced date range
    end : Timestamp or None
        last point of produced date range
    periods : int or None
        number of periods in produced date range
    freq : DateOffset
        describes space between dates in produced date range

    Returns
    -------
    values : ndarray[np.int64]
        nanosecond unix timestamps
    tz : object or None
        the ``tz`` attribute of `start` when `start` is given, otherwise
        that of `end`; None when neither is given (non-Tick `freq` only)

    Raises
    ------
    ValueError
        If `freq` is a Tick, `periods` is given, and both `start` and
        `end` are None.
    """
    if isinstance(freq, Tick):
        stride = freq.nanos
        if periods is None:
            b = Timestamp(start).value
            # cannot just use e = Timestamp(end) + 1 because arange breaks when
            # stride is too large, see GH10887
            e = (b + (Timestamp(end).value - b) // stride * stride +
                 stride // 2 + 1)
            # end.tz == start.tz by this point due to _generate implementation
            tz = start.tz
        elif start is not None:
            b = Timestamp(start).value
            e = _generate_range_overflow_safe(b, periods, stride, side='start')
            tz = start.tz
        elif end is not None:
            e = Timestamp(end).value + stride
            b = _generate_range_overflow_safe(e, periods, stride, side='end')
            tz = end.tz
        else:
            raise ValueError("at least 'start' or 'end' should be specified "
                             "if a 'period' is given.")

        with np.errstate(over="raise"):
            # If the range is sufficiently large, np.arange may overflow
            # and incorrectly return an empty array if not caught.
            try:
                values = np.arange(b, e, stride, dtype=np.int64)
            except FloatingPointError:
                # Rebuild the half-open range [b, e) one step at a time.
                # Plain Python ints are used so that stepping up to `e`
                # (which may lie just outside int64) cannot itself overflow,
                # and an ordering test is used instead of `!= e` so that an
                # `e` that is not an exact multiple of `stride` away from
                # `b` cannot be stepped over, leaving the loop spinning.
                current, stop, step = int(b), int(e), int(stride)
                xdr = []
                while ((step > 0 and current < stop) or
                       (step < 0 and current > stop)):
                    xdr.append(current)
                    current += step
                values = np.array(xdr, dtype=np.int64)

    else:
        tz = None
        # start and end should have the same timezone by this point
        if start is not None:
            tz = start.tz
        elif end is not None:
            tz = end.tz

        xdr = generate_range(start=start, end=end,
                             periods=periods, offset=freq)

        values = np.array([x.value for x in xdr], dtype=np.int64)

    return values, tz
81+
82+
83+
def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):
84+
"""
85+
Calculate the second endpoint for passing to np.arange, checking
86+
to avoid an integer overflow. Catch OverflowError and re-raise
87+
as OutOfBoundsDatetime.
88+
89+
Parameters
90+
----------
91+
endpoint : int
92+
nanosecond timestamp of the known endpoint of the desired range
93+
periods : int
94+
number of periods in the desired range
95+
stride : int
96+
nanoseconds between periods in the desired range
97+
side : {'start', 'end'}
98+
which end of the range `endpoint` refers to
99+
100+
Returns
101+
-------
102+
other_end : int
103+
104+
Raises
105+
------
106+
OutOfBoundsDatetime
107+
"""
108+
# GH#14187 raise instead of incorrectly wrapping around
109+
assert side in ['start', 'end']
110+
111+
i64max = np.uint64(np.iinfo(np.int64).max)
112+
msg = ('Cannot generate range with {side}={endpoint} and '
113+
'periods={periods}'
114+
.format(side=side, endpoint=endpoint, periods=periods))
115+
116+
with np.errstate(over="raise"):
117+
# if periods * strides cannot be multiplied within the *uint64* bounds,
118+
# we cannot salvage the operation by recursing, so raise
119+
try:
120+
addend = np.uint64(periods) * np.uint64(np.abs(stride))
121+
except FloatingPointError:
122+
raise OutOfBoundsDatetime(msg)
123+
124+
if np.abs(addend) <= i64max:
125+
# relatively easy case without casting concerns
126+
return _generate_range_overflow_safe_signed(
127+
endpoint, periods, stride, side)
128+
129+
elif ((endpoint > 0 and side == 'start' and stride > 0) or
130+
(endpoint < 0 and side == 'end' and stride > 0)):
131+
# no chance of not-overflowing
132+
raise OutOfBoundsDatetime(msg)
133+
134+
elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max):
135+
# in _generate_regular_range we added `stride` thereby overflowing
136+
# the bounds. Adjust to fix this.
137+
return _generate_range_overflow_safe(endpoint - stride,
138+
periods - 1, stride, side)
139+
140+
# split into smaller pieces
141+
mid_periods = periods // 2
142+
remaining = periods - mid_periods
143+
assert 0 < remaining < periods, (remaining, periods, endpoint, stride)
144+
145+
midpoint = _generate_range_overflow_safe(endpoint, mid_periods,
146+
stride, side)
147+
return _generate_range_overflow_safe(midpoint, remaining, stride, side)
148+
149+
150+
def _generate_range_overflow_safe_signed(endpoint, periods, stride, side):
151+
"""
152+
A special case for _generate_range_overflow_safe where `periods * stride`
153+
can be calculated without overflowing int64 bounds.
154+
"""
155+
assert side in ['start', 'end']
156+
if side == 'end':
157+
stride *= -1
158+
159+
with np.errstate(over="raise"):
160+
addend = np.int64(periods) * np.int64(stride)
161+
try:
162+
# easy case with no overflows
163+
return np.int64(endpoint) + addend
164+
except (FloatingPointError, OverflowError):
165+
# with endpoint negative and addend positive we risk
166+
# FloatingPointError; with reversed signed we risk OverflowError
167+
pass
168+
169+
# if stride and endpoint had opposite signs, then endpoint + addend
170+
# should never overflow. so they must have the same signs
171+
assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0)
172+
173+
if stride > 0:
174+
# watch out for very special case in which we just slightly
175+
# exceed implementation bounds, but when passing the result to
176+
# np.arange will get a result slightly within the bounds
177+
assert endpoint >= 0
178+
result = np.uint64(endpoint) + np.uint64(addend)
179+
i64max = np.uint64(np.iinfo(np.int64).max)
180+
assert result > i64max
181+
if result <= i64max + np.uint64(stride):
182+
return result
183+
184+
raise OutOfBoundsDatetime('Cannot generate range with '
185+
'{side}={endpoint} and '
186+
'periods={periods}'
187+
.format(side=side, endpoint=endpoint,
188+
periods=periods))

pandas/core/arrays/datetimes.py

+4-103
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,11 @@
2424
from pandas.core import ops
2525
from pandas.core.algorithms import checked_add_with_arr
2626
from pandas.core.arrays import datetimelike as dtl
27+
from pandas.core.arrays._ranges import generate_regular_range
2728
import pandas.core.common as com
2829

2930
from pandas.tseries.frequencies import get_period_alias, to_offset
30-
from pandas.tseries.offsets import Day, Tick, generate_range
31+
from pandas.tseries.offsets import Day, Tick
3132

3233
_midnight = time(0, 0)
3334

@@ -306,7 +307,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
306307
if end is not None:
307308
end = end.tz_localize(None)
308309
# TODO: consider re-implementing _cached_range; GH#17914
309-
index = _generate_regular_range(cls, start, end, periods, freq)
310+
values, _tz = generate_regular_range(start, end, periods, freq)
311+
index = cls._simple_new(values, freq=freq, tz=_tz)
310312

311313
if tz is not None and index.tz is None:
312314
arr = conversion.tz_localize_to_utc(
@@ -1715,107 +1717,6 @@ def maybe_convert_dtype(data, copy):
17151717
return data, copy
17161718

17171719

1718-
def _generate_regular_range(cls, start, end, periods, freq):
1719-
"""
1720-
Generate a range of dates with the spans between dates described by
1721-
the given `freq` DateOffset.
1722-
1723-
Parameters
1724-
----------
1725-
cls : class
1726-
start : Timestamp or None
1727-
first point of produced date range
1728-
end : Timestamp or None
1729-
last point of produced date range
1730-
periods : int
1731-
number of periods in produced date range
1732-
freq : DateOffset
1733-
describes space between dates in produced date range
1734-
1735-
Returns
1736-
-------
1737-
ndarray[np.int64] representing nanosecond unix timestamps
1738-
1739-
"""
1740-
if isinstance(freq, Tick):
1741-
stride = freq.nanos
1742-
if periods is None:
1743-
b = Timestamp(start).value
1744-
# cannot just use e = Timestamp(end) + 1 because arange breaks when
1745-
# stride is too large, see GH10887
1746-
e = (b + (Timestamp(end).value - b) // stride * stride +
1747-
stride // 2 + 1)
1748-
# end.tz == start.tz by this point due to _generate implementation
1749-
tz = start.tz
1750-
elif start is not None:
1751-
b = Timestamp(start).value
1752-
e = _generate_range_overflow_safe(b, periods, stride, side='start')
1753-
tz = start.tz
1754-
elif end is not None:
1755-
e = Timestamp(end).value + stride
1756-
b = _generate_range_overflow_safe(e, periods, stride, side='end')
1757-
tz = end.tz
1758-
else:
1759-
raise ValueError("at least 'start' or 'end' should be specified "
1760-
"if a 'period' is given.")
1761-
1762-
values = np.arange(b, e, stride, dtype=np.int64)
1763-
1764-
else:
1765-
tz = None
1766-
# start and end should have the same timezone by this point
1767-
if start is not None:
1768-
tz = start.tz
1769-
elif end is not None:
1770-
tz = end.tz
1771-
1772-
xdr = generate_range(start=start, end=end,
1773-
periods=periods, offset=freq)
1774-
1775-
values = np.array([x.value for x in xdr], dtype=np.int64)
1776-
1777-
data = cls._simple_new(values, freq=freq, tz=tz)
1778-
return data
1779-
1780-
1781-
def _generate_range_overflow_safe(endpoint, periods, stride, side='start'):
1782-
"""
1783-
Calculate the second endpoint for passing to np.arange, checking
1784-
to avoid an integer overflow. Catch OverflowError and re-raise
1785-
as OutOfBoundsDatetime.
1786-
1787-
Parameters
1788-
----------
1789-
endpoint : int
1790-
periods : int
1791-
stride : int
1792-
side : {'start', 'end'}
1793-
1794-
Returns
1795-
-------
1796-
other_end : int
1797-
1798-
Raises
1799-
------
1800-
OutOfBoundsDatetime
1801-
"""
1802-
# GH#14187 raise instead of incorrectly wrapping around
1803-
assert side in ['start', 'end']
1804-
if side == 'end':
1805-
stride *= -1
1806-
1807-
try:
1808-
other_end = checked_add_with_arr(np.int64(endpoint),
1809-
np.int64(periods) * stride)
1810-
except OverflowError:
1811-
raise tslib.OutOfBoundsDatetime('Cannot generate range with '
1812-
'{side}={endpoint} and '
1813-
'periods={periods}'
1814-
.format(side=side, endpoint=endpoint,
1815-
periods=periods))
1816-
return other_end
1817-
1818-
18191720
# -------------------------------------------------------------------
18201721
# Validation and Inference
18211722

pandas/tests/indexes/datetimes/test_date_range.py

+61
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,67 @@ def test_date_range_nat(self):
8888
with pytest.raises(ValueError, match=msg):
8989
date_range(start=pd.NaT, end='2016-01-01', freq='D')
9090

91+
def test_date_range_multiplication_overflow(self):
    """GH#24255: overflow while computing ``periods * stride`` is caught."""
    first_day = '1677-09-22'
    n_days = 213503

    # Building this range must not emit an overflow RuntimeWarning.
    with tm.assert_produces_warning(None):
        result = date_range(start=first_day, periods=n_days, freq='D')

    assert result[0] == Timestamp(first_day)
    assert len(result) == n_days

    # A product that cannot be represented has to be reported as an error.
    with pytest.raises(OutOfBoundsDatetime,
                       match="Cannot generate range with"):
        date_range('1969-05-04', periods=200000000, freq='30000D')
105+
106+
def test_date_range_unsigned_overflow_handling(self):
    """
    GH#24255: `addend = periods * stride` overflows int64 bounds but
    not uint64 bounds.
    """
    expected = date_range(start='1677-09-22', end='2262-04-11', freq='D')
    n_periods = len(expected)

    # Anchor the same range at its first point ...
    from_start = date_range(start=expected[0], periods=n_periods, freq='D')
    assert from_start.equals(expected)

    # ... and at its last point.
    from_end = date_range(end=expected[-1], periods=n_periods, freq='D')
    assert from_end.equals(expected)
117+
118+
def test_date_range_int64_overflow_non_recoverable(self):
    """GH#24255: overflow of int64 (but not uint64) that must raise."""
    n_hours = 106752 * 24
    pattern = "Cannot generate range with"

    # start later than 1970-01-01
    with pytest.raises(OutOfBoundsDatetime, match=pattern):
        date_range(start='1970-02-01', periods=n_hours, freq='H')

    # end before 1970-01-01
    with pytest.raises(OutOfBoundsDatetime, match=pattern):
        date_range(end='1969-11-14', periods=n_hours, freq='H')
128+
129+
def test_date_range_int64_overflow_stride_endpoint_different_signs(self):
    """
    Cases where stride * periods overflow int64 and stride/endpoint
    have different signs.
    """
    # Range anchored at its `end`.
    late = Timestamp('2262-02-23')
    early = Timestamp('1969-11-14')

    by_bounds = date_range(start=late, end=early, freq='-1H')
    assert by_bounds[0] == late
    assert by_bounds[-1] == early

    by_end = date_range(end=early, periods=len(by_bounds), freq='-1H')
    tm.assert_index_equal(by_end, by_bounds)

    # Range anchored at its `start`.
    late2 = Timestamp('1970-02-01')
    early2 = Timestamp('1677-10-22')

    by_bounds2 = date_range(start=late2, end=early2, freq='-1H')
    assert by_bounds2[0] == late2
    assert by_bounds2[-1] == early2

    by_start = date_range(start=late2, periods=len(by_bounds2), freq='-1H')
    tm.assert_index_equal(by_start, by_bounds2)
151+
91152
def test_date_range_out_of_bounds(self):
92153
# GH#14187
93154
with pytest.raises(OutOfBoundsDatetime):

0 commit comments

Comments
 (0)