-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG/API: to_datetime preserves UTC offsets when parsing datetime strings #21822
Changes from all commits
ac5a3d1
b81a8e9
6bf46a8
678b337
1917148
581a33e
a1bc8f9
80042e6
7c4135e
0651416
bacb6e3
a2f4aad
d48f341
7efb25c
1d527ff
f89d6b6
d91c63f
1054e8b
d9cdc91
7d04613
031284c
749e62e
23cbf75
1e6f87a
7539bcf
c1f51cd
db75a24
4733ac5
e3db735
2fa681f
5f36c98
d7ff275
4ff7cb3
8463d91
dddc6b3
cca3983
e441be0
a8a65f7
75f9fd9
6052475
f916c69
807a251
1cbd9b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ import numpy as np | |
cnp.import_array() | ||
|
||
import pytz | ||
from dateutil.tz import tzlocal, tzutc as dateutil_utc | ||
|
||
|
||
from util cimport (is_integer_object, is_float_object, is_string_object, | ||
|
@@ -328,7 +329,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): | |
if unit == 'ns': | ||
if issubclass(values.dtype.type, np.integer): | ||
return values.astype('M8[ns]') | ||
return array_to_datetime(values.astype(object), errors=errors) | ||
return array_to_datetime(values.astype(object), errors=errors)[0] | ||
|
||
m = cast_from_unit(None, unit) | ||
|
||
|
@@ -457,21 +458,58 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
dayfirst=False, yearfirst=False, | ||
format=None, utc=None, | ||
require_iso8601=False): | ||
""" | ||
Converts a 1D array of date-like values to a numpy array of either: | ||
1) datetime64[ns] data | ||
2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError | ||
is encountered | ||
|
||
Also returns a pytz.FixedOffset if an array of strings with the same | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In principle other tzinfos could be returned, specifically if it falls back to dateutil There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is specifying that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh right that's true, good catch. Sure I will try to add some tests and functionality to hit the dateutil parser before the object branch. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So I am using a So instead, I am saving the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add Parameters here |
||
timezone offset is passed and utc=True is not passed. Otherwise, None | ||
is returned | ||
|
||
Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric, | ||
strings | ||
|
||
Parameters | ||
---------- | ||
values : ndarray of object | ||
date-like objects to convert | ||
errors : str, default 'raise' | ||
error behavior when parsing | ||
dayfirst : bool, default False | ||
dayfirst parsing behavior when encountering datetime strings | ||
yearfirst : bool, default False | ||
yearfirst parsing behavior when encountering datetime strings | ||
format : str, default None | ||
format of the string to parse | ||
utc : bool, default None | ||
indicator whether the dates should be UTC | ||
require_iso8601 : bool, default False | ||
indicator whether the datetime string should be iso8601 | ||
|
||
Returns | ||
------- | ||
tuple (ndarray, tzoffset) | ||
""" | ||
cdef: | ||
Py_ssize_t i, n = len(values) | ||
object val, py_dt | ||
object val, py_dt, tz, tz_out = None | ||
ndarray[int64_t] iresult | ||
ndarray[object] oresult | ||
npy_datetimestruct dts | ||
bint utc_convert = bool(utc) | ||
bint seen_integer = 0 | ||
bint seen_string = 0 | ||
bint seen_datetime = 0 | ||
bint seen_datetime_offset = 0 | ||
bint is_raise = errors=='raise' | ||
bint is_ignore = errors=='ignore' | ||
bint is_coerce = errors=='coerce' | ||
_TSObject _ts | ||
int out_local=0, out_tzoffset=0 | ||
float offset_seconds | ||
set out_tzoffset_vals = set() | ||
|
||
# specify error conditions | ||
assert is_raise or is_ignore or is_coerce | ||
|
@@ -584,7 +622,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
raise ValueError("time data {val} doesn't match " | ||
"format specified" | ||
.format(val=val)) | ||
return values | ||
return values, tz_out | ||
|
||
try: | ||
py_dt = parse_datetime_string(val, dayfirst=dayfirst, | ||
|
@@ -595,6 +633,30 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
continue | ||
raise TypeError("invalid string coercion to datetime") | ||
|
||
# If the dateutil parser returned tzinfo, capture it | ||
# to check if all arguments have the same tzinfo | ||
tz = py_dt.tzinfo | ||
if tz is not None: | ||
seen_datetime_offset = 1 | ||
if tz == dateutil_utc(): | ||
# dateutil.tz.tzutc has no offset-like attribute | ||
# Just add the 0 offset explicitly | ||
out_tzoffset_vals.add(0) | ||
elif tz == tzlocal(): | ||
# is comparison fails unlike other dateutil.tz | ||
# objects. Also, dateutil.tz.tzlocal has no | ||
# _offset attribute like tzoffset | ||
offset_seconds = tz._dst_offset.total_seconds() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that this is using an internal method and is fragile. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. Suggestion for a better way to do this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I understand what this function is supposed to do, but from what I do understand it looks like a mistake. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My original intent with this section was to collect the dateutil timezone objects in a However since dateutil timezones cannot be hashed (thanks for starting progress on the associated issue btw @pganssle), I instead store the timezone's UTC offsets in seconds instead (which is what I essentially care about). It wasn't too apparent to me if there was a public way to access a dateutil timezone's UTC offset, hence why I am use a private method here. Definitely open to a better method. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think if you can't find a public method you probably shouldn't dive into the private API. There are a few assumptions in here that won't survive future versions of If you want the offset, you should do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you are significantly worried about speed, you can probably use the fact that in |
||
out_tzoffset_vals.add(offset_seconds) | ||
else: | ||
# dateutil.tz.tzoffset objects cannot be hashed | ||
# store the total_seconds() instead | ||
offset_seconds = tz._offset.total_seconds() | ||
out_tzoffset_vals.add(offset_seconds) | ||
else: | ||
# Add a marker for naive string, to track if we are | ||
# parsing mixed naive and aware strings | ||
out_tzoffset_vals.add('naive') | ||
try: | ||
_ts = convert_datetime_to_tsobject(py_dt, None) | ||
iresult[i] = _ts.value | ||
|
@@ -614,8 +676,17 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
# where we left off | ||
value = dtstruct_to_dt64(&dts) | ||
if out_local == 1: | ||
seen_datetime_offset = 1 | ||
# Store the out_tzoffset in seconds | ||
# since we store the total_seconds of | ||
# dateutil.tz.tzoffset objects | ||
out_tzoffset_vals.add(out_tzoffset * 60.) | ||
tz = pytz.FixedOffset(out_tzoffset) | ||
value = tz_convert_single(value, tz, 'UTC') | ||
else: | ||
# Add a marker for naive string, to track if we are | ||
# parsing mixed naive and aware strings | ||
out_tzoffset_vals.add('naive') | ||
iresult[i] = value | ||
try: | ||
check_dts_bounds(&dts) | ||
|
@@ -631,7 +702,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
raise ValueError("time data {val} doesn't " | ||
"match format specified" | ||
.format(val=val)) | ||
return values | ||
return values, tz_out | ||
raise | ||
|
||
else: | ||
|
@@ -657,7 +728,21 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
else: | ||
raise TypeError | ||
|
||
return result | ||
if seen_datetime_offset and not utc_convert: | ||
# GH 17697 | ||
# 1) If all the offsets are equal, return one offset for | ||
# the parsed dates to (maybe) pass to DatetimeIndex | ||
# 2) If the offsets are different, then force the parsing down the | ||
# object path where an array of datetimes | ||
# (with individual dateutil.tzoffsets) are returned | ||
is_same_offsets = len(out_tzoffset_vals) == 1 | ||
if not is_same_offsets: | ||
return array_to_datetime_object(values, is_raise, | ||
dayfirst, yearfirst) | ||
else: | ||
tz_offset = out_tzoffset_vals.pop() | ||
tz_out = pytz.FixedOffset(tz_offset / 60.) | ||
return result, tz_out | ||
except OutOfBoundsDatetime: | ||
if is_raise: | ||
raise | ||
|
@@ -679,36 +764,67 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', | |
oresult[i] = val.item() | ||
else: | ||
oresult[i] = val | ||
return oresult | ||
return oresult, tz_out | ||
except TypeError: | ||
oresult = np.empty(n, dtype=object) | ||
return array_to_datetime_object(values, is_raise, dayfirst, yearfirst) | ||
|
||
for i in range(n): | ||
val = values[i] | ||
if checknull_with_nat(val): | ||
oresult[i] = val | ||
elif is_string_object(val): | ||
|
||
if len(val) == 0 or val in nat_strings: | ||
oresult[i] = 'NaT' | ||
continue | ||
cdef array_to_datetime_object(ndarray[object] values, bint is_raise, | ||
dayfirst=False, yearfirst=False): | ||
""" | ||
Fall back function for array_to_datetime | ||
|
||
try: | ||
oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, | ||
yearfirst=yearfirst) | ||
pydatetime_to_dt64(oresult[i], &dts) | ||
check_dts_bounds(&dts) | ||
except Exception: | ||
if is_raise: | ||
raise | ||
return values | ||
# oresult[i] = val | ||
else: | ||
Attempts to parse datetime strings with dateutil to return an array | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add Parameters |
||
of datetime objects | ||
|
||
Parameters | ||
---------- | ||
values : ndarray of object | ||
date-like objects to convert | ||
is_raise : bool | ||
error behavior when parsing | ||
dayfirst : bool, default False | ||
dayfirst parsing behavior when encountering datetime strings | ||
yearfirst : bool, default False | ||
yearfirst parsing behavior when encountering datetime strings | ||
|
||
Returns | ||
------- | ||
tuple (ndarray, None) | ||
""" | ||
cdef: | ||
Py_ssize_t i, n = len(values) | ||
object val, | ||
ndarray[object] oresult | ||
npy_datetimestruct dts | ||
|
||
oresult = np.empty(n, dtype=object) | ||
|
||
# We return an object array and only attempt to parse: | ||
# 1) NaT or NaT-like values | ||
# 2) datetime strings, which we return as datetime.datetime | ||
for i in range(n): | ||
val = values[i] | ||
if checknull_with_nat(val): | ||
oresult[i] = val | ||
elif is_string_object(val): | ||
if len(val) == 0 or val in nat_strings: | ||
oresult[i] = 'NaT' | ||
continue | ||
try: | ||
oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, | ||
yearfirst=yearfirst) | ||
pydatetime_to_dt64(oresult[i], &dts) | ||
check_dts_bounds(&dts) | ||
except (ValueError, OverflowError): | ||
if is_raise: | ||
raise | ||
return values | ||
|
||
return oresult | ||
return values, None | ||
else: | ||
if is_raise: | ||
raise | ||
return values, None | ||
return oresult, None | ||
|
||
|
||
cdef inline bint _parse_today_now(str val, int64_t* iresult): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this directiive you just be python (rather than ipython here) @TomAugspurger ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at previous whatsnews, looks like either ipython or python works for this directive.