Skip to content

Commit 7073dd1

Browse files
authored
BUG: pd.to_datetime(infer_datetime_format=True) drops timezone (#42068)
1 parent e97e103 commit 7073dd1

File tree

5 files changed

+108
-5
lines changed

5 files changed

+108
-5
lines changed

asv_bench/benchmarks/inference.py

+5
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ def setup(self):
173173
self.strings_tz_space = [
174174
x.strftime("%Y-%m-%d %H:%M:%S") + " -0800" for x in rng
175175
]
176+
self.strings_zero_tz = [x.strftime("%Y-%m-%d %H:%M:%S") + "Z" for x in rng]
176177

177178
def time_iso8601(self):
178179
to_datetime(self.strings)
@@ -189,6 +190,10 @@ def time_iso8601_format_no_sep(self):
189190
def time_iso8601_tz_spaceformat(self):
190191
to_datetime(self.strings_tz_space)
191192

193+
def time_iso8601_infer_zero_tz_fromat(self):
194+
# GH 41047
195+
to_datetime(self.strings_zero_tz, infer_datetime_format=True)
196+
192197

193198
class ToDatetimeNONISO8601:
194199
def setup(self):

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ Timedelta
334334

335335
Timezones
336336
^^^^^^^^^
337+
- Bug in :func:`to_datetime` with ``infer_datetime_format=True`` failing to parse zero UTC offset (``Z``) correctly (:issue:`41047`)
337338
- Bug in :meth:`Series.dt.tz_convert` resetting index in a :class:`Series` with :class:`CategoricalIndex` (:issue:`43080`)
338339
-
339340

pandas/_libs/tslibs/parsing.pyx

+35-5
Original file line numberDiff line numberDiff line change
@@ -845,15 +845,17 @@ def format_is_iso(f: str) -> bint:
845845
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
846846
but must be consistent. Leading 0s in dates and times are optional.
847847
"""
848-
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S.%f'.format
848+
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format
849849
excluded_formats = ['%Y%m%d', '%Y%m', '%Y']
850850

851851
for date_sep in [' ', '/', '\\', '-', '.', '']:
852852
for time_sep in [' ', 'T']:
853-
if (iso_template(date_sep=date_sep,
854-
time_sep=time_sep
855-
).startswith(f) and f not in excluded_formats):
856-
return True
853+
for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']:
854+
if (iso_template(date_sep=date_sep,
855+
time_sep=time_sep,
856+
micro_or_tz=micro_or_tz,
857+
).startswith(f) and f not in excluded_formats):
858+
return True
857859
return False
858860

859861

@@ -907,6 +909,7 @@ def guess_datetime_format(
907909
(('second',), '%S', 2),
908910
(('microsecond',), '%f', 6),
909911
(('second', 'microsecond'), '%S.%f', 0),
912+
(('tzinfo',), '%z', 0),
910913
(('tzinfo',), '%Z', 0),
911914
]
912915

@@ -927,6 +930,33 @@ def guess_datetime_format(
927930
# that any user-provided function will not either.
928931
tokens = dt_str_split(dt_str)
929932

933+
# Normalize offset part of tokens.
934+
# There are multiple formats for the timezone offset.
935+
# To pass the comparison condition between the output of `strftime` and
936+
# joined tokens, which is carried out at the final step of the function,
937+
# the offset part of the tokens must match the '%z' format like '+0900'
938+
# instead of '+09:00'.
939+
if parsed_datetime.tzinfo is not None:
940+
offset_index = None
941+
if len(tokens) > 0 and tokens[-1] == 'Z':
942+
# the last 'Z' means zero offset
943+
offset_index = -1
944+
elif len(tokens) > 1 and tokens[-2] in ('+', '-'):
945+
# ex. [..., '+', '0900']
946+
offset_index = -2
947+
elif len(tokens) > 3 and tokens[-4] in ('+', '-'):
948+
# ex. [..., '+', '09', ':', '00']
949+
offset_index = -4
950+
951+
if offset_index is not None:
952+
# If the input string has a timezone offset like '+0900',
953+
# the offset is separated into two tokens, ex. ['+', '0900'].
954+
# This separation will prevent subsequent processing
955+
# from correctly parsing the time zone format.
956+
# So in addition to the format normalization, we rejoin them here.
957+
tokens[offset_index] = parsed_datetime.strftime("%z")
958+
tokens = tokens[:offset_index + 1 or None]
959+
930960
format_guess = [None] * len(tokens)
931961
found_attrs = set()
932962

pandas/tests/tools/test_to_datetime.py

+17
Original file line numberDiff line numberDiff line change
@@ -2032,6 +2032,23 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset):
20322032
)
20332033
tm.assert_series_equal(result, expected)
20342034

2035+
@pytest.mark.parametrize(
2036+
"ts,zero_tz,is_utc",
2037+
[
2038+
("2019-02-02 08:07:13", "Z", True),
2039+
("2019-02-02 08:07:13", "", False),
2040+
("2019-02-02 08:07:13.012345", "Z", True),
2041+
("2019-02-02 08:07:13.012345", "", False),
2042+
],
2043+
)
2044+
def test_infer_datetime_format_zero_tz(self, ts, zero_tz, is_utc):
2045+
# GH 41047
2046+
s = Series([ts + zero_tz])
2047+
result = to_datetime(s, infer_datetime_format=True)
2048+
tz = pytz.utc if is_utc else None
2049+
expected = Series([Timestamp(ts, tz=tz)])
2050+
tm.assert_series_equal(result, expected)
2051+
20352052
@pytest.mark.parametrize("cache", [True, False])
20362053
def test_to_datetime_iso8601_noleading_0s(self, cache):
20372054
# GH 11871

pandas/tests/tslibs/test_parsing.py

+50
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,28 @@ def test_parsers_month_freq(date_str, expected):
144144
("30-12-2011", "%d-%m-%Y"),
145145
("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"),
146146
("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"),
147+
("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"),
148+
("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"),
149+
("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"),
150+
("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"),
151+
("2011-12-30T00:00:00+090", None),
152+
("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"),
153+
("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"),
154+
("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"),
155+
("2011-12-30T00:00:00+09:000", "%Y-%m-%dT%H:%M:%S%z"),
156+
("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"),
157+
("2011-12-30T00:00:00+09:", None),
158+
("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"),
159+
("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"),
160+
("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"),
161+
("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"),
162+
("2011-12-30T00:00:00.000000+090", None),
163+
("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
164+
("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
165+
("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"),
166+
("2011-12-30T00:00:00.000000+09:000", "%Y-%m-%dT%H:%M:%S.%f%z"),
167+
("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
168+
("2011-12-30T00:00:00.000000+09:", None),
147169
("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
148170
],
149171
)
@@ -226,3 +248,31 @@ def test_parse_time_string_check_instance_type_raise_exception():
226248
result = parse_time_string("2019")
227249
expected = (datetime(2019, 1, 1), "year")
228250
assert result == expected
251+
252+
253+
@pytest.mark.parametrize(
254+
"fmt,expected",
255+
[
256+
("%Y %m %d %H:%M:%S", True),
257+
("%Y/%m/%d %H:%M:%S", True),
258+
(r"%Y\%m\%d %H:%M:%S", True),
259+
("%Y-%m-%d %H:%M:%S", True),
260+
("%Y.%m.%d %H:%M:%S", True),
261+
("%Y%m%d %H:%M:%S", True),
262+
("%Y-%m-%dT%H:%M:%S", True),
263+
("%Y-%m-%dT%H:%M:%S%z", True),
264+
("%Y-%m-%dT%H:%M:%S%Z", True),
265+
("%Y-%m-%dT%H:%M:%S.%f", True),
266+
("%Y-%m-%dT%H:%M:%S.%f%z", True),
267+
("%Y-%m-%dT%H:%M:%S.%f%Z", True),
268+
("%Y%m%d", False),
269+
("%Y%m", False),
270+
("%Y", False),
271+
("%Y-%m-%d", True),
272+
("%Y-%m", True),
273+
],
274+
)
275+
def test_is_iso_format(fmt, expected):
276+
# see gh-41047
277+
result = parsing.format_is_iso(fmt)
278+
assert result == expected

0 commit comments

Comments
 (0)