Skip to content

Commit 4f36b02

Browse files
authored
ENH: Add support for more placeholders in guess_datetime_format (#43901) (#43900)
Add support for day of week and meridiem placeholders and any combination of placeholders supported by `strftime` that do not correspond to a datetime attribute.
1 parent 2224cf3 commit 4f36b02

File tree

4 files changed

+27
-9
lines changed

4 files changed

+27
-9
lines changed

asv_bench/benchmarks/inference.py

+10
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,16 @@ def time_dup_string_tzoffset_dates(self, cache):
277277
to_datetime(self.dup_string_with_tz, cache=cache)
278278

279279

280+
# GH 43901
281+
class ToDatetimeInferDatetimeFormat:
282+
def setup(self):
283+
rng = date_range(start="1/1/2000", periods=100000, freq="H")
284+
self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist()
285+
286+
def time_infer_datetime_format(self):
287+
to_datetime(self.strings, infer_datetime_format=True)
288+
289+
280290
class ToTimedelta:
281291
def setup(self):
282292
self.ints = np.random.randint(0, 60, size=10000)

doc/source/whatsnew/v1.4.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,7 @@ Performance improvements
407407
- Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
408408
- Performance improvement in :func:`read_stata` (:issue:`43059`)
409409
- Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`)
410+
- Performance improvement in :meth:`to_datetime` with ``infer_datetime_format`` set to ``True`` (:issue:`43901`)
410411
- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`)
411412
- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:`43370`)
412413
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`43469`)

pandas/_libs/tslibs/parsing.pyx

+14-9
Original file line numberDiff line numberDiff line change
@@ -912,6 +912,9 @@ def guess_datetime_format(
912912
(('second', 'microsecond'), '%S.%f', 0),
913913
(('tzinfo',), '%z', 0),
914914
(('tzinfo',), '%Z', 0),
915+
(('day_of_week',), '%a', 0),
916+
(('day_of_week',), '%A', 0),
917+
(('meridiem',), '%p', 0),
915918
]
916919

917920
if dayfirst:
@@ -968,15 +971,17 @@ def guess_datetime_format(
968971
if set(attrs) & found_attrs:
969972
continue
970973

971-
if all(getattr(parsed_datetime, attr) is not None for attr in attrs):
972-
for i, token_format in enumerate(format_guess):
973-
token_filled = tokens[i].zfill(padding)
974-
if (token_format is None and
975-
token_filled == parsed_datetime.strftime(attr_format)):
976-
format_guess[i] = attr_format
977-
tokens[i] = token_filled
978-
found_attrs.update(attrs)
979-
break
974+
if parsed_datetime.tzinfo is None and attr_format in ("%Z", "%z"):
975+
continue
976+
977+
parsed_formatted = parsed_datetime.strftime(attr_format)
978+
for i, token_format in enumerate(format_guess):
979+
token_filled = tokens[i].zfill(padding)
980+
if token_format is None and token_filled == parsed_formatted:
981+
format_guess[i] = attr_format
982+
tokens[i] = token_filled
983+
found_attrs.update(attrs)
984+
break
980985

981986
# Only consider it a valid guess if we have a year, month and day
982987
if len({'year', 'month', 'day'} & found_attrs) != 3:

pandas/tests/tslibs/test_parsing.py

+2
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,8 @@ def test_parsers_month_freq(date_str, expected):
167167
("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
168168
("2011-12-30T00:00:00.000000+09:", None),
169169
("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
170+
("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %H:%M:%S %p"),
171+
("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %H:%M:%S %p"),
170172
],
171173
)
172174
def test_guess_datetime_format_with_parseable_formats(string, fmt):

0 commit comments

Comments
 (0)