Skip to content

Commit 595ae8c

Browse files
authored
ENH: parse 8 or 9 digit delimited dates (pandas-dev#47894)
parse 8 or 9 digit delimited dates
1 parent 72da7b2 commit 595ae8c

File tree

3 files changed

+66
-7
lines changed

3 files changed

+66
-7
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,7 @@ Other Deprecations
770770
- Deprecated the argument ``na_sentinel`` in :func:`factorize`, :meth:`Index.factorize`, and :meth:`.ExtensionArray.factorize`; pass ``use_na_sentinel=True`` instead to use the sentinel ``-1`` for NaN values and ``use_na_sentinel=False`` instead of ``na_sentinel=None`` to encode NaN values (:issue:`46910`)
771771
- Deprecated :meth:`DataFrameGroupBy.transform` not aligning the result when the UDF returned DataFrame (:issue:`45648`)
772772
- Clarified warning from :func:`to_datetime` when delimited dates can't be parsed in accordance with the specified ``dayfirst`` argument (:issue:`46210`)
773+
- Emit warning from :func:`to_datetime` when delimited dates can't be parsed in accordance with the specified ``dayfirst`` argument, even for dates where the leading zero is omitted (e.g. ``31/1/2001``) (:issue:`47880`)
773774
- Deprecated :class:`Series` and :class:`Resampler` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) raising a ``NotImplementedError`` when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
774775
- Deprecated :meth:`Series.rank` returning an empty result when the dtype is non-numeric and ``numeric_only=True`` is provided; this will raise a ``TypeError`` in a future version (:issue:`47500`)
775776
- Deprecated argument ``errors`` for :meth:`Series.mask`, :meth:`Series.where`, :meth:`DataFrame.mask`, and :meth:`DataFrame.where` as ``errors`` had no effect on these methods (:issue:`47728`)

pandas/_libs/tslibs/parsing.pyx

+32-7
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,14 @@ cdef:
9999
int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12
100100

101101

102-
cdef inline bint _is_not_delimiter(const char ch):
103-
return strchr(delimiters, ch) == NULL
102+
cdef inline bint _is_delimiter(const char ch):
103+
return strchr(delimiters, ch) != NULL
104+
105+
106+
cdef inline int _parse_1digit(const char* s):
107+
cdef int result = 0
108+
result += getdigit_ascii(s[0], -10) * 1
109+
return result
104110

105111

106112
cdef inline int _parse_2digit(const char* s):
@@ -151,18 +157,37 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst):
151157
bint can_swap = 0
152158

153159
buf = get_c_string_buf_and_size(date_string, &length)
154-
if length == 10:
160+
if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
155161
# parsing MM?DD?YYYY and DD?MM?YYYY dates
156-
if _is_not_delimiter(buf[2]) or _is_not_delimiter(buf[5]):
157-
return None, None
158162
month = _parse_2digit(buf)
159163
day = _parse_2digit(buf + 3)
160164
year = _parse_4digit(buf + 6)
161165
reso = 'day'
162166
can_swap = 1
163-
elif length == 7:
167+
elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]):
168+
# parsing M?DD?YYYY and D?MM?YYYY dates
169+
month = _parse_1digit(buf)
170+
day = _parse_2digit(buf + 2)
171+
year = _parse_4digit(buf + 5)
172+
reso = 'day'
173+
can_swap = 1
174+
elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]):
175+
# parsing MM?D?YYYY and DD?M?YYYY dates
176+
month = _parse_2digit(buf)
177+
day = _parse_1digit(buf + 3)
178+
year = _parse_4digit(buf + 5)
179+
reso = 'day'
180+
can_swap = 1
181+
elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]):
182+
# parsing M?D?YYYY and D?M?YYYY dates
183+
month = _parse_1digit(buf)
184+
day = _parse_1digit(buf + 2)
185+
year = _parse_4digit(buf + 4)
186+
reso = 'day'
187+
can_swap = 1
188+
elif length == 7 and _is_delimiter(buf[2]):
164189
# parsing MM?YYYY dates
165-
if buf[2] == b'.' or _is_not_delimiter(buf[2]):
190+
if buf[2] == b'.':
166191
# we cannot reliably tell whether e.g. 10.2010 is a float
167192
# or a date, thus we refuse to parse it here
168193
return None, None

pandas/tests/io/parser/test_parse_dates.py

+33
Original file line numberDiff line numberDiff line change
@@ -1948,6 +1948,39 @@ def test_dayfirst_warnings():
19481948
tm.assert_index_equal(expected, res8)
19491949

19501950

1951+
@pytest.mark.parametrize(
1952+
"date_string, dayfirst",
1953+
[
1954+
pytest.param(
1955+
"31/1/2014",
1956+
False,
1957+
id="second date is single-digit",
1958+
),
1959+
pytest.param(
1960+
"1/31/2014",
1961+
True,
1962+
id="first date is single-digit",
1963+
),
1964+
],
1965+
)
1966+
def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst):
1967+
# GH47880
1968+
initial_value = f"date\n{date_string}"
1969+
expected = DatetimeIndex(
1970+
["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date"
1971+
)
1972+
with tm.assert_produces_warning(
1973+
UserWarning, match=r"may lead to inconsistently parsed dates"
1974+
):
1975+
res = read_csv(
1976+
StringIO(initial_value),
1977+
parse_dates=["date"],
1978+
index_col="date",
1979+
dayfirst=dayfirst,
1980+
).index
1981+
tm.assert_index_equal(expected, res)
1982+
1983+
19511984
@skip_pyarrow
19521985
def test_infer_first_column_as_index(all_parsers):
19531986
# GH#11019

0 commit comments

Comments
 (0)