Skip to content

Commit 987d5b0

Browse files
committed
BUG: Properly parse unicode usecols names in CSV
Closes gh-13253.
1 parent 01e7872 commit 987d5b0

File tree

3 files changed

+24
-11
lines changed

3 files changed

+24
-11
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -1705,6 +1705,7 @@ I/O
17051705
^^^
17061706

17071707
- Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`)
1708+
- Bug in :func:`read_csv` in which unicode column names passed to ``usecols`` were not being properly recognized with Python 2.x (:issue:`13253`)
17081709
- Bug in :meth:`DataFrame.to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`)
17091710
- Bug in :meth:`DataFrame.to_sql` where a naive :class:`DatetimeIndex` would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`)
17101711
- Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`)

pandas/io/parsers.py

+21-8
Original file line number | Diff line number | Diff line change
@@ -1296,15 +1296,28 @@ def _validate_usecols_arg(usecols):
12961296
if usecols is not None:
12971297
if callable(usecols):
12981298
return usecols, None
1299-
# GH20529, ensure is iterable container but not string.
1300-
elif not is_list_like(usecols):
1299+
1300+
if not is_list_like(usecols):
1301+
# see gh-20529
1302+
#
1303+
# Ensure it is an iterable container but not a string.
13011304
raise ValueError(msg)
1302-
else:
1303-
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
1304-
if usecols_dtype not in ('empty', 'integer',
1305-
'string', 'unicode'):
1306-
raise ValueError(msg)
1307-
return set(usecols), usecols_dtype
1305+
1306+
usecols_dtype = lib.infer_dtype(usecols, skipna=False)
1307+
1308+
if usecols_dtype not in ("empty", "integer",
1309+
"string", "unicode"):
1310+
raise ValueError(msg)
1311+
1312+
usecols = set(usecols)
1313+
1314+
if usecols_dtype == "unicode":
1315+
# see gh-13253
1316+
#
1317+
# Python 2.x compatibility
1318+
usecols = {col.encode("utf-8") for col in usecols}
1319+
1320+
return usecols, usecols_dtype
13081321
return usecols, None
13091322

13101323

pandas/tests/io/parser/test_usecols.py

+2-3
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import pytest
1010

1111
from pandas._libs.tslib import Timestamp
12-
from pandas.compat import PY2, StringIO
12+
from pandas.compat import StringIO
1313

1414
from pandas import DataFrame, Index
1515
import pandas.util.testing as tm
@@ -387,8 +387,7 @@ def test_usecols_with_mixed_encoding_strings(all_parsers, usecols):
387387

388388
@pytest.mark.parametrize("usecols", [
389389
["あああ", "いい"],
390-
pytest.param([u"あああ", u"いい"], marks=pytest.mark.skipif(
391-
PY2, reason="Buggy behavior: see gh-13253"))
390+
[u"あああ", u"いい"]
392391
])
393392
def test_usecols_with_multi_byte_characters(all_parsers, usecols):
394393
data = """あああ,いい,ううう,ええええ

0 commit comments

Comments
 (0)