Skip to content

Commit 90c66c5

Browse files
Sam Cohan, TomAugspurger
Sam Cohan
authored and committed
Read csv category fix (#18402)
(cherry picked from commit d421a09)
1 parent 0212a25 commit 90c66c5

File tree

3 files changed

+16
-3
lines changed

3 files changed

+16
-3
lines changed

doc/source/whatsnew/v0.21.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ I/O
8282
- Bug in :class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.
8383
- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
8484
- Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
85+
- Bug in :func:`read_csv` when reading numeric category fields with high cardinality (:issue:`18186`)
8586
- Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
8687
- :func:`read_parquet` now allows specifying the columns to read from a parquet file (:issue:`18154`)
8788
- :func:`read_parquet` now allows specifying kwargs which are passed to the respective engine (:issue:`18216`)

pandas/_libs/parsers.pyx

+4-3
Original file line numberDiff line numberDiff line change
@@ -2221,9 +2221,10 @@ def _concatenate_chunks(list chunks):
22212221
for name in names:
22222222
arrs = [chunk.pop(name) for chunk in chunks]
22232223
# Check each arr for consistent types.
2224-
dtypes = set([a.dtype for a in arrs])
2225-
if len(dtypes) > 1:
2226-
common_type = np.find_common_type(dtypes, [])
2224+
dtypes = {a.dtype for a in arrs}
2225+
numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
2226+
if len(numpy_dtypes) > 1:
2227+
common_type = np.find_common_type(numpy_dtypes, [])
22272228
if common_type == np.object:
22282229
warning_columns.append(str(name))
22292230

pandas/tests/io/parser/dtypes.py

+11
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,17 @@ def test_categorical_dtype(self):
114114
actual = self.read_csv(StringIO(data), dtype='category')
115115
tm.assert_frame_equal(actual, expected)
116116

117+
@pytest.mark.slow
118+
def test_categorical_dtype_high_cardinality_numeric(self):
119+
# GH 18186
120+
data = np.sort([str(i) for i in range(524289)])
121+
expected = DataFrame({'a': Categorical(data, ordered=True)})
122+
actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
123+
dtype='category')
124+
actual["a"] = actual["a"].cat.reorder_categories(
125+
np.sort(actual.a.cat.categories), ordered=True)
126+
tm.assert_frame_equal(actual, expected)
127+
117128
def test_categorical_dtype_encoding(self):
118129
# GH 10153
119130
pth = tm.get_data_path('unicode_series.csv')

0 commit comments

Comments
 (0)