Read csv category fix (#18402)

Sam Cohan · jreback · commit d421a09e3821 · 2017-11-22T06:28:41.000-05:00
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
@@ -82,6 +82,7 @@ I/O
 - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.
 - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
 - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
+- Bug in :func:`read_csv` when reading numeric category fields with high cardinality (:issue:`18186`)
 - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
 - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)
 - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -2227,9 +2227,10 @@ def _concatenate_chunks(list chunks):
     for name in names:
         arrs = [chunk.pop(name) for chunk in chunks]
         # Check each arr for consistent types.
-        dtypes = set(a.dtype for a in arrs)
-        if len(dtypes) > 1:
-            common_type = np.find_common_type(dtypes, [])
+        dtypes = {a.dtype for a in arrs}
+        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
+        if len(numpy_dtypes) > 1:
+            common_type = np.find_common_type(numpy_dtypes, [])
             if common_type == np.object:
                 warning_columns.append(str(name))
 
diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py
@@ -114,6 +114,17 @@ def test_categorical_dtype(self):
         actual = self.read_csv(StringIO(data), dtype='category')
         tm.assert_frame_equal(actual, expected)
 
+    @pytest.mark.slow
+    def test_categorical_dtype_high_cardinality_numeric(self):
+        # GH 18186
+        data = np.sort([str(i) for i in range(524289)])
+        expected = DataFrame({'a': Categorical(data, ordered=True)})
+        actual = self.read_csv(StringIO('a\n' + '\n'.join(data)),
+                               dtype='category')
+        actual["a"] = actual["a"].cat.reorder_categories(
+            np.sort(actual.a.cat.categories), ordered=True)
+        tm.assert_frame_equal(actual, expected)
+
     def test_categorical_dtype_encoding(self):
         # GH 10153
         pth = tm.get_data_path('unicode_series.csv')