Backport PR #55691 on branch 2.1.x (REGR: fix read_parquet with column of large strings (avoid overflow from concat)) (#55706)

meeseeksmachine · jorisvandenbossche · web-flow · commit 16a3b01647c5 · 2023-10-26T12:05:45.000-04:00
Backport PR #55691: REGR: fix read_parquet with column of large strings (avoid overflow from concat) Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
@@ -28,6 +28,7 @@ Fixed regressions
 - Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`)
 - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`)
 - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`)
+- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`)
 - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`)
 - Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`)
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -223,11 +223,19 @@ def __from_arrow__(
                 # pyarrow.ChunkedArray
                 chunks = array.chunks
 
+            results = []
+            for arr in chunks:
+                # convert chunk by chunk to numpy and concatenate then, to avoid
+                # overflow for large string data when concatenating the pyarrow arrays
+                arr = arr.to_numpy(zero_copy_only=False)
+                arr = ensure_string_array(arr, na_value=libmissing.NA)
+                results.append(arr)
+
         if len(chunks) == 0:
             arr = np.array([], dtype=object)
         else:
-            arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
-            arr = ensure_string_array(arr, na_value=libmissing.NA)
+            arr = np.concatenate(results)
+
         # Bypass validation inside StringArray constructor, see GH#47781
         new_string_array = StringArray.__new__(StringArray)
         NDArrayBacked.__init__(
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -1144,6 +1144,18 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
         )
         tm.assert_frame_equal(result, expected)
 
+    # NOTE: this test is not run by default, because it requires a lot of memory (>5GB)
+    # @pytest.mark.slow
+    # def test_string_column_above_2GB(self, tmp_path, pa):
+    #     # https://github.com/pandas-dev/pandas/issues/55606
+    #     # above 2GB of string data
+    #     v1 = b"x" * 100000000
+    #     v2 = b"x" * 147483646
+    #     df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string")
+    #     df.to_parquet(tmp_path / "test.parquet")
+    #     result = read_parquet(tmp_path / "test.parquet")
+    #     assert result["strings"].dtype == "string"
+
 
 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):