-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
API (string dtype): implement hierarchy (NA > NaN, pyarrow > python) for consistent comparisons between different string dtypes #61138
base: main
Are you sure you want to change the base?
Changes from 5 commits
3c4d782
7ffb08f
48907c3
2058120
4ebd93b
33db5d0
51340a9
e2bfe18
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
|
||
from pandas._config import using_string_dtype | ||
|
||
from pandas.compat import HAS_PYARROW | ||
from pandas.compat.pyarrow import ( | ||
pa_version_under12p0, | ||
pa_version_under19p0, | ||
|
@@ -45,6 +46,25 @@ def cls(dtype): | |
return dtype.construct_array_type() | ||
|
||
|
||
def string_dtype_highest_priority(dtype1, dtype2): | ||
if HAS_PYARROW: | ||
DTYPE_HIERARCHY = [ | ||
pd.StringDtype("python", na_value=np.nan), | ||
pd.StringDtype("pyarrow", na_value=np.nan), | ||
pd.StringDtype("python", na_value=pd.NA), | ||
pd.StringDtype("pyarrow", na_value=pd.NA), | ||
] | ||
else: | ||
DTYPE_HIERARCHY = [ | ||
pd.StringDtype("python", na_value=np.nan), | ||
pd.StringDtype("python", na_value=pd.NA), | ||
] | ||
|
||
h1 = DTYPE_HIERARCHY.index(dtype1) | ||
h2 = DTYPE_HIERARCHY.index(dtype2) | ||
return DTYPE_HIERARCHY[max(h1, h2)] | ||
|
||
|
||
def test_dtype_constructor(): | ||
pytest.importorskip("pyarrow") | ||
|
||
|
@@ -319,37 +339,41 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): | |
tm.assert_extension_array_equal(result, expected) | ||
|
||
|
||
def test_comparison_methods_array(comparison_op, dtype): | ||
def test_comparison_methods_array(comparison_op, dtype, dtype2): | ||
op_name = f"__{comparison_op.__name__}__" | ||
|
||
a = pd.array(["a", None, "c"], dtype=dtype) | ||
other = [None, None, "c"] | ||
result = getattr(a, op_name)(other) | ||
if dtype.na_value is np.nan: | ||
other = pd.array([None, None, "c"], dtype=dtype2) | ||
result = comparison_op(a, other) | ||
|
||
# ensure operation is commutative | ||
result2 = comparison_op(other, a) | ||
tm.assert_equal(result, result2) | ||
|
||
if dtype.na_value is np.nan and dtype2.na_value is np.nan: | ||
if operator.ne == comparison_op: | ||
expected = np.array([True, True, False]) | ||
else: | ||
expected = np.array([False, False, False]) | ||
expected[-1] = getattr(other[-1], op_name)(a[-1]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
result = getattr(a, op_name)(pd.NA) | ||
if operator.ne == comparison_op: | ||
expected = np.array([True, True, True]) | ||
else: | ||
max_dtype = string_dtype_highest_priority(dtype, dtype2) | ||
if max_dtype.storage == "python": | ||
expected_dtype = "boolean" | ||
else: | ||
expected = np.array([False, False, False]) | ||
tm.assert_numpy_array_equal(result, expected) | ||
expected_dtype = "bool[pyarrow]" | ||
|
||
else: | ||
expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" | ||
expected = np.full(len(a), fill_value=None, dtype="object") | ||
expected[-1] = getattr(other[-1], op_name)(a[-1]) | ||
expected = pd.array(expected, dtype=expected_dtype) | ||
tm.assert_extension_array_equal(result, expected) | ||
|
||
result = getattr(a, op_name)(pd.NA) | ||
expected = pd.array([None, None, None], dtype=expected_dtype) | ||
tm.assert_extension_array_equal(result, expected) | ||
# # with list | ||
# other = [None, None, "c"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you want to implement the testing for this in this PR? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this was already implemented; I just need to add this case back to the test. The original "array" test was actually testing with a list. I updated the test to now actually use an array (parametrized with all the different dtypes, to get all combinations of dtypes in both operands), and added a separate test with just the list. |
||
# result3 = getattr(a, op_name)(other) | ||
# tm.assert_equal(result, result3) | ||
|
||
|
||
def test_constructor_raises(cls): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
from pandas.api.types import is_string_dtype | ||
from pandas.core.arrays import ArrowStringArray | ||
from pandas.core.arrays.string_ import StringDtype | ||
from pandas.tests.arrays.string_.test_string import string_dtype_highest_priority | ||
from pandas.tests.extension import base | ||
|
||
|
||
|
@@ -202,10 +203,13 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): | |
dtype = cast(StringDtype, tm.get_dtype(obj)) | ||
if op_name in ["__add__", "__radd__"]: | ||
cast_to = dtype | ||
dtype_other = tm.get_dtype(other) if not isinstance(other, str) else None | ||
if isinstance(dtype_other, StringDtype): | ||
cast_to = string_dtype_highest_priority(dtype, dtype_other) | ||
elif dtype.na_value is np.nan: | ||
cast_to = np.bool_ # type: ignore[assignment] | ||
elif dtype.storage == "pyarrow": | ||
cast_to = "boolean[pyarrow]" # type: ignore[assignment] | ||
cast_to = "bool[pyarrow]" # type: ignore[assignment] | ||
else: | ||
cast_to = "boolean" # type: ignore[assignment] | ||
return pointwise_result.astype(cast_to) | ||
|
@@ -237,9 +241,11 @@ def test_arith_series_with_array( | |
using_infer_string | ||
and all_arithmetic_operators == "__radd__" | ||
and ( | ||
(dtype.na_value is pd.NA) or (dtype.storage == "python" and HAS_PYARROW) | ||
dtype.na_value is pd.NA | ||
and not (not HAS_PYARROW and dtype.storage == "python") | ||
) | ||
Comment on lines 241 to 246
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: This would be a bit cleaner as
|
||
): | ||
# TODO(infer_string) | ||
mark = pytest.mark.xfail( | ||
reason="The pointwise operation result will be inferred to " | ||
"string[nan, pyarrow], which does not match the input dtype" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For this case of comparing with NA, we already have a dedicated test just above, so I am removing it here.