Skip to content

Commit 581bf70

Browse files
authored
BUG: rank_2d raising with mixed dtypes (#38932)
1 parent 26fa853 commit 581bf70

File tree

3 files changed

+33
-40
lines changed

3 files changed

+33
-40
lines changed

doc/source/whatsnew/v1.3.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,8 @@ Numeric
219219
- Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` not retaining numeric ``ExtensionDtype`` columns (:issue:`35340`)
220220
- Bug in :meth:`DataFrame.mode` and :meth:`Series.mode` not keeping consistent integer :class:`Index` for empty input (:issue:`33321`)
221221
- Bug in :meth:`DataFrame.rank` with ``np.inf`` and mixture of ``np.nan`` and ``np.inf`` (:issue:`32593`)
222+
- Bug in :meth:`DataFrame.rank` with ``axis=0`` and columns holding incomparable types raising ``IndexError`` (:issue:`38932`)
223+
-
222224

223225
Conversion
224226
^^^^^^^^^^

pandas/_libs/algos.pyx

+19-40
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ from numpy cimport (
2626
int16_t,
2727
int32_t,
2828
int64_t,
29+
intp_t,
2930
ndarray,
3031
uint8_t,
3132
uint16_t,
@@ -1105,14 +1106,13 @@ def rank_2d(
11051106
Py_ssize_t infs
11061107
ndarray[float64_t, ndim=2] ranks
11071108
ndarray[rank_t, ndim=2] values
1108-
ndarray[int64_t, ndim=2] argsorted
1109+
ndarray[intp_t, ndim=2] argsort_indexer
11091110
ndarray[uint8_t, ndim=2] mask
11101111
rank_t val, nan_value
11111112
float64_t count, sum_ranks = 0.0
11121113
int tiebreak = 0
11131114
int64_t idx
11141115
bint check_mask, condition, keep_na
1115-
const int64_t[:] labels
11161116

11171117
tiebreak = tiebreakers[ties_method]
11181118

@@ -1158,40 +1158,19 @@ def rank_2d(
11581158

11591159
n, k = (<object>values).shape
11601160
ranks = np.empty((n, k), dtype='f8')
1161-
# For compatibility when calling rank_1d
1162-
labels = np.zeros(k, dtype=np.int64)
11631161

1164-
if rank_t is object:
1165-
try:
1166-
_as = values.argsort(1)
1167-
except TypeError:
1168-
values = in_arr
1169-
for i in range(len(values)):
1170-
ranks[i] = rank_1d(
1171-
in_arr[i],
1172-
labels=labels,
1173-
ties_method=ties_method,
1174-
ascending=ascending,
1175-
pct=pct
1176-
)
1177-
if axis == 0:
1178-
return ranks.T
1179-
else:
1180-
return ranks
1162+
if tiebreak == TIEBREAK_FIRST:
1163+
# need to use a stable sort here
1164+
argsort_indexer = values.argsort(axis=1, kind='mergesort')
1165+
if not ascending:
1166+
tiebreak = TIEBREAK_FIRST_DESCENDING
11811167
else:
1182-
if tiebreak == TIEBREAK_FIRST:
1183-
# need to use a stable sort here
1184-
_as = values.argsort(axis=1, kind='mergesort')
1185-
if not ascending:
1186-
tiebreak = TIEBREAK_FIRST_DESCENDING
1187-
else:
1188-
_as = values.argsort(1)
1168+
argsort_indexer = values.argsort(1)
11891169

11901170
if not ascending:
1191-
_as = _as[:, ::-1]
1171+
argsort_indexer = argsort_indexer[:, ::-1]
11921172

1193-
values = _take_2d(values, _as)
1194-
argsorted = _as.astype('i8')
1173+
values = _take_2d(values, argsort_indexer)
11951174

11961175
for i in range(n):
11971176
dups = sum_ranks = infs = 0
@@ -1200,7 +1179,7 @@ def rank_2d(
12001179
count = 0.0
12011180
for j in range(k):
12021181
val = values[i, j]
1203-
idx = argsorted[i, j]
1182+
idx = argsort_indexer[i, j]
12041183
if keep_na and check_mask and mask[i, idx]:
12051184
ranks[i, idx] = NaN
12061185
infs += 1
@@ -1215,38 +1194,38 @@ def rank_2d(
12151194
condition = (
12161195
j == k - 1 or
12171196
are_diff(values[i, j + 1], val) or
1218-
(keep_na and check_mask and mask[i, argsorted[i, j + 1]])
1197+
(keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
12191198
)
12201199
else:
12211200
condition = (
12221201
j == k - 1 or
12231202
values[i, j + 1] != val or
1224-
(keep_na and check_mask and mask[i, argsorted[i, j + 1]])
1203+
(keep_na and check_mask and mask[i, argsort_indexer[i, j + 1]])
12251204
)
12261205

12271206
if condition:
12281207
if tiebreak == TIEBREAK_AVERAGE:
12291208
for z in range(j - dups + 1, j + 1):
1230-
ranks[i, argsorted[i, z]] = sum_ranks / dups
1209+
ranks[i, argsort_indexer[i, z]] = sum_ranks / dups
12311210
elif tiebreak == TIEBREAK_MIN:
12321211
for z in range(j - dups + 1, j + 1):
1233-
ranks[i, argsorted[i, z]] = j - dups + 2
1212+
ranks[i, argsort_indexer[i, z]] = j - dups + 2
12341213
elif tiebreak == TIEBREAK_MAX:
12351214
for z in range(j - dups + 1, j + 1):
1236-
ranks[i, argsorted[i, z]] = j + 1
1215+
ranks[i, argsort_indexer[i, z]] = j + 1
12371216
elif tiebreak == TIEBREAK_FIRST:
12381217
if rank_t is object:
12391218
raise ValueError('first not supported for non-numeric data')
12401219
else:
12411220
for z in range(j - dups + 1, j + 1):
1242-
ranks[i, argsorted[i, z]] = z + 1
1221+
ranks[i, argsort_indexer[i, z]] = z + 1
12431222
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
12441223
for z in range(j - dups + 1, j + 1):
1245-
ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2
1224+
ranks[i, argsort_indexer[i, z]] = 2 * j - z - dups + 2
12461225
elif tiebreak == TIEBREAK_DENSE:
12471226
total_tie_count += 1
12481227
for z in range(j - dups + 1, j + 1):
1249-
ranks[i, argsorted[i, z]] = total_tie_count
1228+
ranks[i, argsort_indexer[i, z]] = total_tie_count
12501229
sum_ranks = dups = 0
12511230
if pct:
12521231
if tiebreak == TIEBREAK_DENSE:

pandas/tests/frame/methods/test_rank.py

+12
Original file line numberDiff line numberDiff line change
@@ -445,3 +445,15 @@ def test_rank_both_inf(self):
445445
expected = DataFrame({"a": [1.0, 2.0, 3.0]})
446446
result = df.rank()
447447
tm.assert_frame_equal(result, expected)
448+
449+
@pytest.mark.parametrize(
450+
"data,expected",
451+
[
452+
({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})),
453+
({"a": [1, 2, "a"]}, DataFrame(index=range(3))),
454+
],
455+
)
456+
def test_rank_mixed_axis_zero(self, data, expected):
457+
df = DataFrame(data)
458+
result = df.rank()
459+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)