From 23643305b2b42c3f76f6982c3b98960283fee594 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 18:22:19 -0400 Subject: [PATCH 01/13] CLN: rank_1d followup --- pandas/_libs/algos.pyx | 86 +++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 047eb848b7540..acbe114db82c6 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -951,8 +951,9 @@ def rank_1d( ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels + bint keep_na, at_end, next_val_diff, check_labels, set_as_na rank_t nan_fill_val + float computed_rank tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -1037,11 +1038,8 @@ def rank_1d( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if not at_end: - next_val_diff = are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) - else: - next_val_diff = True + next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], + masked_vals[lexsort_indexer[i+1]]) if (next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) @@ -1051,28 +1049,32 @@ def rank_1d( ): # if keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN - grp_na_count = dups + set_as_na = keep_na and mask[lexsort_indexer[i]] + + # For all cases except TIEBREAK_FIRST when not setting + # nulls, we set the same value at each index + if set_as_na: + computed_rank = NaN + grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups + computed_rank = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 + computed_rank = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + else: for j in range(i - dups + 1, i + 1): if ascending: out[lexsort_indexer[j]] = j + 1 - grp_start else: - out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start - elif tiebreak == TIEBREAK_DENSE: + out[lexsort_indexer[j]] = \ + (2 * i - j - dups + 2 - grp_start) + + if set_as_na or tiebreak != TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen + out[lexsort_indexer[j]] = computed_rank # look forward to the next value (using the sorting in _as) # if the value does not equal the current value then we need to @@ -1083,7 +1085,6 @@ def rank_1d( ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 - grp_tie_count += 1 # Similar to the previous conditional, check now if we are # moving to a new group. If so, keep track of the index where @@ -1102,10 +1103,9 @@ def rank_1d( else: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ - (grp_tie_count - (grp_na_count > 0)) + (grp_vals_seen - 1 - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 - grp_tie_count = 0 grp_start = i + 1 grp_vals_seen = 1 else: @@ -1124,11 +1124,8 @@ def rank_1d( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if not at_end: - next_val_diff = (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) - else: - next_val_diff = True + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] + != masked_vals[lexsort_indexer[i+1]]) if (next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) @@ -1138,29 +1135,32 @@ def rank_1d( ): # if keep_na, check for missing values and assign back # to the result where appropriate - if keep_na and mask[lexsort_indexer[i]]: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = NaN - grp_na_count = dups + set_as_na = keep_na and mask[lexsort_indexer[i]] + + # For all cases except TIEBREAK_FIRST when not setting + # nulls, we set the same value at each index + if set_as_na: + computed_rank = NaN + grp_na_count = dups elif tiebreak == TIEBREAK_AVERAGE: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = sum_ranks / dups + computed_rank = sum_ranks / dups elif tiebreak == TIEBREAK_MIN: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start - dups + 2 + computed_rank = i - grp_start - dups + 2 elif tiebreak == TIEBREAK_MAX: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = i - grp_start + 1 - elif tiebreak == TIEBREAK_FIRST: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + else: for j in range(i - dups + 1, i + 1): if ascending: out[lexsort_indexer[j]] = j + 1 - grp_start else: out[lexsort_indexer[j]] = \ (2 * i - j - dups + 2 - grp_start) - elif tiebreak == TIEBREAK_DENSE: + + if set_as_na or tiebreak != TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = grp_vals_seen + out[lexsort_indexer[j]] = computed_rank # look forward to the next value (using the sorting in # lexsort_indexer) if the value does not equal the current @@ -1171,7 +1171,6 @@ def rank_1d( ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 - grp_tie_count += 1 # Similar to the previous conditional, check now if we are # moving to a new group. If so, keep track of the index where @@ -1189,10 +1188,9 @@ def rank_1d( else: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ - (grp_tie_count - (grp_na_count > 0)) + (grp_vals_seen - 1 - (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 - grp_tie_count = 0 grp_start = i + 1 grp_vals_seen = 1 From 999d8802bd0044ba37f589eb3b0ef7ea4b17916a Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 19:00:35 -0400 Subject: [PATCH 02/13] WIP --- pandas/_libs/algos.pyx | 47 ++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index acbe114db82c6..caec06d498312 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -963,6 +963,7 @@ def rank_1d( assert len(labels) == N out = np.empty(N) grp_sizes = np.ones(N) + # If all 0 labels, can short-circuit later label # comparisons check_labels = np.any(labels) @@ -1026,6 +1027,7 @@ def rank_1d( if rank_t is object: for i in range(N): at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset # when either of those change @@ -1033,20 +1035,23 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 + next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], + masked_vals[lexsort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) + # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) + if (next_val_diff or group_changed + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - if (next_val_diff - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) - or (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - ): # if keep_na, check for missing values and assign back # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] @@ -1092,10 +1097,7 @@ def rank_1d( # decrement that from their position. fill in the size of each # group encountered (used by pct calculations later). also be # sure to reset any of the items helping to calculate dups - if (at_end or - (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]]))): + if group_changed: if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ @@ -1112,6 +1114,7 @@ def rank_1d( with nogil: for i in range(N): at_end = i == N - 1 + # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset # when either of those change @@ -1119,14 +1122,20 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] + != masked_vals[lexsort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) + # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) - if (next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) or (check_labels @@ -1137,8 +1146,8 @@ def rank_1d( # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] - # For all cases except TIEBREAK_FIRST when not setting - # nulls, we set the same value at each index + # For all cases except TIEBREAK_FIRST and a non-null value, + # we set the same value at each index if set_as_na: computed_rank = NaN grp_na_count = dups @@ -1178,9 +1187,7 @@ def rank_1d( # decrement that from their position. fill in the size of each # group encountered (used by pct calculations later). also be # sure to reset any of the items helping to calculate dups - if at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])): + if group_changed: if tiebreak != TIEBREAK_DENSE: for j in range(grp_start, i + 1): grp_sizes[lexsort_indexer[j]] = \ From d360871bd03f62c4fd245ea1d026f2e804c86af9 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 19:13:43 -0400 Subject: [PATCH 03/13] WIP --- pandas/_libs/algos.pyx | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index caec06d498312..cb968d9e8b1e6 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -951,7 +951,7 @@ def rank_1d( ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels, set_as_na + bint keep_na, at_end, next_val_diff, check_labels, set_as_na, group_changed rank_t nan_fill_val float computed_rank @@ -1086,8 +1086,7 @@ def rank_1d( # reset the dups and sum_ranks, knowing that a new value is # coming up. the conditional also needs to handle nan equality # and the end of iteration - if next_val_diff or (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]]): + if next_val_diff or not group_changed: dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1122,32 +1121,29 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] - != masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != + masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help group_changed = at_end or (check_labels and - (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) + (labels[lexsort_indexer[i]] + != labels[lexsort_indexer[i+1]])) # Update out only when there is a transition of values or labels. # When a new value or group is encountered, go back #dups steps( # the number of occurrence of current value) and assign the ranks # based on the starting index of the current group (grp_start) # and the current index - if (next_val_diff - or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]) - or (check_labels - and (labels[lexsort_indexer[i]] - != labels[lexsort_indexer[i+1]])) - ): + if (next_val_diff or group_changed + or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): + # if keep_na, check for missing values and assign back # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] - # For all cases except TIEBREAK_FIRST and a non-null value, - # we set the same value at each index + # For all cases except TIEBREAK_FIRST when not setting + # nulls, we set the same value at each index if set_as_na: computed_rank = NaN grp_na_count = dups @@ -1171,13 +1167,12 @@ def rank_1d( for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = computed_rank - # look forward to the next value (using the sorting in - # lexsort_indexer) if the value does not equal the current - # value then we need to reset the dups and sum_ranks, - # knowing that a new value is coming up. the conditional - # also needs to handle nan equality and the end of iteration - if next_val_diff or (mask[lexsort_indexer[i]] - ^ mask[lexsort_indexer[i+1]]): + # look forward to the next value (using the sorting in _as) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. the conditional also needs to handle nan equality + # and the end of iteration + if next_val_diff or not group_changed: dups = sum_ranks = 0 grp_vals_seen += 1 From 0aaeee71b6cda176fc89946b32f333d382381a7d Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 19:21:22 -0400 Subject: [PATCH 04/13] WIP --- pandas/_libs/algos.pyx | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index cb968d9e8b1e6..d06ab47c116e9 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -985,6 +985,8 @@ def rank_1d( else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) + # If ascending is true and na_option == 'bottom', + # fill with the largest so NaN if ascending ^ (na_option == 'top'): if rank_t is object: nan_fill_val = Infinity() @@ -1030,13 +1032,12 @@ def rank_1d( # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers + # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], - masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != + masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1052,7 +1053,7 @@ def rank_1d( if (next_val_diff or group_changed or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - # if keep_na, check for missing values and assign back + # If keep_na, check for missing values and assign back # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] @@ -1081,11 +1082,15 @@ def rank_1d( for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = computed_rank - # look forward to the next value (using the sorting in _as) + # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality + # coming up. The conditional also needs to handle nan equality # and the end of iteration + + # This condition is equivalent to `next_val_diff or + # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` + # Helps potentially avoid 2 mask lookups if next_val_diff or not group_changed: dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1093,8 +1098,8 @@ def rank_1d( # Similar to the previous conditional, check now if we are # moving to a new group. If so, keep track of the index where # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). Also be # sure to reset any of the items helping to calculate dups if group_changed: if tiebreak != TIEBREAK_DENSE: @@ -1116,8 +1121,7 @@ def rank_1d( # dups and sum_ranks will be incremented each loop where # the value / group remains the same, and should be reset - # when either of those change - # Used to calculate tiebreakers + # when either of those change. Used to calculate tiebreakers dups += 1 sum_ranks += i - grp_start + 1 @@ -1138,7 +1142,7 @@ def rank_1d( if (next_val_diff or group_changed or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]])): - # if keep_na, check for missing values and assign back + # If keep_na, check for missing values and assign back # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] @@ -1167,11 +1171,15 @@ def rank_1d( for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = computed_rank - # look forward to the next value (using the sorting in _as) + # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is - # coming up. the conditional also needs to handle nan equality + # coming up. The conditional also needs to handle nan equality # and the end of iteration + + # This condition is equivalent to `next_val_diff or + # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` + # Helps potentially avoid 2 mask lookups if next_val_diff or not group_changed: dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1179,8 +1187,8 @@ def rank_1d( # Similar to the previous conditional, check now if we are # moving to a new group. If so, keep track of the index where # the new group occurs, so the tiebreaker calculations can - # decrement that from their position. fill in the size of each - # group encountered (used by pct calculations later). also be + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). Also be # sure to reset any of the items helping to calculate dups if group_changed: if tiebreak != TIEBREAK_DENSE: From fe6495a289b97d21a5bb559b09813eebe2781213 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 19:54:33 -0400 Subject: [PATCH 05/13] Add comments, whitespace --- pandas/_libs/algos.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d06ab47c116e9..d24c1452be9c4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -985,8 +985,9 @@ def rank_1d( else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) - # If ascending is true and na_option == 'bottom', - # fill with the largest so NaN + # If ascending and na_option == 'bottom' or descending and + # na_option == 'top' -> we want to rank NaN as the highest + # so fill with the maximum value for the type if ascending ^ (na_option == 'top'): if rank_t is object: nan_fill_val = Infinity() @@ -997,6 +998,8 @@ def rank_1d( else: nan_fill_val = np.inf order = (masked_vals, mask, labels) + + # Otherwise, fill with the lowest value of the type else: if rank_t is object: nan_fill_val = NegInfinity() @@ -1036,8 +1039,8 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != - masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or are_diff(masked_vals[lexsort_indexer[i]], + masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1058,7 +1061,7 @@ def rank_1d( set_as_na = keep_na and mask[lexsort_indexer[i]] # For all cases except TIEBREAK_FIRST when not setting - # nulls, we set the same value at each index + # nulls, we can set the same value at each index if set_as_na: computed_rank = NaN grp_na_count = dups From 8fae616f2edfd6dc2936f23c8273ad7e006fdf06 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 20:17:35 -0400 Subject: [PATCH 06/13] Simplify conditional --- pandas/_libs/algos.pyx | 82 ++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d24c1452be9c4..a016dbcc07280 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1060,19 +1060,25 @@ def rank_1d( # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] - # For all cases except TIEBREAK_FIRST when not setting - # nulls, we can set the same value at each index - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen + # For all cases except TIEBREAK_FIRST for non-null values + # we set the same value at each index + if set_as_na or tiebreak != TIEBREAK_FIRST: + if set_as_na: + computed_rank = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + computed_rank = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + computed_rank = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = computed_rank + + # Otherwise, we need to iterate a compute a rank per index else: for j in range(i - dups + 1, i + 1): if ascending: @@ -1081,10 +1087,6 @@ def rank_1d( out[lexsort_indexer[j]] = \ (2 * i - j - dups + 2 - grp_start) - if set_as_na or tiebreak != TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is @@ -1149,19 +1151,25 @@ def rank_1d( # to the result where appropriate set_as_na = keep_na and mask[lexsort_indexer[i]] - # For all cases except TIEBREAK_FIRST when not setting - # nulls, we set the same value at each index - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen + # For all cases except TIEBREAK_FIRST for non-null values + # we set the same value at each index + if set_as_na or tiebreak != TIEBREAK_FIRST: + if set_as_na: + computed_rank = NaN + grp_na_count = dups + elif tiebreak == TIEBREAK_AVERAGE: + computed_rank = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + computed_rank = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + computed_rank = i - grp_start + 1 + elif tiebreak == TIEBREAK_DENSE: + computed_rank = grp_vals_seen + + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = computed_rank + + # Otherwise, we need to iterate a compute a rank per index else: for j in range(i - dups + 1, i + 1): if ascending: @@ -1170,15 +1178,11 @@ def rank_1d( out[lexsort_indexer[j]] = \ (2 * i - j - dups + 2 - grp_start) - if set_as_na or tiebreak != TIEBREAK_FIRST: - for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - - # Look forward to the next value (using the sorting in lexsort_indexer) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. The conditional also needs to handle nan equality - # and the end of iteration + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration # This condition is equivalent to `next_val_diff or # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` From f9479e377d5ae3c7ee19db84a85816d25b2b7ffd Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sat, 20 Mar 2021 20:35:02 -0400 Subject: [PATCH 07/13] Remove unused var --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index a016dbcc07280..b03d06bfba98b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -946,7 +946,7 @@ def rank_1d( cdef: TiebreakEnumType tiebreak Py_ssize_t i, j, N, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0 ndarray[int64_t, ndim=1] lexsort_indexer ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals From a2bea3d933113f1af2253b9b411a1d94de191361 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Sun, 21 Mar 2021 01:07:07 -0400 Subject: [PATCH 08/13] Avoid compiler warning --- pandas/_libs/algos.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index b03d06bfba98b..ca0b1c19aec60 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -953,7 +953,7 @@ def rank_1d( ndarray[uint8_t, ndim=1] mask bint keep_na, at_end, next_val_diff, check_labels, set_as_na, group_changed rank_t nan_fill_val - float computed_rank + float64_t computed_rank = 0 tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' From ba5dc7cc6ab1e1a103dcef775acf3992f5775981 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 22 Mar 2021 17:58:39 -0400 Subject: [PATCH 09/13] Simplify changes --- pandas/_libs/algos.pyx | 129 ++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 72 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ca0b1c19aec60..281d2b9a38b2f 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -951,11 +951,14 @@ def rank_1d( ndarray[float64_t, ndim=1] grp_sizes, out ndarray[rank_t, ndim=1] masked_vals ndarray[uint8_t, ndim=1] mask - bint keep_na, at_end, next_val_diff, check_labels, set_as_na, group_changed + bint keep_na, at_end, next_val_diff, check_labels, group_changed rank_t nan_fill_val - float64_t computed_rank = 0 tiebreak = tiebreakers[ties_method] + if tiebreak == TIEBREAK_FIRST: + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + keep_na = na_option == 'keep' N = len(values) @@ -1058,45 +1061,36 @@ def rank_1d( # If keep_na, check for missing values and assign back # to the result where appropriate - set_as_na = keep_na and mask[lexsort_indexer[i]] - - # For all cases except TIEBREAK_FIRST for non-null values - # we set the same value at each index - if set_as_na or tiebreak != TIEBREAK_FIRST: - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen - + if keep_na and mask[lexsort_indexer[i]]: + grp_na_count = dups for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - - # Otherwise, we need to iterate a compute a rank per index - else: + out[lexsort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - if ascending: - out[lexsort_indexer[j]] = j + 1 - grp_start - else: - out[lexsort_indexer[j]] = \ - (2 * i - j - dups + 2 - grp_start) + out[lexsort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = j + 1 - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = grp_vals_seen # Look forward to the next value (using the sorting in lexsort_indexer) # if the value does not equal the current value then we need to # reset the dups and sum_ranks, knowing that a new value is # coming up. The conditional also needs to handle nan equality # and the end of iteration - - # This condition is equivalent to `next_val_diff or - # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` - # Helps potentially avoid 2 mask lookups - if next_val_diff or not group_changed: + if next_val_diff or (mask[lexsort_indexer[i]] + ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 @@ -1130,8 +1124,8 @@ def rank_1d( dups += 1 sum_ranks += i - grp_start + 1 - next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] != - masked_vals[lexsort_indexer[i+1]]) + next_val_diff = at_end or (masked_vals[lexsort_indexer[i]] + != masked_vals[lexsort_indexer[i+1]]) # We'll need this check later anyway to determine group size, so just # compute it here since shortcircuiting won't help @@ -1149,45 +1143,36 @@ def rank_1d( # If keep_na, check for missing values and assign back # to the result where appropriate - set_as_na = keep_na and mask[lexsort_indexer[i]] - - # For all cases except TIEBREAK_FIRST for non-null values - # we set the same value at each index - if set_as_na or tiebreak != TIEBREAK_FIRST: - if set_as_na: - computed_rank = NaN - grp_na_count = dups - elif tiebreak == TIEBREAK_AVERAGE: - computed_rank = sum_ranks / dups - elif tiebreak == TIEBREAK_MIN: - computed_rank = i - grp_start - dups + 2 - elif tiebreak == TIEBREAK_MAX: - computed_rank = i - grp_start + 1 - elif tiebreak == TIEBREAK_DENSE: - computed_rank = grp_vals_seen - + if keep_na and mask[lexsort_indexer[i]]: + grp_na_count = dups for j in range(i - dups + 1, i + 1): - out[lexsort_indexer[j]] = computed_rank - - # Otherwise, we need to iterate a compute a rank per index - else: + out[lexsort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: for j in range(i - dups + 1, i + 1): - if ascending: - out[lexsort_indexer[j]] = j + 1 - grp_start - else: - out[lexsort_indexer[j]] = \ - (2 * i - j - dups + 2 - grp_start) - - # Look forward to the next value (using the sorting in - # lexsort_indexer). If the value does not equal the current - # value then we need to reset the dups and sum_ranks, knowing - # that a new value is coming up. The conditional also needs - # to handle nan equality and the end of iteration - - # This condition is equivalent to `next_val_diff or - # (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]))` - # Helps potentially avoid 2 mask lookups - if next_val_diff or not group_changed: + out[lexsort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = i - grp_start + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = j + 1 - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[lexsort_indexer[j]] = grp_vals_seen + + # Look forward to the next value (using the sorting in lexsort_indexer) + # if the value does not equal the current value then we need to + # reset the dups and sum_ranks, knowing that a new value is + # coming up. The conditional also needs to handle nan equality + # and the end of iteration + if next_val_diff or (mask[lexsort_indexer[i]] + ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 grp_vals_seen += 1 From f6a04b79377bb94dab67a9ab938c1e5939d16325 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Mon, 22 Mar 2021 18:08:02 -0400 Subject: [PATCH 10/13] precommit fixup --- pandas/_libs/algos.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 281d2b9a38b2f..4ae134b42b243 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1166,11 +1166,11 @@ def rank_1d( for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = grp_vals_seen - # Look forward to the next value (using the sorting in lexsort_indexer) - # if the value does not equal the current value then we need to - # reset the dups and sum_ranks, knowing that a new value is - # coming up. The conditional also needs to handle nan equality - # and the end of iteration + # Look forward to the next value (using the sorting in + # lexsort_indexer) if the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration if next_val_diff or (mask[lexsort_indexer[i]] ^ mask[lexsort_indexer[i+1]]): dups = sum_ranks = 0 From e1df693920fd44fc3a1c3064ab260e7046b1571b Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 23 Mar 2021 10:42:23 -0400 Subject: [PATCH 11/13] Update ascending, na_option comment --- pandas/_libs/algos.pyx | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 4ae134b42b243..2de8e91198ee1 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -988,9 +988,12 @@ def rank_1d( else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) - # If ascending and na_option == 'bottom' or descending and - # na_option == 'top' -> we want to rank NaN as the highest - # so fill with the maximum value for the type + # If `na_option == 'top'`, we want to assign the lowest rank + # to NaN regardless of ascending/descending. So if ascending, + # fill with lowest value of type to end up with lowest rank. + # If descending, fill with highest value since descending + # will flip the ordering to still end up with lowest rank. + # Symmetric logic applies to `na_option == 'bottom'` if ascending ^ (na_option == 'top'): if rank_t is object: nan_fill_val = Infinity() @@ -1001,8 +1004,6 @@ def rank_1d( else: nan_fill_val = np.inf order = (masked_vals, mask, labels) - - # Otherwise, fill with the lowest value of the type else: if rank_t is object: nan_fill_val = NegInfinity() From 93b071765777a4f49893d3180db436fa73239f05 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 23 Mar 2021 13:28:44 -0400 Subject: [PATCH 12/13] Explain tiebreak first behavior --- pandas/_libs/algos.pyx | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 2de8e91198ee1..da2b501ca4941 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1075,9 +1075,17 @@ def rank_1d( elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n+1, n+2...n+m elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n+m, n+(m-1)...n+1. This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start @@ -1157,9 +1165,17 @@ def rank_1d( elif tiebreak == TIEBREAK_MAX: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n + 1, n + 2 ... n + m elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n + m, n + (m - 1) ... n + 1. This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = 2 * i - j - dups + 2 - grp_start From c04562dc9591d7453261bead70076ed34ab200f1 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 23 Mar 2021 13:30:04 -0400 Subject: [PATCH 13/13] Consistent spacing --- pandas/_libs/algos.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index da2b501ca4941..cda20e536c11c 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1078,13 +1078,13 @@ def rank_1d( # With n as the previous rank in the group and m as the number # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, - # then rankings should be n+1, n+2...n+m + # then rankings should be n + 1, n + 2 ... n + m elif tiebreak == TIEBREAK_FIRST: for j in range(i - dups + 1, i + 1): out[lexsort_indexer[j]] = j + 1 - grp_start # If TIEBREAK_FIRST and descending, the ranking should be - # n+m, n+(m-1)...n+1. This is equivalent to + # n + m, n + (m - 1) ... n + 1. This is equivalent to # (i - dups + 1) + (i - j + 1) - grp_start elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1):