diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 26099a94834e8..23675752a4593 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,7 +3,7 @@ intended for public consumption """ from textwrap import dedent -from typing import Dict +from typing import Dict, Optional, Tuple, Union from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -501,9 +501,9 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non Returns ------- - labels : ndarray + codes : ndarray An integer ndarray that's an indexer into `uniques`. - ``uniques.take(labels)`` will have the same values as `values`. + ``uniques.take(codes)`` will have the same values as `values`. uniques : ndarray, Index, or Categorical The unique valid values. When `values` is Categorical, `uniques` is a Categorical. When `values` is some other pandas object, an @@ -525,27 +525,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non ``pd.factorize(values)``. The results are identical for methods like :meth:`Series.factorize`. - >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) - >>> labels + >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b']) + >>> codes array([0, 0, 1, 2, 0]) >>> uniques array(['b', 'a', 'c'], dtype=object) - With ``sort=True``, the `uniques` will be sorted, and `labels` will be + With ``sort=True``, the `uniques` will be sorted, and `codes` will be shuffled so that the relationship is the maintained. - >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) - >>> labels + >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True) + >>> codes array([1, 1, 0, 2, 1]) >>> uniques array(['a', 'b', 'c'], dtype=object) - Missing values are indicated in `labels` with `na_sentinel` + Missing values are indicated in `codes` with `na_sentinel` (``-1`` by default). Note that missing values are never included in `uniques`. - >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) - >>> labels + >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b']) + >>> codes array([ 0, -1, 1, 2, 0]) >>> uniques array(['b', 'a', 'c'], dtype=object) @@ -555,8 +555,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non will differ. For Categoricals, a `Categorical` is returned. >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) - >>> labels, uniques = pd.factorize(cat) - >>> labels + >>> codes, uniques = pd.factorize(cat) + >>> codes array([0, 0, 1]) >>> uniques [a, c] @@ -569,8 +569,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non returned. >>> cat = pd.Series(['a', 'a', 'c']) - >>> labels, uniques = pd.factorize(cat) - >>> labels + >>> codes, uniques = pd.factorize(cat) + >>> codes array([0, 0, 1]) >>> uniques Index(['a', 'c'], dtype='object') @@ -596,7 +596,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non sort=dedent( """\ sort : bool, default False - Sort `uniques` and shuffle `labels` to maintain the + Sort `uniques` and shuffle `codes` to maintain the relationship. """ ), @@ -609,11 +609,17 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non ) @Appender(_shared_docs["factorize"]) @deprecate_kwarg(old_arg_name="order", new_arg_name=None) -def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None): +def factorize( + values, + sort: bool = False, + order=None, + na_sentinel: int = -1, + size_hint: Optional[int] = None, +) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: # Implementation notes: This method is responsible for 3 things # 1.) coercing data to array-like (ndarray, Index, extension array) - # 2.) factorizing labels and uniques - # 3.) Maybe boxing the output in an Index + # 2.) factorizing codes and uniques + # 3.) Maybe boxing the uniques in an Index # # Step 2 is dispatched to extension types (like Categorical). They are # responsible only for factorization. All data coercion, sorting and boxing @@ -624,7 +630,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint= if is_extension_array_dtype(values): values = extract_array(values) - labels, uniques = values.factorize(na_sentinel=na_sentinel) + codes, uniques = values.factorize(na_sentinel=na_sentinel) dtype = original.dtype else: values, dtype = _ensure_data(values) @@ -634,13 +640,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint= else: na_value = None - labels, uniques = _factorize_array( + codes, uniques = _factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value ) if sort and len(uniques) > 0: - uniques, labels = safe_sort( - uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False + uniques, codes = safe_sort( + uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False ) uniques = _reconstruct_data(uniques, dtype, original) @@ -653,7 +659,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint= uniques = Index(uniques) - return labels, uniques + return codes, uniques def value_counts( diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2980f0d4cb906..82dabe735581b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -690,11 +690,11 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra Parameters ---------- na_sentinel : int, default -1 - Value to use in the `labels` array to indicate missing values. + Value to use in the `codes` array to indicate missing values. Returns ------- - labels : ndarray + codes : ndarray An integer NumPy array that's an indexer into the original ExtensionArray. uniques : ExtensionArray @@ -724,12 +724,12 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra # Complete control over factorization. arr, na_value = self._values_for_factorize() - labels, uniques = _factorize_array( + codes, uniques = _factorize_array( arr, na_sentinel=na_sentinel, na_value=na_value ) uniques = self._from_factorized(uniques, self) - return labels, uniques + return codes, uniques _extension_array_shared_docs[ "repeat" diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 075cdf09d531f..14024401ea110 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -710,11 +710,11 @@ def factorize(self, na_sentinel=-1): # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA] # The sparsity on this is backwards from what Sparse would want. Want # ExtensionArray.factorize -> Tuple[EA, EA] - # Given that we have to return a dense array of labels, why bother + # Given that we have to return a dense array of codes, why bother # implementing an efficient factorize? - labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) + codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel) uniques = SparseArray(uniques, dtype=self.dtype) - return labels, uniques + return codes, uniques def value_counts(self, dropna=True): """ diff --git a/pandas/core/base.py b/pandas/core/base.py index ada0159d21e7e..10e7b5d186bba 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1518,7 +1518,7 @@ def memory_usage(self, deep=False): sort=textwrap.dedent( """\ sort : bool, default False - Sort `uniques` and shuffle `labels` to maintain the + Sort `uniques` and shuffle `codes` to maintain the relationship. """ ), diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 1508fef86ae62..1a48ccf85f947 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -11,23 +11,23 @@ def test_factorize(categories, ordered): cat = pd.Categorical( ["b", "b", "a", "c", None], categories=categories, ordered=ordered ) - labels, uniques = pd.factorize(cat) - expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp) + codes, uniques = pd.factorize(cat) + expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp) expected_uniques = pd.Categorical( ["b", "a", "c"], categories=categories, ordered=ordered ) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_categorical_equal(uniques, expected_uniques) def test_factorized_sort(): cat = pd.Categorical(["b", "b", None, "a"]) - labels, uniques = pd.factorize(cat, sort=True) - expected_labels = np.array([1, 1, -1, 0], dtype=np.intp) + codes, uniques = pd.factorize(cat, sort=True) + expected_codes = np.array([1, 1, -1, 0], dtype=np.intp) expected_uniques = pd.Categorical(["a", "b"]) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_categorical_equal(uniques, expected_uniques) @@ -36,13 +36,13 @@ def test_factorized_sort_ordered(): ["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True ) - labels, uniques = pd.factorize(cat, sort=True) - expected_labels = np.array([0, 0, -1, 1], dtype=np.intp) + codes, uniques = pd.factorize(cat, sort=True) + expected_codes = np.array([0, 0, -1, 1], dtype=np.intp) expected_uniques = pd.Categorical( ["b", "a"], categories=["c", "b", "a"], ordered=True ) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_categorical_equal(uniques, expected_uniques) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 6d47b0c1d1f77..973088cb72e7a 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -113,29 +113,29 @@ def test_unique(self, data, box, method): @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize(self, data_for_grouping, na_sentinel): - labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - expected_labels = np.array( + codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + expected_codes = np.array( [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp ) expected_uniques = data_for_grouping.take([0, 4, 7]) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) self.assert_extension_array_equal(uniques, expected_uniques) @pytest.mark.parametrize("na_sentinel", [-1, -2]) def test_factorize_equivalence(self, data_for_grouping, na_sentinel): - l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) - l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel) + codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel) + codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel) - tm.assert_numpy_array_equal(l1, l2) - self.assert_extension_array_equal(u1, u2) + tm.assert_numpy_array_equal(codes_1, codes_2) + self.assert_extension_array_equal(uniques_1, uniques_2) def test_factorize_empty(self, data): - labels, uniques = pd.factorize(data[:0]) - expected_labels = np.array([], dtype=np.intp) + codes, uniques = pd.factorize(data[:0]) + expected_codes = np.array([], dtype=np.intp) expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype) - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) self.assert_extension_array_equal(uniques, expected_uniques) def test_fillna_copy_frame(self, data_missing): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a64501040442d..ef844dd97120a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -32,39 +32,39 @@ class TestFactorize: def test_basic(self): - labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) + codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) - labels, uniques = algos.factorize( + codes, uniques = algos.factorize( ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True ) exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array(["a", "b", "c"], dtype=object) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(range(5)))) + codes, uniques = algos.factorize(list(reversed(range(5)))) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array([4, 3, 2, 1, 0], dtype=np.int64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(range(5))), sort=True) + codes, uniques = algos.factorize(list(reversed(range(5))), sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.0)))) + codes, uniques = algos.factorize(list(reversed(np.arange(5.0)))) exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) - labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) + codes, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64) tm.assert_numpy_array_equal(uniques, exp) @@ -72,16 +72,16 @@ def test_mixed(self): # doc example reshaping.rst x = Series(["A", "A", np.nan, "B", 3.14, np.inf]) - labels, uniques = algos.factorize(x) + codes, uniques = algos.factorize(x) exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = Index(["A", "B", 3.14, np.inf]) tm.assert_index_equal(uniques, exp) - labels, uniques = algos.factorize(x, sort=True) + codes, uniques = algos.factorize(x, sort=True) exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = Index([3.14, np.inf, "A", "B"]) tm.assert_index_equal(uniques, exp) @@ -91,16 +91,16 @@ def test_datelike(self): v1 = Timestamp("20130101 09:00:00.00004") v2 = Timestamp("20130101") x = Series([v1, v1, v1, v2, v2, v1]) - labels, uniques = algos.factorize(x) + codes, uniques = algos.factorize(x) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = DatetimeIndex([v1, v2]) tm.assert_index_equal(uniques, exp) - labels, uniques = algos.factorize(x, sort=True) + codes, uniques = algos.factorize(x, sort=True) exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) exp = DatetimeIndex([v2, v1]) tm.assert_index_equal(uniques, exp) @@ -110,28 +110,28 @@ def test_datelike(self): x = Series([v1, v1, v1, v2, v2, v1]) # periods are not 'sorted' as they are converted back into an index - labels, uniques = algos.factorize(x) + codes, uniques = algos.factorize(x) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) - labels, uniques = algos.factorize(x, sort=True) + codes, uniques = algos.factorize(x, sort=True) exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) # GH 5986 v1 = pd.to_timedelta("1 day 1 min") v2 = pd.to_timedelta("1 day") x = Series([v1, v2, v1, v1, v2, v2, v1]) - labels, uniques = algos.factorize(x) + codes, uniques = algos.factorize(x) exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2])) - labels, uniques = algos.factorize(x, sort=True) + codes, uniques = algos.factorize(x, sort=True) exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp) - tm.assert_numpy_array_equal(labels, exp) + tm.assert_numpy_array_equal(codes, exp) tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1])) def test_factorize_nan(self): @@ -158,7 +158,7 @@ def test_factorize_nan(self): tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel) @pytest.mark.parametrize( - "data,expected_label,expected_level", + "data, expected_codes, expected_uniques", [ ( [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"], @@ -173,14 +173,14 @@ def test_factorize_nan(self): ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]), ], ) - def test_factorize_tuple_list(self, data, expected_label, expected_level): + def test_factorize_tuple_list(self, data, expected_codes, expected_uniques): # GH9454 - result = pd.factorize(data) + codes, uniques = pd.factorize(data) - tm.assert_numpy_array_equal(result[0], np.array(expected_label, dtype=np.intp)) + tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp)) - expected_level_array = com.asarray_tuplesafe(expected_level, dtype=object) - tm.assert_numpy_array_equal(result[1], expected_level_array) + expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object) + tm.assert_numpy_array_equal(uniques, expected_uniques_array) def test_complex_sorting(self): # gh 12666 - check no segfault @@ -197,52 +197,52 @@ def test_complex_sorting(self): def test_float64_factorize(self, writable): data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) data.setflags(write=writable) - exp_labels = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp) - exp_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64) + expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp) + expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_uint64_factorize(self, writable): data = np.array([2 ** 64 - 1, 1, 2 ** 64 - 1], dtype=np.uint64) data.setflags(write=writable) - exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64) + expected_codes = np.array([0, 1, 0], dtype=np.intp) + expected_uniques = np.array([2 ** 64 - 1, 1], dtype=np.uint64) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_int64_factorize(self, writable): data = np.array([2 ** 63 - 1, -2 ** 63, 2 ** 63 - 1], dtype=np.int64) data.setflags(write=writable) - exp_labels = np.array([0, 1, 0], dtype=np.intp) - exp_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64) + expected_codes = np.array([0, 1, 0], dtype=np.intp) + expected_uniques = np.array([2 ** 63 - 1, -2 ** 63], dtype=np.int64) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_string_factorize(self, writable): data = np.array(["a", "c", "a", "b", "c"], dtype=object) data.setflags(write=writable) - exp_labels = np.array([0, 1, 0, 2, 1], dtype=np.intp) - exp_uniques = np.array(["a", "c", "b"], dtype=object) + expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp) + expected_uniques = np.array(["a", "c", "b"], dtype=object) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_object_factorize(self, writable): data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object) data.setflags(write=writable) - exp_labels = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) - exp_uniques = np.array(["a", "c", "b"], dtype=object) + expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp) + expected_uniques = np.array(["a", "c", "b"], dtype=object) - labels, uniques = algos.factorize(data) - tm.assert_numpy_array_equal(labels, exp_labels) - tm.assert_numpy_array_equal(uniques, exp_uniques) + codes, uniques = algos.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. @@ -263,11 +263,11 @@ def test_deprecate_order(self): ) def test_parametrized_factorize_na_value_default(self, data): # arrays that include the NA default for that type, but isn't used. - l, u = algos.factorize(data) + codes, uniques = algos.factorize(data) expected_uniques = data[[0, 1]] - expected_labels = np.array([0, 1, 0], dtype=np.intp) - tm.assert_numpy_array_equal(l, expected_labels) - tm.assert_numpy_array_equal(u, expected_uniques) + expected_codes = np.array([0, 1, 0], dtype=np.intp) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) @pytest.mark.parametrize( "data, na_value", @@ -282,11 +282,11 @@ def test_parametrized_factorize_na_value_default(self, data): ], ) def test_parametrized_factorize_na_value(self, data, na_value): - l, u = algos._factorize_array(data, na_value=na_value) + codes, uniques = algos._factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] - expected_labels = np.array([-1, 0, -1, 1], dtype=np.intp) - tm.assert_numpy_array_equal(l, expected_labels) - tm.assert_numpy_array_equal(u, expected_uniques) + expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("na_sentinel", [-1, -10, 100]) @@ -305,14 +305,14 @@ def test_parametrized_factorize_na_value(self, data, na_value): ids=["numpy_array", "extension_array"], ) def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): - labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) + codes, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: - expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) + expected_codes = np.array([1, 0, na_sentinel, 1], dtype=np.intp) expected_uniques = algos.safe_sort(uniques) else: - expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp) + expected_codes = np.array([0, 1, na_sentinel, 0], dtype=np.intp) expected_uniques = uniques - tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_numpy_array_equal(codes, expected_codes) if isinstance(data, np.ndarray): tm.assert_numpy_array_equal(uniques, expected_uniques) else: diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 1f19f58e80f26..21fed62e51fdf 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -707,9 +707,9 @@ def test_factorize(self): else: exp_arr = np.array(range(len(o)), dtype=np.intp) exp_uniques = o - labels, uniques = o.factorize() + codes, uniques = o.factorize() - tm.assert_numpy_array_equal(labels, exp_arr) + tm.assert_numpy_array_equal(codes, exp_arr) if isinstance(o, Series): tm.assert_index_equal(uniques, Index(orig), check_names=False) else: @@ -736,9 +736,9 @@ def test_factorize_repeated(self): exp_arr = np.array( [5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp ) - labels, uniques = n.factorize(sort=True) + codes, uniques = n.factorize(sort=True) - tm.assert_numpy_array_equal(labels, exp_arr) + tm.assert_numpy_array_equal(codes, exp_arr) if isinstance(o, Series): tm.assert_index_equal( uniques, Index(orig).sort_values(), check_names=False @@ -747,8 +747,8 @@ def test_factorize_repeated(self): tm.assert_index_equal(uniques, o, check_names=False) exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], np.intp) - labels, uniques = n.factorize(sort=False) - tm.assert_numpy_array_equal(labels, exp_arr) + codes, uniques = n.factorize(sort=False) + tm.assert_numpy_array_equal(codes, exp_arr) if isinstance(o, Series): expected = Index(o.iloc[5:10].append(o.iloc[:5]))