Skip to content

Commit 59b431f

Browse files
TomAugspurger authored and jreback committed
DOC/TST: Indexing with NA raises (pandas-dev#30308)
1 parent 7b35099 commit 59b431f

File tree

21 files changed

+304
-29
lines changed

21 files changed

+304
-29
lines changed

.pre-commit-config.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ repos:
1111
language: python_venv
1212
additional_dependencies: [flake8-comprehensions>=3.1.0]
1313
- repo: https://github.com/pre-commit/mirrors-isort
14-
rev: v4.3.20
14+
rev: v4.3.21
1515
hooks:
1616
- id: isort
1717
language: python_venv

asv_bench/benchmarks/indexing.py

+4
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ def setup(self):
131131
self.col_scalar = columns[10]
132132
self.bool_indexer = self.df[self.col_scalar] > 0
133133
self.bool_obj_indexer = self.bool_indexer.astype(object)
134+
self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean")
134135

135136
def time_loc(self):
136137
self.df.loc[self.idx_scalar, self.col_scalar]
@@ -144,6 +145,9 @@ def time_boolean_rows(self):
144145
def time_boolean_rows_object(self):
145146
self.df[self.bool_obj_indexer]
146147

148+
def time_boolean_rows_boolean(self):
149+
self.df[self.boolean_indexer]
150+
147151

148152
class DataFrameNumericIndexing:
149153
def setup(self):

doc/source/reference/extensions.rst

+8
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,11 @@ objects.
5959
api.extensions.ExtensionArray.nbytes
6060
api.extensions.ExtensionArray.ndim
6161
api.extensions.ExtensionArray.shape
62+
63+
Additionally, we have some utility methods for ensuring your object
64+
behaves correctly.
65+
66+
.. autosummary::
67+
:toctree: api/
68+
69+
api.indexers.check_bool_array_indexer

doc/source/user_guide/boolean.rst

+23
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,29 @@ Nullable Boolean Data Type
1414

1515
.. versionadded:: 1.0.0
1616

17+
18+
.. _boolean.indexing:
19+
20+
Indexing with NA values
21+
-----------------------
22+
23+
pandas does not allow indexing with NA values. Attempting to do so
24+
will raise a ``ValueError``.
25+
26+
.. ipython:: python
27+
:okexcept:
28+
29+
s = pd.Series([1, 2, 3])
30+
mask = pd.array([True, False, pd.NA], dtype="boolean")
31+
s[mask]
32+
33+
The missing values will need to be explicitly filled with True or False prior
34+
to using the array as a mask.
35+
36+
.. ipython:: python
37+
38+
s[mask.fillna(False)]
39+
1740
.. _boolean.kleene:
1841

1942
Kleene Logical Operations

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -820,6 +820,7 @@ Datetimelike
820820
- Bug in :func:`pandas._config.localization.get_locales` where the ``locale -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`)
821821
- Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`)
822822
- Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`)
823+
- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`)
823824
- Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`)
824825
- Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`)
825826
- Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`)

pandas/api/indexers/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
"""Public API for Rolling Window Indexers"""
2+
from pandas.core.indexers import check_bool_array_indexer # noqa: F401
23
from pandas.core.window.indexers import BaseIndexer # noqa: F401

pandas/core/arrays/boolean.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
from pandas.core import nanops, ops
3030
from pandas.core.algorithms import take
3131
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
32+
import pandas.core.common as com
33+
from pandas.core.indexers import check_bool_array_indexer
3234

3335
if TYPE_CHECKING:
3436
from pandas._typing import Scalar
@@ -307,11 +309,22 @@ def _from_factorized(cls, values, original: "BooleanArray"):
307309
def _formatter(self, boxed=False):
308310
return str
309311

312+
@property
313+
def _hasna(self) -> bool:
314+
# Note: this is expensive right now! The hope is that we can
315+
# make this faster by having an optional mask, but not have to change
316+
# source code using it.
317+
return self._mask.any()
318+
310319
def __getitem__(self, item):
311320
if is_integer(item):
312321
if self._mask[item]:
313322
return self.dtype.na_value
314323
return self._data[item]
324+
325+
elif com.is_bool_indexer(item):
326+
item = check_bool_array_indexer(self, item)
327+
315328
return type(self)(self._data[item], self._mask[item])
316329

317330
def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
@@ -329,7 +342,7 @@ def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
329342
if dtype is None:
330343
dtype = object
331344
if is_bool_dtype(dtype):
332-
if not self.isna().any():
345+
if not self._hasna:
333346
return self._data
334347
else:
335348
raise ValueError(
@@ -503,7 +516,7 @@ def astype(self, dtype, copy=True):
503516

504517
if is_bool_dtype(dtype):
505518
# astype_nansafe converts np.nan to True
506-
if self.isna().any():
519+
if self._hasna:
507520
raise ValueError("cannot convert float NaN to bool")
508521
else:
509522
return self._data.astype(dtype, copy=copy)
@@ -515,7 +528,7 @@ def astype(self, dtype, copy=True):
515528
)
516529
# for integer, error if there are missing values
517530
if is_integer_dtype(dtype):
518-
if self.isna().any():
531+
if self._hasna:
519532
raise ValueError("cannot convert NA to integer")
520533
# for float dtype, ensure we use np.nan before casting (numpy cannot
521534
# deal with pd.NA)

pandas/core/arrays/categorical.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
5050
import pandas.core.common as com
5151
from pandas.core.construction import array, extract_array, sanitize_array
52+
from pandas.core.indexers import check_bool_array_indexer
5253
from pandas.core.missing import interpolate_2d
5354
from pandas.core.ops.common import unpack_zerodim_and_defer
5455
from pandas.core.sorting import nargsort
@@ -1996,10 +1997,13 @@ def __getitem__(self, key):
19961997
return np.nan
19971998
else:
19981999
return self.categories[i]
1999-
else:
2000-
return self._constructor(
2001-
values=self._codes[key], dtype=self.dtype, fastpath=True
2002-
)
2000+
2001+
elif com.is_bool_indexer(key):
2002+
key = check_bool_array_indexer(self, key)
2003+
2004+
return self._constructor(
2005+
values=self._codes[key], dtype=self.dtype, fastpath=True
2006+
)
20032007

20042008
def __setitem__(self, key, value):
20052009
"""

pandas/core/arrays/datetimelike.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from pandas.core import missing, nanops
4141
from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
4242
import pandas.core.common as com
43+
from pandas.core.indexers import check_bool_array_indexer
4344
from pandas.core.ops.common import unpack_zerodim_and_defer
4445
from pandas.core.ops.invalid import make_invalid_op
4546

@@ -436,7 +437,7 @@ def __getitem__(self, key):
436437
return type(self)(val, dtype=self.dtype)
437438

438439
if com.is_bool_indexer(key):
439-
key = np.asarray(key, dtype=bool)
440+
key = check_bool_array_indexer(self, key)
440441
if key.all():
441442
key = slice(0, None, None)
442443
else:

pandas/core/arrays/integer.py

+6
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from pandas.core import nanops, ops
2727
from pandas.core.algorithms import take
2828
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
29+
import pandas.core.common as com
30+
from pandas.core.indexers import check_bool_array_indexer
2931
from pandas.core.ops import invalid_comparison
3032
from pandas.core.ops.common import unpack_zerodim_and_defer
3133
from pandas.core.tools.numeric import to_numeric
@@ -368,6 +370,10 @@ def __getitem__(self, item):
368370
if self._mask[item]:
369371
return self.dtype.na_value
370372
return self._data[item]
373+
374+
elif com.is_bool_indexer(item):
375+
item = check_bool_array_indexer(self, item)
376+
371377
return type(self)(self._data[item], self._mask[item])
372378

373379
def _coerce_to_ndarray(self, dtype=None, na_value=lib._no_default):

pandas/core/arrays/numpy_.py

+5
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
from pandas import compat
1818
from pandas.core import nanops
1919
from pandas.core.algorithms import searchsorted, take, unique
20+
import pandas.core.common as com
2021
from pandas.core.construction import extract_array
22+
from pandas.core.indexers import check_bool_array_indexer
2123
from pandas.core.missing import backfill_1d, pad_1d
2224

2325
from .base import ExtensionArray, ExtensionOpsMixin
@@ -234,6 +236,9 @@ def __getitem__(self, item):
234236
if isinstance(item, type(self)):
235237
item = item._ndarray
236238

239+
elif com.is_bool_indexer(item):
240+
item = check_bool_array_indexer(self, item)
241+
237242
result = self._ndarray[item]
238243
if not lib.is_scalar(item):
239244
result = type(self)(result)

pandas/core/arrays/sparse/array.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,9 @@ def value_counts(self, dropna=True):
738738
# --------
739739

740740
def __getitem__(self, key):
741+
# avoid mypy issues when importing at the top-level
742+
from pandas.core.indexing import check_bool_indexer
743+
741744
if isinstance(key, tuple):
742745
if len(key) > 1:
743746
raise IndexError("too many indices for array.")
@@ -766,7 +769,9 @@ def __getitem__(self, key):
766769
else:
767770
key = np.asarray(key)
768771

769-
if com.is_bool_indexer(key) and len(self) == len(key):
772+
if com.is_bool_indexer(key):
773+
key = check_bool_indexer(self, key)
774+
770775
return self.take(np.arange(len(key), dtype=np.int32)[key])
771776
elif hasattr(key, "__len__"):
772777
return self.take(key)

pandas/core/common.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -111,14 +111,20 @@ def is_bool_indexer(key: Any) -> bool:
111111
Returns
112112
-------
113113
bool
114+
Whether `key` is a valid boolean indexer.
114115
115116
Raises
116117
------
117118
ValueError
118119
When the array is an object-dtype ndarray or ExtensionArray
119120
and contains missing values.
121+
122+
See Also
123+
--------
124+
check_bool_array_indexer : Check that `key`
125+
is a valid mask for an array, and convert to an ndarray.
120126
"""
121-
na_msg = "cannot index with vector containing NA / NaN values"
127+
na_msg = "cannot mask with array containing NA / NaN values"
122128
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
123129
is_array_like(key) and is_extension_array_dtype(key.dtype)
124130
):

pandas/core/indexers.py

+67
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
"""
44
import numpy as np
55

6+
from pandas._typing import AnyArrayLike
7+
68
from pandas.core.dtypes.common import is_list_like
79
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
810

@@ -240,3 +242,68 @@ def length_of_indexer(indexer, target=None) -> int:
240242
elif not is_list_like_indexer(indexer):
241243
return 1
242244
raise AssertionError("cannot find the length of the indexer")
245+
246+
247+
def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray:
248+
"""
249+
Check if `mask` is a valid boolean indexer for `array`.
250+
251+
`array` and `mask` are checked to have the same length, and the
252+
dtype is validated.
253+
254+
.. versionadded:: 1.0.0
255+
256+
Parameters
257+
----------
258+
array : array
259+
The array that's being masked.
260+
mask : array
261+
The boolean array that's masking.
262+
263+
Returns
264+
-------
265+
numpy.ndarray
266+
The validated boolean mask.
267+
268+
Raises
269+
------
270+
IndexError
271+
When the lengths don't match.
272+
ValueError
273+
When `mask` cannot be converted to a bool-dtype ndarray.
274+
275+
See Also
276+
--------
277+
api.extensions.is_bool_indexer : Check if `key` is a boolean indexer.
278+
279+
Examples
280+
--------
281+
A boolean ndarray is returned when the arguments are all valid.
282+
283+
>>> mask = pd.array([True, False])
284+
>>> arr = pd.Series([1, 2])
285+
>>> pd.api.indexers.check_bool_array_indexer(arr, mask)
286+
array([ True, False])
287+
288+
An IndexError is raised when the lengths don't match.
289+
290+
>>> mask = pd.array([True, False, True])
291+
>>> pd.api.indexers.check_bool_array_indexer(arr, mask)
292+
Traceback (most recent call last):
293+
...
294+
IndexError: Item wrong length 3 instead of 2.
295+
296+
A ValueError is raised when the mask cannot be converted to
297+
a bool-dtype ndarray.
298+
299+
>>> mask = pd.array([True, pd.NA])
300+
>>> pd.api.indexers.check_bool_array_indexer(arr, mask)
301+
Traceback (most recent call last):
302+
...
303+
ValueError: cannot convert to bool numpy array in presence of missing values
304+
"""
305+
result = np.asarray(mask, dtype=bool)
306+
# GH26658
307+
if len(result) != len(array):
308+
raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
309+
return result

pandas/core/indexing.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,11 @@
2222
from pandas.core.dtypes.missing import _infer_fill_value, isna
2323

2424
import pandas.core.common as com
25-
from pandas.core.indexers import is_list_like_indexer, length_of_indexer
25+
from pandas.core.indexers import (
26+
check_bool_array_indexer,
27+
is_list_like_indexer,
28+
length_of_indexer,
29+
)
2630
from pandas.core.indexes.api import Index, InvalidIndexError
2731

2832

@@ -2309,13 +2313,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
23092313
else:
23102314
if is_sparse(result):
23112315
result = result.to_dense()
2312-
result = np.asarray(result, dtype=bool)
2313-
2314-
# GH26658
2315-
if len(result) != len(index):
2316-
raise IndexError(
2317-
f"Item wrong length {len(result)} instead of {len(index)}."
2318-
)
2316+
result = check_bool_array_indexer(index, result)
23192317

23202318
return result
23212319

0 commit comments

Comments
 (0)