Skip to content

Commit fb04a42

Browse files
authored
STYLE use pd_array in core (#40319)
1 parent ae8849c commit fb04a42

File tree

5 files changed

+113
-2
lines changed

5 files changed

+113
-2
lines changed

.pre-commit-config.yaml

+7
Original file line numberDiff line numberDiff line change
@@ -212,3 +212,10 @@ repos:
212212
|\#\ type:\s?ignore(?!\[)
213213
language: pygrep
214214
types: [python]
215+
- id: use-pd_array-in-core
216+
name: Import pandas.array as pd_array in core
217+
language: python
218+
entry: python scripts/use_pd_array_in_core.py
219+
files: ^pandas/core/
220+
exclude: ^pandas/core/api\.py$
221+
types: [python]

pandas/core/strings/accessor.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -3023,7 +3023,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
30233023
"""
30243024
from pandas import (
30253025
DataFrame,
3026-
array,
3026+
array as pd_array,
30273027
)
30283028

30293029
regex = re.compile(pat, flags=flags)
@@ -3034,7 +3034,7 @@ def _str_extract_noexpand(arr, pat, flags=0):
30343034
result = np.array([groups_or_na(val)[0] for val in arr], dtype=object)
30353035
name = _get_single_group_name(regex)
30363036
# not dispatching, so we have to reconstruct here.
3037-
result = array(result, dtype=result_dtype)
3037+
result = pd_array(result, dtype=result_dtype)
30383038
else:
30393039
if isinstance(arr, ABCIndex):
30403040
raise ValueError("only one regex group is supported with Index")
+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import pytest
2+
3+
from scripts.use_pd_array_in_core import use_pd_array
4+
5+
BAD_FILE_0 = "import pandas as pd\npd.array"
6+
BAD_FILE_1 = "\nfrom pandas import array"
7+
GOOD_FILE_0 = "from pandas import array as pd_array"
8+
GOOD_FILE_1 = "from pandas.core.construction import array as pd_array"
9+
PATH = "t.py"
10+
11+
12+
@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1])
def test_inconsistent_usage(content, capsys):
    """Un-aliased use of ``pandas.array`` is reported on stdout and exits."""
    with pytest.raises(SystemExit):
        use_pd_array(content, PATH)

    # Only stdout carries the report; stderr is ignored.
    captured_out = capsys.readouterr()[0]
    wanted = (
        "t.py:2:0: Don't use pd.array in core, import array as pd_array instead\n"
    )
    assert wanted == captured_out
21+
22+
23+
@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1])
def test_consistent_usage(content):
    """Sources that import ``array`` under the ``pd_array`` alias pass."""
    # The check returns None on success; reaching the assert means that
    # no SystemExit was raised.
    assert use_pd_array(content, PATH) is None

scripts/use_pd_array_in_core.py

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Check that pandas/core imports pandas.array as pd_array.
3+
4+
This makes it easier to grep for usage of pandas array.
5+
6+
This is meant to be run as a pre-commit hook - to run it manually, you can do:
7+
8+
pre-commit run use-pd_array-in-core --all-files
9+
10+
"""
11+
12+
import argparse
13+
import ast
14+
import sys
15+
from typing import (
16+
Optional,
17+
Sequence,
18+
)
19+
20+
ERROR_MESSAGE = (
21+
"{path}:{lineno}:{col_offset}: "
22+
"Don't use pd.array in core, import array as pd_array instead\n"
23+
)
24+
25+
26+
class Visitor(ast.NodeVisitor):
    """
    AST visitor that rejects un-aliased uses of ``pandas.array``.

    Two patterns are reported:

    * ``from pandas[.submodule] import array`` where ``array`` is not
      aliased as ``pd_array``;
    * attribute access of the form ``pd.array``.

    On the first offending node the formatted ``ERROR_MESSAGE`` is written
    to stdout and the process exits with status 1.

    Parameters
    ----------
    path : str
        Path of the file being checked; only used in the error message.
    """

    def __init__(self, path: str) -> None:
        self.path = path

    def _fail(self, lineno: int, col_offset: int) -> None:
        """Write the violation message for the given position and exit(1)."""
        msg = ERROR_MESSAGE.format(
            path=self.path, lineno=lineno, col_offset=col_offset
        )
        sys.stdout.write(msg)
        sys.exit(1)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """
        Reject ``from pandas... import array`` without the ``pd_array`` alias.

        Raises
        ------
        SystemExit
            If ``array`` is imported from pandas under any other name.
        """
        module = node.module
        # Match the pandas package itself or one of its submodules only.
        # A bare ``startswith("pandas")`` would also catch unrelated
        # packages such as ``pandas_gbq`` whose names share the prefix.
        from_pandas = module is not None and (
            module == "pandas" or module.startswith("pandas.")
        )
        if from_pandas and any(
            alias.name == "array" and alias.asname != "pd_array"
            for alias in node.names
        ):
            self._fail(node.lineno, node.col_offset)
        super().generic_visit(node)

    def visit_Attribute(self, node: ast.Attribute) -> None:
        """
        Reject attribute access spelled ``pd.array``.

        Raises
        ------
        SystemExit
            If the node is the attribute ``array`` on the name ``pd``.
        """
        if (
            isinstance(node.value, ast.Name)
            and node.value.id == "pd"
            and node.attr == "array"
        ):
            self._fail(node.lineno, node.col_offset)
        super().generic_visit(node)
57+
58+
59+
def use_pd_array(content: str, path: str) -> None:
    """
    Check Python source text for un-aliased uses of ``pandas.array``.

    Parameters
    ----------
    content : str
        Source code to parse and inspect.
    path : str
        File name handed to the visitor for use in its error message.
    """
    # Parse first so that a syntax error surfaces before any visiting starts.
    parsed = ast.parse(content)
    Visitor(path).visit(parsed)
63+
64+
65+
def main(argv: Optional[Sequence[str]] = None) -> None:
    """
    Command-line entry point: check every file path given as an argument.

    Parameters
    ----------
    argv : sequence of str, optional
        Argument list passed to ``argparse``; ``None`` makes ``argparse``
        fall back to ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("paths", nargs="*")
    file_paths = cli.parse_args(argv).paths

    for path in file_paths:
        with open(path, encoding="utf-8") as source_file:
            source_text = source_file.read()
        use_pd_array(source_text, path)
74+
75+
76+
if __name__ == "__main__":
77+
main()

setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ omit =
140140
pandas/_typing.py
141141
pandas/_version.py
142142
plugins = Cython.Coverage
143+
source = pandas
143144

144145
[coverage:report]
145146
ignore_errors = False

0 commit comments

Comments
 (0)