-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
/
Copy pathvalidate_rst_title_capitalization.py
executable file
·206 lines (164 loc) · 5.36 KB
/
validate_rst_title_capitalization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/usr/bin/env python
"""
Validate that the titles in the rst files follow the proper capitalization convention.
Print the titles that do not follow the convention.
Usage::
./scripts/validate_rst_title_capitalization.py doc/source/development/contributing.rst
./scripts/validate_rst_title_capitalization.py doc/source/
"""
import argparse
import sys
import re
import os
from typing import Tuple, Generator, List
import glob
# Words that must keep this exact spelling wherever they appear in a heading
# (kept in case-insensitive alphabetical order to make additions easy to review).
CAPITALIZATION_EXCEPTIONS = {
    "Apache",
    "Arrow",
    "BigQuery",
    "C",
    "DataFrame",
    "Docker",
    "Excel",
    "Git",
    "GitHub",
    "HTML",
    "Index",
    "Interval",
    "IPython",
    "JSON",
    "MultiIndex",
    "NumFOCUS",
    "NumPy",
    "pandas",
    "Parquet",
    "PEP8",
    "Period",
    "PyTables",
    "Python",
    "SAS",
    "Series",
    "sklearn",
    "SQL",
    "STATA",
}

# Lookup table from the lower-cased form of each exception to its canonical
# spelling, so a word can be matched regardless of how it was typed.
CAP_EXCEPTIONS_DICT = {
    exception.lower(): exception for exception in CAPITALIZATION_EXCEPTIONS
}

# Prefix of the message printed for every badly capitalized heading.
err_msg = (
    "Heading capitalization formatted incorrectly. "
    "Please correctly capitalize"
)

# Characters that may form the underline of an RST heading.
symbols = tuple('*=-^~#"')
def correct_title_capitalization(title: str) -> str:
    """
    Algorithm to create the correct capitalization for a given title.

    Leading non-word characters are stripped, the first character is
    upper-cased and the rest lower-cased, and then every word found in
    ``CAP_EXCEPTIONS_DICT`` is restored to its canonical spelling. Text inside
    a ``<http://...>`` / ``<https://...>`` link target is never re-capitalized.

    Parameters
    ----------
    title : str
        Heading string to correct.

    Returns
    -------
    str
        Correctly capitalized heading.
    """
    # Strip all non-word characters from the beginning of the title to the
    # first word character.
    correct_title: str = re.sub(r"^\W*", "", title).capitalize()

    def restore_exception(match):
        # Swap a word for its canonical spelling when it is a known exception.
        word = match.group(0)
        return CAP_EXCEPTIONS_DICT.get(word.lower(), word)

    # Split the title around the URL. Because the pattern is a capturing
    # group, re.split keeps the URL itself at the odd indices while the text
    # outside the URL sits at the even indices.
    segments = re.split(r"(<https?:\/\/.*[\r\n]*>)", correct_title)

    # Only rewrite the text outside the URL: words in a URL must stay
    # lowercase, even if they are a capitalization exception. Substituting on
    # the whole title would also rewrite a URL word that happens to occur in
    # the surrounding text.
    segments[::2] = [
        re.sub(r"\w+", restore_exception, text) for text in segments[::2]
    ]
    return "".join(segments)
def find_titles(rst_file: str) -> Generator[Tuple[str, int], None, None]:
    """
    Algorithm to identify particular text that should be considered headings in an
    RST file.

    A line is treated as a heading when the line right after it consists of a
    single repeated character from ``symbols`` and has exactly the same length.

    See <https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html> for details
    on what constitutes a string as a heading in RST.

    Parameters
    ----------
    rst_file : str
        RST file to scan through for headings.

    Yields
    ------
    title : str
        A heading found in the rst file, with backticks, asterisks and
        underscores removed.
    line_number : int
        The corresponding line number of the heading.
    """
    with open(rst_file, "r", encoding="utf-8") as fd:
        previous_line = ""
        for i, line in enumerate(fd):
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would eat a real character when the final line
            # of the file has no trailing newline, hiding a heading there.
            line = line.rstrip("\n")
            line_chars = set(line)
            if (
                len(line_chars) == 1
                and line_chars.pop() in symbols
                and len(line) == len(previous_line)
            ):
                # ``i`` is the 0-based index of the underline, which equals the
                # 1-based line number of the heading text just above it.
                yield re.sub(r"[`\*_]", "", previous_line), i
            previous_line = line
def find_rst_files(source_paths: List[str]) -> Generator[str, None, None]:
    """
    Expand the paths given on the command line into individual .rst file paths.

    A path ending in ``.rst`` is yielded unchanged; any other existing path is
    searched recursively for ``*.rst`` files.

    Parameters
    ----------
    source_paths : list of str
        Files and/or directories to validate, provided through command line
        arguments.

    Yields
    ------
    str
        Path of a .rst file found for the given command line arguments.

    Raises
    ------
    ValueError
        If one of the given paths does not exist.
    """
    for path in source_paths:
        if not os.path.exists(path):
            raise ValueError(
                "Please enter a valid path, pointing to a valid file/directory."
            )

        if path.endswith(".rst"):
            yield path
            continue

        # Anything else is searched recursively for .rst files.
        pattern = f"{path}/**/*.rst"
        yield from glob.glob(pathname=pattern, recursive=True)
def main(source_paths: List[str], output_format: str) -> int:
    """
    The main method to print all headings with incorrect capitalization.

    Parameters
    ----------
    source_paths : list of str
        Files and/or directories to validate, provided through command line
        arguments.
    output_format : str
        Output format of the script. Accepted for interface compatibility;
        the printed message currently uses a fixed layout and does not read
        this value.

    Returns
    -------
    int
        Number of incorrect headings found overall.
    """
    number_of_errors: int = 0

    for filename in find_rst_files(source_paths):
        for title, line_number in find_titles(filename):
            # Compute the expected heading once and reuse it for both the
            # comparison and the message.
            corrected = correct_title_capitalization(title)
            if title != corrected:
                print(
                    f'{filename}:{line_number}:{err_msg} "{title}" to "{corrected}" '
                )
                number_of_errors += 1

    return number_of_errors
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Validate heading capitalization")

    # NOTE: with ``nargs="+"`` argparse requires at least one path, so the
    # ``default`` given here never takes effect.
    parser.add_argument(
        "paths", nargs="+", default=".", help="Source paths of file/directory to check."
    )

    parser.add_argument(
        "--format",
        "-f",
        default="{source_path}:{line_number}:{msg}:{heading}:{correct_heading}",
        help="Output format of incorrectly capitalized titles",
    )

    args = parser.parse_args()

    # The exit status is the number of bad headings, capped at 255. POSIX keeps
    # only the low 8 bits of an exit status, so an uncapped count of 256 (or
    # any multiple of it) would be reported to the caller as success.
    sys.exit(min(main(args.paths, args.format), 255))