Fix typos (pandas-dev#47275)

janosh · MarcoGorelli · web-flow · commit aa85f02acb7b · 2022-06-08T18:02:17.000+01:00
* fix codespell violations

* drop rule files: ^(pandas|doc)/ from codespell pre-commit config

also fix typos fempto -> femto

* improve codespell ignore-regex in setup.cfg

Co-authored-by: Marco Edward Gorelli <marcogorelli@protonmail.com>

Co-authored-by: Marco Edward Gorelli <marcogorelli@protonmail.com>
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -26,7 +26,6 @@ repos:
     hooks:
     -   id: codespell
         types_or: [python, rst, markdown]
-        files: ^(pandas|doc)/
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.2.0
     hooks:
diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py
@@ -37,7 +37,7 @@ def setup(self):
         self.dict_list = frame.to_dict(orient="records")
         self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)}
 
-        # arrays which we wont consolidate
+        # arrays which we won't consolidate
         self.dict_of_categoricals = {i: Categorical(np.arange(N)) for i in range(K)}
 
     def time_list_of_dict(self):
@@ -60,7 +60,7 @@ def time_nested_dict_int64(self):
         DataFrame(self.data2)
 
     def time_dict_of_categoricals(self):
-        # dict of arrays that we wont consolidate
+        # dict of arrays that we won't consolidate
         DataFrame(self.dict_of_categoricals)
 
 
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
@@ -527,7 +527,7 @@ def time_dtype_as_field(self, dtype, method, application, ncols):
 
 class GroupByCythonAgg:
     """
-    Benchmarks specifically targetting our cython aggregation algorithms
+    Benchmarks specifically targeting our cython aggregation algorithms
     (using a big enough dataframe with simple key, so a large part of the
     time is actually spent in the grouped aggregation).
     """
diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py
@@ -2,7 +2,7 @@
 Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs,
 which has its own directory.
 
-If a PR does not edit anything in _libs/, then it is unlikely that thes
+If a PR does not edit anything in _libs/, then it is unlikely that the
 benchmarks will be affected.
 """
 import numpy as np
diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py
@@ -50,7 +50,7 @@ def time_replace_list(self, inplace):
         self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace)
 
     def time_replace_list_one_match(self, inplace):
-        # the 1 can be held in self._df.blocks[0], while the inf and -inf cant
+        # the 1 can be held in self._df.blocks[0], while the inf and -inf can't
         self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace)
 
 
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
@@ -78,7 +78,7 @@ def time_stack(self, dtype):
         self.df.stack()
 
     def time_unstack_fast(self, dtype):
-        # last level -> doesnt have to make copies
+        # last level -> doesn't have to make copies
         self.ser.unstack("bar")
 
     def time_unstack_slow(self, dtype):
diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py
@@ -1,6 +1,6 @@
 """
-Period benchmarks that rely only on tslibs.  See benchmarks.period for
-Period benchmarks that rely on other parts fo pandas.
+Period benchmarks that rely only on tslibs. See benchmarks.period for
+Period benchmarks that rely on other parts of pandas.
 """
 
 import numpy as np
diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py
@@ -1,6 +1,6 @@
 """
-Timedelta benchmarks that rely only on tslibs.  See benchmarks.timedeltas for
-Timedelta benchmarks that rely on other parts fo pandas.
+Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for
+Timedelta benchmarks that rely on other parts of pandas.
 """
 import datetime
 
diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
@@ -1762,7 +1762,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In the above case the text is blue because the selector `#T_b_ .cls-1` is worth 110 (ID plus class), which takes precendence."
+    "In the above case the text is blue because the selector `#T_b_ .cls-1` is worth 110 (ID plus class), which takes precedence."
    ]
   },
   {
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -1011,7 +1011,7 @@ cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike):
     elif numeric_t is int64_t and is_datetimelike:
         na_val = NPY_NAT
     else:
-        # Will not be used, but define to avoid unitialized warning.
+        # Will not be used, but define to avoid uninitialized warning.
         na_val = 0
     return na_val
 
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
@@ -349,7 +349,7 @@ cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns)
 
     if reso == NPY_DATETIMEUNIT.NPY_FR_ps:
         # pico is the smallest unit for which we don't overflow, so
-        #  we exclude fempto and atto
+        #  we exclude femto and atto
         day_units = 24 * 3600 * 1_000_000_000_000
     elif reso == NPY_DATETIMEUNIT.NPY_FR_ns:
         day_units = 24 * 3600 * 1_000_000_000
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
@@ -202,7 +202,7 @@ def ints_to_pytimedelta(ndarray m8values, box=False):
             elif reso == NPY_DATETIMEUNIT.NPY_FR_W:
                 res_val = timedelta(weeks=value)
             else:
-                # Month, Year, NPY_FR_GENERIC, pico, fempto, atto
+                # Month, Year, NPY_FR_GENERIC, pico, femto, atto
                 raise NotImplementedError(reso)
 
         # Note: we can index result directly instead of using PyArray_MultiIter_DATA
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -590,7 +590,6 @@ def check_parent_directory(path: Path | str) -> None:
     ----------
     path: Path or str
         Path to check parent directory of
-
     """
     parent = Path(path).parent
     if not parent.is_dir():
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
@@ -610,7 +610,7 @@ def test_to_datetime_YYYYMMDD(self):
         actual = to_datetime("20080115")
         assert actual == datetime(2008, 1, 15)
 
-    def test_to_datetime_unparseable_ignore(self):
+    def test_to_datetime_unparsable_ignore(self):
         # unparsable
         ser = "Month 1, 1999"
         assert to_datetime(ser, errors="ignore") == ser
diff --git a/setup.cfg b/setup.cfg
@@ -160,7 +160,7 @@ exclude =
 
 [codespell]
 ignore-words-list = ba,blocs,coo,hist,nd,sav,ser
-ignore-regex = https://(\w+\.)+
+ignore-regex = https://([\w/\.])+
 
 [coverage:run]
 branch = True
diff --git a/versioneer.py b/versioneer.py
@@ -691,7 +691,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
         # TAG-NUM-gHEX
         mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
         if not mo:
-            # unparseable. Maybe git-describe is misbehaving?
+            # unparsable. Maybe git-describe is misbehaving?
             pieces["error"] = ("unable to parse git-describe output: '%%s'"
                                %% describe_out)
             return pieces
@@ -1105,7 +1105,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
         # TAG-NUM-gHEX
         mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
         if not mo:
-            # unparseable. Maybe git-describe is misbehaving?
+            # unparsable. Maybe git-describe is misbehaving?
             pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
             return pieces
 
diff --git a/web/pandas/community/blog/2019-user-survey.md b/web/pandas/community/blog/2019-user-survey.md
@@ -26,11 +26,11 @@ This analysis and the raw data can be found [on GitHub](https://github.com/panda
 [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pandas-dev/pandas-user-surveys/master?filepath=2019.ipynb)
 
 
-We had about 1250 repsonses over the 15 days we ran the survey in the summer of 2019.
+We had about 1250 responses over the 15 days we ran the survey in the summer of 2019.
 
 ## About the Respondents
 
-There was a fair amount of representation across pandas experience and frequeny of use, though the majority of respondents are on the more experienced side.
+There was a fair amount of representation across pandas experience and frequency of use, though the majority of respondents are on the more experienced side.
 
 
 
@@ -101,15 +101,15 @@ CSV and Excel are (for better or worse) the most popular formats.
 ![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_18_0.png)
 
 
-In preperation for a possible refactor of pandas internals, we wanted to get a sense for
+In preparation for a possible refactor of pandas internals, we wanted to get a sense for
 how common wide (100s of columns or more) DataFrames are.
 
 
 
 ![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_20_0.png)
 
 
-Pandas is slowly growing new exentension types. Categoricals are the most popular,
+Pandas is slowly growing new extension types. Categoricals are the most popular,
 and the nullable integer type is already almost as popular as datetime with timezone.
 
 
@@ -139,7 +139,7 @@ Of these, the clear standout is "scaling" to large datasets. A couple observatio
 1. Perhaps pandas' documentation should do a better job of promoting libraries that provide scalable dataframes (like [Dask](https://dask.org), [vaex](https://dask.org), and [modin](https://modin.readthedocs.io/en/latest/))
 2. Memory efficiency (perhaps from a native string data type, fewer internal copies, etc.) is a valuable goal.
 
-After that, the next-most critical improvement is integer missing values. Those were actually added in [Pandas 0.24](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#optional-integer-na-support), but they're not the default, and there's still some incompatibilites with the rest of pandas API.
+After that, the next-most critical improvement is integer missing values. Those were actually added in [Pandas 0.24](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#optional-integer-na-support), but they're not the default, and there's still some incompatibilities with the rest of pandas API.
 
 Pandas is a less conservative library than, say, NumPy. We're approaching 1.0, but on the way we've made many deprecations and some outright API breaking changes. Fortunately, most people are OK with the tradeoff.
 
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
@@ -177,7 +177,7 @@ users to view, manipulate and edit pandas `Index`, `Series`, and
 `DataFrame` objects like a "spreadsheet", including copying and
 modifying values, sorting, displaying a "heatmap", converting data
 types and more. Pandas objects can also be renamed, duplicated, new
-columns added, copyed/pasted to/from the clipboard (as TSV), and
+columns added, copied/pasted to/from the clipboard (as TSV), and
 saved/loaded to/from a file. Spyder can also import data from a variety
 of plain text and binary files or the clipboard into a new pandas
 DataFrame via a sophisticated import wizard.
@@ -379,8 +379,8 @@ A directory of projects providing
 `extension accessors <extending.register-accessors>`. This is for users to discover new accessors and for library
 authors to coordinate on the namespace.
 
-  | Library                                                              | Accessor   |  Classes              |
-  | ---------------------------------------------------------------------|------------|-----------------------|
+  | Library                                                              | Accessor   | Classes               |
+  | -------------------------------------------------------------------- | ---------- | --------------------- |
   | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest)          | `ip`       | `Series`              |
   | [pdvega](https://altair-viz.github.io/pdvega/)                       | `vgplot`   | `Series`, `DataFrame` |
   | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` |

Original file line number	Diff line number	Diff line change
`@@ -1762,7 +1762,7 @@`
`1762`	`1762`	`"cell_type": "markdown",`
`1763`	`1763`	`"metadata": {},`
`1764`	`1764`	`"source": [`
`1765`		- "In the above case the text is blue because the selector `#T_b_ .cls-1` is worth 110 (ID plus class), which takes precendence."
	`1765`	+ "In the above case the text is blue because the selector `#T_b_ .cls-1` is worth 110 (ID plus class), which takes precedence."
`1766`	`1766`	`]`
`1767`	`1767`	`},`
`1768`	`1768`	`{`