diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 266e5a015e408..30dbcfdffd61c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,6 @@ repos: hooks: - id: codespell types_or: [python, rst, markdown] - files: ^(pandas|doc)/ - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.2.0 hooks: diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 141142c2b3d97..20c0c0ea2f6fe 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -37,7 +37,7 @@ def setup(self): self.dict_list = frame.to_dict(orient="records") self.data2 = {i: {j: float(j) for j in range(100)} for i in range(2000)} - # arrays which we wont consolidate + # arrays which we won't consolidate self.dict_of_categoricals = {i: Categorical(np.arange(N)) for i in range(K)} def time_list_of_dict(self): @@ -60,7 +60,7 @@ def time_nested_dict_int64(self): DataFrame(self.data2) def time_dict_of_categoricals(self): - # dict of arrays that we wont consolidate + # dict of arrays that we won't consolidate DataFrame(self.dict_of_categoricals) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c8e15f2645e56..2de1f25fceace 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -527,7 +527,7 @@ def time_dtype_as_field(self, dtype, method, application, ncols): class GroupByCythonAgg: """ - Benchmarks specifically targetting our cython aggregation algorithms + Benchmarks specifically targeting our cython aggregation algorithms (using a big enough dataframe with simple key, so a large part of the time is actually spent in the grouped aggregation). """ diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py index 4e3f938a33eb1..f041499c9c622 100644 --- a/asv_bench/benchmarks/libs.py +++ b/asv_bench/benchmarks/libs.py @@ -2,7 +2,7 @@ Benchmarks for code in pandas/_libs, excluding pandas/_libs/tslibs, which has its own directory. -If a PR does not edit anything in _libs/, then it is unlikely that thes +If a PR does not edit anything in _libs/, then it is unlikely that the benchmarks will be affected. """ import numpy as np diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index c4c50f5ca8eb5..8d4fc0240f2cc 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -50,7 +50,7 @@ def time_replace_list(self, inplace): self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace) def time_replace_list_one_match(self, inplace): - # the 1 can be held in self._df.blocks[0], while the inf and -inf cant + # the 1 can be held in self._df.blocks[0], while the inf and -inf can't self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index b42729476c818..89c627865049e 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -78,7 +78,7 @@ def time_stack(self, dtype): self.df.stack() def time_unstack_fast(self, dtype): - # last level -> doesnt have to make copies + # last level -> doesn't have to make copies self.ser.unstack("bar") def time_unstack_slow(self, dtype): diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 6cb1011e3c037..af10102749627 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -1,6 +1,6 @@ """ -Period benchmarks that rely only on tslibs. See benchmarks.period for -Period benchmarks that rely on other parts fo pandas. +Period benchmarks that rely only on tslibs. See benchmarks.period for +Period benchmarks that rely on other parts of pandas. """ import numpy as np diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py index 6ed273281569b..2daf1861eb80a 100644 --- a/asv_bench/benchmarks/tslibs/timedelta.py +++ b/asv_bench/benchmarks/tslibs/timedelta.py @@ -1,6 +1,6 @@ """ -Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for -Timedelta benchmarks that rely on other parts fo pandas. +Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for +Timedelta benchmarks that rely on other parts of pandas. """ import datetime diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 0b2341bef413e..58187b3052819 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1762,7 +1762,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the above case the text is blue because the selector `#T_b_ .cls-1` is worth 110 (ID plus class), which takes precendence." + "In the above case the text is blue because the selector `#T_b_ .cls-1` is worth 110 (ID plus class), which takes precedence." ] }, { diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7f5fe85e07f40..db785bd962f96 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1011,7 +1011,7 @@ cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike): elif numeric_t is int64_t and is_datetimelike: na_val = NPY_NAT else: - # Will not be used, but define to avoid unitialized warning. + # Will not be used, but define to avoid uninitialized warning. na_val = 0 return na_val diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index cb2de79cd8b26..f843f6ccdfc58 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -349,7 +349,7 @@ cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns) if reso == NPY_DATETIMEUNIT.NPY_FR_ps: # pico is the smallest unit for which we don't overflow, so - # we exclude fempto and atto + # we exclude femto and atto day_units = 24 * 3600 * 1_000_000_000_000 elif reso == NPY_DATETIMEUNIT.NPY_FR_ns: day_units = 24 * 3600 * 1_000_000_000 diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8e459e8f2670d..028371633a2c1 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -202,7 +202,7 @@ def ints_to_pytimedelta(ndarray m8values, box=False): elif reso == NPY_DATETIMEUNIT.NPY_FR_W: res_val = timedelta(weeks=value) else: - # Month, Year, NPY_FR_GENERIC, pico, fempto, atto + # Month, Year, NPY_FR_GENERIC, pico, femto, atto raise NotImplementedError(reso) # Note: we can index result directly instead of using PyArray_MultiIter_DATA diff --git a/pandas/io/common.py b/pandas/io/common.py index bf1355d769758..5aecc55bb363a 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -590,7 +590,6 @@ def check_parent_directory(path: Path | str) -> None: ---------- path: Path or str Path to check parent directory of - """ parent = Path(path).parent if not parent.is_dir(): diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c98aedc9a2cf0..4c34b0c0aec0a 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -610,7 +610,7 @@ def test_to_datetime_YYYYMMDD(self): actual = to_datetime("20080115") assert actual == datetime(2008, 1, 15) - def test_to_datetime_unparseable_ignore(self): + def test_to_datetime_unparsable_ignore(self): # unparsable ser = "Month 1, 1999" assert to_datetime(ser, errors="ignore") == ser diff --git a/setup.cfg b/setup.cfg index fdd6bdd9d579f..d3c4fe0cb35ce 100644 --- a/setup.cfg +++ b/setup.cfg @@ -160,7 +160,7 @@ exclude = [codespell] ignore-words-list = ba,blocs,coo,hist,nd,sav,ser -ignore-regex = https://(\w+\.)+ +ignore-regex = https://([\w/\.])+ [coverage:run] branch = True diff --git a/versioneer.py b/versioneer.py index 68c9bb161f206..c98dbd83271d7 100644 --- a/versioneer.py +++ b/versioneer.py @@ -691,7 +691,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: - # unparseable. Maybe git-describe is misbehaving? + # unparsable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces @@ -1105,7 +1105,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # TAG-NUM-gHEX mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: - # unparseable. Maybe git-describe is misbehaving? + # unparsable. Maybe git-describe is misbehaving? pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces diff --git a/web/pandas/community/blog/2019-user-survey.md b/web/pandas/community/blog/2019-user-survey.md index 73c426e7cbec9..312ee49bdf387 100644 --- a/web/pandas/community/blog/2019-user-survey.md +++ b/web/pandas/community/blog/2019-user-survey.md @@ -26,11 +26,11 @@ This analysis and the raw data can be found [on GitHub](https://github.com/panda [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/pandas-dev/pandas-user-surveys/master?filepath=2019.ipynb) -We had about 1250 repsonses over the 15 days we ran the survey in the summer of 2019. +We had about 1250 responses over the 15 days we ran the survey in the summer of 2019. ## About the Respondents -There was a fair amount of representation across pandas experience and frequeny of use, though the majority of respondents are on the more experienced side. +There was a fair amount of representation across pandas experience and frequency of use, though the majority of respondents are on the more experienced side. @@ -101,7 +101,7 @@ CSV and Excel are (for better or worse) the most popular formats. ![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_18_0.png) -In preperation for a possible refactor of pandas internals, we wanted to get a sense for +In preparation for a possible refactor of pandas internals, we wanted to get a sense for how common wide (100s of columns or more) DataFrames are. @@ -109,7 +109,7 @@ how common wide (100s of columns or more) DataFrames are. ![png]({{ base_url }}/static/img/blog/2019-user-survey/2019_20_0.png) -Pandas is slowly growing new exentension types. Categoricals are the most popular, +Pandas is slowly growing new extension types. Categoricals are the most popular, and the nullable integer type is already almost as popular as datetime with timezone. @@ -139,7 +139,7 @@ Of these, the clear standout is "scaling" to large datasets. A couple observatio 1. Perhaps pandas' documentation should do a better job of promoting libraries that provide scalable dataframes (like [Dask](https://dask.org), [vaex](https://dask.org), and [modin](https://modin.readthedocs.io/en/latest/)) 2. Memory efficiency (perhaps from a native string data type, fewer internal copies, etc.) is a valuable goal. -After that, the next-most critical improvement is integer missing values. Those were actually added in [Pandas 0.24](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#optional-integer-na-support), but they're not the default, and there's still some incompatibilites with the rest of pandas API. +After that, the next-most critical improvement is integer missing values. Those were actually added in [Pandas 0.24](https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html#optional-integer-na-support), but they're not the default, and there's still some incompatibilities with the rest of pandas API. Pandas is a less conservative library than, say, NumPy. We're approaching 1.0, but on the way we've made many deprecations and some outright API breaking changes. Fortunately, most people are OK with the tradeoff. diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 3e35b3ac4ea30..1d77c596c1eb0 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -177,7 +177,7 @@ users to view, manipulate and edit pandas `Index`, `Series`, and `DataFrame` objects like a "spreadsheet", including copying and modifying values, sorting, displaying a "heatmap", converting data types and more. Pandas objects can also be renamed, duplicated, new -columns added, copyed/pasted to/from the clipboard (as TSV), and +columns added, copied/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. Spyder can also import data from a variety of plain text and binary files or the clipboard into a new pandas DataFrame via a sophisticated import wizard. @@ -379,8 +379,8 @@ A directory of projects providing `extension accessors `. This is for users to discover new accessors and for library authors to coordinate on the namespace. - | Library | Accessor | Classes | - | ---------------------------------------------------------------------|------------|-----------------------| + | Library | Accessor | Classes | + | -------------------------------------------------------------------- | ---------- | --------------------- | | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` | | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | | [pandas-genomics](https://pandas-genomics.readthedocs.io/en/latest/) | `genomics` | `Series`, `DataFrame` |