diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 0d9e3ade08846..0000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,22 +0,0 @@ -version: 2.1 - -jobs: - test-arm: - machine: - image: ubuntu-2004:202101-01 - resource_class: arm.medium - environment: - ENV_FILE: ci/deps/circle-38-arm64.yaml - PYTEST_WORKERS: auto - PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" - PYTEST_TARGET: "pandas" - PANDAS_CI: "1" - steps: - - checkout - - run: .circleci/setup_env.sh - - run: PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH ci/run_tests.sh - -workflows: - test: - jobs: - - test-arm diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh deleted file mode 100755 index c03a7ff4be8b3..0000000000000 --- a/.circleci/setup_env.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/bin/bash -e - -# edit the locale file if needed -if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then - echo "Adding locale to the first line of pandas/__init__.py" - rm -f pandas/__init__.pyc - SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n" - sed -i "$SEDC" pandas/__init__.py - - echo "[head -4 pandas/__init__.py]" - head -4 pandas/__init__.py - echo -fi - - -MINICONDA_DIR=/usr/local/miniconda -if [ -e $MINICONDA_DIR ] && [ "$BITS32" != yes ]; then - echo "Found Miniconda installation at $MINICONDA_DIR" -else - echo "Install Miniconda" - DEFAULT_CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest" - if [[ "$(uname -m)" == 'aarch64' ]]; then - CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.10.1-4/Miniforge3-4.10.1-4-Linux-aarch64.sh" - elif [[ "$(uname)" == 'Linux' ]]; then - if [[ "$BITS32" == "yes" ]]; then - CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86.sh" - else - CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86_64.sh" - fi - elif [[ "$(uname)" == 'Darwin' ]]; then - CONDA_URL="$DEFAULT_CONDA_URL-MacOSX-x86_64.sh" - else - echo "OS $(uname) not supported" - exit 1 - fi - echo "Downloading $CONDA_URL" - wget -q $CONDA_URL -O miniconda.sh - chmod +x miniconda.sh - - MINICONDA_DIR="$HOME/miniconda3" - rm -rf $MINICONDA_DIR - ./miniconda.sh -b -p $MINICONDA_DIR -fi -export PATH=$MINICONDA_DIR/bin:$PATH - -echo -echo "which conda" -which conda - -echo -echo "update conda" -conda config --set ssl_verify false -conda config --set quiet true --set always_yes true --set changeps1 false -conda install -y -c conda-forge -n base 'mamba>=0.21.2' pip setuptools - -echo "conda info -a" -conda info -a - -echo "conda list (root environment)" -conda list - -echo -# Clean up any left-over from a previous build -mamba env remove -n pandas-dev -echo "mamba env update --file=${ENV_FILE}" -# See https://github.com/mamba-org/mamba/issues/633 -mamba create -q -n pandas-dev -time mamba env update -n pandas-dev --file="${ENV_FILE}" - -echo "conda list -n pandas-dev" -conda list -n pandas-dev - -if [[ "$BITS32" == "yes" ]]; then - # activate 32-bit compiler - export CONDA_BUILD=1 -fi - -echo "activate pandas-dev" -source activate pandas-dev - -# Explicitly set an environment variable indicating that this is pandas' CI environment. -# -# This allows us to enable things like -Werror that shouldn't be activated in -# downstream CI jobs that may also build pandas from source. 
-export PANDAS_CI=1 - -if pip list | grep -q ^pandas; then - echo - echo "remove any installed pandas package w/o removing anything else" - pip uninstall -y pandas || true -fi - -if [ "$(conda list -f qt --json)" != [] ]; then - echo - echo "remove qt" - echo "causes problems with the clipboard, we use xsel for that" - conda remove qt -y --force || true -fi - -echo "Build extensions" -python setup.py build_ext -q -j3 - -echo "Install pandas" -python -m pip install --no-build-isolation --no-use-pep517 -e . - -echo "done" diff --git a/.devcontainer.json b/.devcontainer.json index 8bea96aea29c1..54ddfa1a130f8 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -8,9 +8,7 @@ // Use 'settings' to set *default* container specific settings.json values on container create. // You can edit these settings after create using File > Preferences > Settings > Remote. "settings": { - "terminal.integrated.shell.linux": "/bin/bash", - "python.condaPath": "/opt/conda/bin/conda", - "python.pythonPath": "/opt/conda/bin/python", + "python.pythonPath": "/usr/local/bin/python", "python.formatting.provider": "black", "python.linting.enabled": true, "python.linting.flake8Enabled": true, diff --git a/.gitattributes b/.gitattributes index 736fa09d070fe..d94c19e7edb1f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -14,3 +14,75 @@ *.xls binary *.xlsx binary pandas/_version.py export-subst + + +*.bz2 export-ignore +*.csv export-ignore +*.data export-ignore +*.dta export-ignore +*.feather export-ignore +*.tar export-ignore +*.gz export-ignore +*.h5 export-ignore +*.html export-ignore +*.json export-ignore +*.jsonl export-ignore +*.kml export-ignore +*.msgpack export-ignore +*.pdf export-ignore +*.parquet export-ignore +*.pickle export-ignore +*.pkl export-ignore +*.png export-ignore +*.pptx export-ignore +*.ods export-ignore +*.odt export-ignore +*.orc export-ignore +*.sas7bdat export-ignore +*.sav export-ignore +*.so export-ignore +*.txt export-ignore +*.xls export-ignore +*.xlsb export-ignore +*.xlsm export-ignore +*.xlsx export-ignore +*.xpt export-ignore +*.cpt export-ignore +*.xml export-ignore +*.xsl export-ignore +*.xz export-ignore +*.zip export-ignore +*.zst export-ignore +*~ export-ignore +.DS_Store export-ignore +.git* export-ignore + +*.py[ocd] export-ignore +*.pxi export-ignore + +# Ignoring stuff from the top level +.github export-ignore +asv_bench export-ignore +ci export-ignore +doc export-ignore +gitpod export-ignore +MANIFEST.in export-ignore +scripts/** export-ignore +typings export-ignore +web export-ignore +CITATION.cff export-ignore +codecov.yml export-ignore +Dockerfile export-ignore +environment.yml export-ignore +setup.py export-ignore + + +# GH 39321 +# csv_dir_path fixture checks the existence of the directory +# exclude the whole directory to avoid running related tests in sdist +pandas/tests/io/parser/data export-ignore + +# Include cibw script in sdist since it's needed for building wheels +scripts/cibw_before_build.sh -export-ignore +scripts/cibw_before_build_windows.sh -export-ignore +scripts/cibw_before_test_windows.sh -export-ignore diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000000..39f52bb3edd8e --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,20 @@ +# github +.github/ @mroeschke + +# ci +ci/ @mroeschke + +# docs +doc/cheatsheet @Dr-Irv +doc/source/development @noatamir + +# pandas +pandas/_libs/ @WillAyd +pandas/_libs/tslibs/* @MarcoGorelli +pandas/_typing.py @Dr-Irv +pandas/core/groupby/* @rhshadrach 
+pandas/core/tools/datetimes.py @MarcoGorelli +pandas/io/excel/* @rhshadrach +pandas/io/formats/style.py @attack68 +pandas/io/formats/style_render.py @attack68 +pandas/io/formats/templates @attack68 diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml index 36bc8dcf02bae..4e1bc8f61d04e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -17,7 +17,9 @@ body: [latest version](https://pandas.pydata.org/docs/whatsnew/index.html) of pandas. required: true - label: > - I have confirmed this bug exists on the main branch of pandas. + I have confirmed this bug exists on the + [main branch](https://pandas.pydata.org/docs/dev/getting_started/install.html#installing-the-development-version-of-pandas) + of pandas. - type: textarea id: example attributes: diff --git a/.github/ISSUE_TEMPLATE/feature_request.yaml b/.github/ISSUE_TEMPLATE/feature_request.yaml index f837eb1ca5bb7..6e6cd78ace11d 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yaml +++ b/.github/ISSUE_TEMPLATE/feature_request.yaml @@ -68,5 +68,5 @@ body: attributes: label: Additional Context description: > - Please provide any relevant Github issues, code examples or references that help describe and support + Please provide any relevant GitHub issues, code examples or references that help describe and support the feature request. diff --git a/.github/ISSUE_TEMPLATE/pdep_vote.yaml b/.github/ISSUE_TEMPLATE/pdep_vote.yaml new file mode 100644 index 0000000000000..6dcbd76eb0f74 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/pdep_vote.yaml @@ -0,0 +1,74 @@ +name: PDEP Vote +description: Call for a vote on a PDEP +title: "VOTE: " +labels: [Vote] + +body: + - type: markdown + attributes: + value: > + As per [PDEP-1](https://pandas.pydata.org/pdeps/0001-purpose-and-guidelines.html), the following issue template should be used when a + maintainer has opened a PDEP discussion and is ready to call for a vote. + - type: checkboxes + attributes: + label: Locked issue + options: + - label: > + I locked this voting issue so that only voting members are able to cast their votes or + comment on this issue. + required: true + - type: input + id: PDEP-name + attributes: + label: PDEP number and title + placeholder: > + PDEP-1: Purpose and guidelines + validations: + required: true + - type: input + id: PDEP-link + attributes: + label: Pull request with discussion + description: e.g. https://github.com/pandas-dev/pandas/pull/47444 + validations: + required: true + - type: input + id: PDEP-rendered-link + attributes: + label: Rendered PDEP for easy reading + description: e.g. https://github.com/pandas-dev/pandas/pull/47444/files?short_path=7c449e6#diff-7c449e698132205b235c501f7e47ebba38da4d2b7f9492c98f16745dba787041 + validations: + required: true + - type: input + id: PDEP-number-of-discussion-participants + attributes: + label: Discussion participants + description: > + You may find it useful to list or total the number of participating members in the + PDEP discussion PR. This would be the maximum possible disapprove votes. + placeholder: > + 14 voting members participated in the PR discussion thus far. + - type: input + id: PDEP-vote-end + attributes: + label: Voting will close in 15 days. + description: The voting period end date. ('Voting will close in 15 days.' will be automatically written) + - type: markdown + attributes: + value: --- + - type: textarea + id: Vote + attributes: + label: Vote + value: | + Cast your vote in a comment below. + * +1: approve. 
+ * 0: abstain. + * Reason: A one sentence reason is required. + * -1: disapprove + * Reason: A one sentence reason is required. + A disapprove vote requires prior participation in the linked discussion PR. + + @pandas-dev/pandas-core + validations: + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 876e5e2cfbb1e..8eca91c692710 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,4 @@ -- [ ] closes #xxxx (Replace xxxx with the Github issue number) +- [ ] closes #xxxx (Replace xxxx with the GitHub issue number) - [ ] [Tests added and passed](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#writing-tests) if fixing a bug or adding a new feature - [ ] All [code checks passed](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#pre-commit). - [ ] Added [type annotations](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#type-hints) to new arguments/methods/functions. diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 23bb988ef4d73..b92bacd1a537c 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -1,5 +1,9 @@ name: Build pandas description: Rebuilds the C extensions and installs pandas +inputs: + editable: + description: Whether to build pandas in editable mode (default true) + default: true runs: using: composite steps: @@ -8,15 +12,23 @@ runs: run: | micromamba info micromamba list + pip list --pre + shell: bash -el {0} + + - name: Uninstall existing Pandas installation + run: | + if pip show pandas 1>/dev/null; then + pip uninstall -y pandas + fi shell: bash -el {0} - name: Build Pandas run: | - python setup.py build_ext -j $N_JOBS - python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index + if [[ ${{ inputs.editable }} == "true" ]]; then + pip install -e . --no-build-isolation -v --no-deps \ + -Csetup-args="--werror" + else + pip install . --no-build-isolation -v --no-deps \ + -Csetup-args="--werror" + fi shell: bash -el {0} - env: - # Cannot use parallel compilation on Windows, see https://github.com/pandas-dev/pandas/issues/30873 - # GH 47305: Parallel build causes flaky ImportError: /home/runner/work/pandas/pandas/pandas/_libs/tslibs/timestamps.cpython-38-x86_64-linux-gnu.so: undefined symbol: pandas_datetime_to_datetimestruct - N_JOBS: 1 - #N_JOBS: ${{ runner.os == 'Windows' && 1 || 2 }} diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index 2a7601f196ec4..b60245d20e8e4 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -7,19 +7,14 @@ runs: shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: Test results path: test-data.xml if: failure() - - name: Report Coverage - run: coverage report -m - shell: bash -el {0} - if: failure() - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v5 with: flags: unittests name: codecov-pandas diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 002d0020c2df1..a09ac1a4e5ffb 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -3,34 +3,20 @@ inputs: environment-file: description: Conda environment file to use. 
default: environment.yml - environment-name: - description: Name to use for the Conda environment - default: test - extra-specs: - description: Extra packages to install - required: false - pyarrow-version: - description: If set, overrides the PyArrow version in the Conda environment to the given string. - required: false runs: using: composite steps: - - name: Set Arrow version in ${{ inputs.environment-file }} to ${{ inputs.pyarrow-version }} - run: | - grep -q ' - pyarrow' ${{ inputs.environment-file }} - sed -i"" -e "s/ - pyarrow/ - pyarrow=${{ inputs.pyarrow-version }}/" ${{ inputs.environment-file }} - cat ${{ inputs.environment-file }} - shell: bash - if: ${{ inputs.pyarrow-version }} - - name: Install ${{ inputs.environment-file }} - uses: mamba-org/provision-with-micromamba@v12 + uses: mamba-org/setup-micromamba@v2 with: environment-file: ${{ inputs.environment-file }} - environment-name: ${{ inputs.environment-name }} - extra-specs: ${{ inputs.extra-specs }} - channels: conda-forge - channel-priority: ${{ runner.os == 'macOS' && 'flexible' || 'strict' }} - condarc-file: ci/condarc.yml - cache-env: true + environment-name: test + condarc-file: ci/.condarc + cache-environment: true cache-downloads: true + + - name: Uninstall pyarrow + if: ${{ env.REMOVE_PYARROW == '1' }} + run: | + micromamba remove -y pyarrow + shell: bash -el {0} diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000000..784206dfe67ff --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,9 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + labels: + - "CI" + - "Dependencies" diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml deleted file mode 100644 index e091160c952f8..0000000000000 --- a/.github/workflows/32-bit-linux.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: 32 Bit Linux - -on: - push: - branches: - - main - - 1.4.x - pull_request: - branches: - - main - - 1.4.x - paths-ignore: - - "doc/**" - -permissions: - contents: read - -jobs: - pytest: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Run 32-bit manylinux2014 Docker Build / Tests - run: | - # Without this (line 34), versioneer will not be able to determine the pandas version. - # This is because of a security update to git that blocks it from reading the config folder if - # it is not owned by the current user. We hit this since the "mounted" folder is not hit by the - # Docker container. - # xref https://github.com/pypa/manylinux/issues/1309 - docker pull quay.io/pypa/manylinux2014_i686 - docker run --platform linux/386 -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ - /bin/bash -xc "cd pandas && \ - git config --global --add safe.directory /pandas && \ - /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ - . ~/virtualenvs/pandas-dev/bin/activate && \ - python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ - pip install cython numpy python-dateutil pytz pytest pytest-xdist pytest-asyncio>=0.17 hypothesis && \ - python setup.py build_ext -q -j2 && \ - python -m pip install --no-build-isolation --no-use-pep517 -e . 
&& \ - export PANDAS_CI=1 && \ - pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml" - - - name: Publish test results for Python 3.8-32 bit full Linux - uses: actions/upload-artifact@v3 - with: - name: Test results - path: test-data.xml - if: failure() diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml deleted file mode 100644 index b7bb8db549f86..0000000000000 --- a/.github/workflows/assign.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: Assign -on: - issue_comment: - types: created - -permissions: - contents: read - -jobs: - issue_assign: - permissions: - issues: write - pull-requests: write - runs-on: ubuntu-latest - steps: - - if: github.event.comment.body == 'take' - run: | - echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" - curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees diff --git a/.github/workflows/autoupdate-pre-commit-config.yml b/.github/workflows/autoupdate-pre-commit-config.yml deleted file mode 100644 index 9a41871c26062..0000000000000 --- a/.github/workflows/autoupdate-pre-commit-config.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: "Update pre-commit config" - -on: - schedule: - - cron: "0 7 1 * *" # At 07:00 on 1st of every month. - workflow_dispatch: - -permissions: - contents: read - -jobs: - update-pre-commit: - permissions: - contents: write # for technote-space/create-pr-action to push code - pull-requests: write # for technote-space/create-pr-action to create a PR - if: github.repository_owner == 'pandas-dev' - name: Autoupdate pre-commit config - runs-on: ubuntu-latest - steps: - - name: Set up Python - uses: actions/setup-python@v3 - - name: Cache multiple paths - uses: actions/cache@v3 - with: - path: | - ~/.cache/pre-commit - ~/.cache/pip - key: pre-commit-autoupdate-${{ runner.os }}-build - - name: Update pre-commit config packages - uses: technote-space/create-pr-action@v2 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - EXECUTE_COMMANDS: | - pip install pre-commit - pre-commit autoupdate || (exit 0); - pre-commit run -a || (exit 0); - COMMIT_MESSAGE: "⬆️ UPGRADE: Autoupdate pre-commit config" - PR_BRANCH_NAME: "pre-commit-config-update-${PR_ID}" - PR_TITLE: "⬆️ UPGRADE: Autoupdate pre-commit config" diff --git a/.github/workflows/broken-linkcheck.yml b/.github/workflows/broken-linkcheck.yml new file mode 100644 index 0000000000000..191252cccf1c3 --- /dev/null +++ b/.github/workflows/broken-linkcheck.yml @@ -0,0 +1,39 @@ +name: Linkcheck +on: + schedule: + # Run monthly on the 1st day of the month + - cron: '0 0 1 * *' + pull_request: + paths: + - ".github/workflows/broken-linkcheck.yml" + - "doc/make.py" +jobs: + linkcheck: + if: false + runs-on: ubuntu-latest + defaults: + run: + shell: bash -el {0} + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Run linkcheck script + working-directory: ./doc + run: | + set -o pipefail + python make.py linkcheck | tee linkcheck.txt + + - name: Display broken links + if: failure() + working-directory: ./doc + run: grep broken linkcheck.txt diff --git a/.github/workflows/cache-cleanup-daily.yml b/.github/workflows/cache-cleanup-daily.yml new file mode 100644 
index 0000000000000..8eadfb2ccd2a9 --- /dev/null +++ b/.github/workflows/cache-cleanup-daily.yml @@ -0,0 +1,32 @@ +name: Purge caches daily +on: + schedule: + # 4:10 UTC daily + - cron: "10 4 * * *" + +jobs: + cleanup: + runs-on: ubuntu-latest + if: github.repository_owner == 'pandas-dev' + permissions: + actions: write + steps: + - name: Clean Cache + run: | + gh extension install actions/gh-actions-cache + + REPO=${{ github.repository }} + + echo "Fetching list of cache keys" + allCaches=$(gh actions-cache list -L 100 -R $REPO | cut -f 1 ) + + ## Use set +e so that a failed deletion does not fail the whole workflow. + set +e + echo "Deleting caches..." + for cacheKey in $allCaches + do + gh actions-cache delete $cacheKey -R $REPO --confirm + done + echo "Done" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/cache-cleanup.yml b/.github/workflows/cache-cleanup.yml new file mode 100644 index 0000000000000..099974141c1d1 --- /dev/null +++ b/.github/workflows/cache-cleanup.yml @@ -0,0 +1,30 @@ +name: Clean closed branch caches +on: + pull_request: + types: + - closed + +jobs: + cleanup: + runs-on: ubuntu-latest + steps: + - name: Clean Cache + run: | + gh extension install actions/gh-actions-cache + + REPO=${{ github.repository }} + BRANCH="refs/pull/${{ github.event.pull_request.number }}/merge" + + echo "Fetching list of cache keys" + cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH | cut -f 1 ) + + ## Use set +e so that a failed deletion does not fail the whole workflow. + set +e + echo "Deleting caches..." + for cacheKey in $cacheKeysForPR + do + gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm + done + echo "Done" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 09c603f347d4c..e1d2d1ea846b8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 1.4.x + - 2.3.x pull_request: branches: - main - - 1.4.x + - 2.3.x env: ENV_FILE: environment.yml @@ -17,29 +17,11 @@ permissions: contents: read +# pre-commit run by https://pre-commit.ci/ jobs: - pre_commit: - name: pre-commit - runs-on: ubuntu-latest - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-pre-commit - cancel-in-progress: true - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Install Python - uses: actions/setup-python@v3 - with: - python-version: '3.9.7' - - - name: Run pre-commit - uses: pre-commit/action@v2.0.3 - - typing_and_docstring_validation: - name: Docstring and typing validation - runs-on: ubuntu-latest + docstring_typing_manual_hooks: + name: Docstring validation, typing, and other manual pre-commit hooks + runs-on: ubuntu-22.04 defaults: run: shell: bash -el {0} @@ -51,7 +33,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -61,8 +43,30 @@ jobs: - name: Build Pandas id: build uses: ./.github/actions/build_pandas + with: + editable: false # The following checks are independent of each other and should still be run if one fails + + # TODO: The doctests have to be run first right now, since the Cython doctests only work + # with pandas installed in non-editable mode + # This can be removed once pytest-cython doesn't require C extensions to be installed inplace + + - name: Extra installs + #
https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + + - name: Run doctests + run: cd ci && ./code_checks.sh doctests + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Install pandas in editable mode + id: build-editable + if: ${{ steps.build.outcome == 'success' && always() }} + uses: ./.github/actions/build_pandas + with: + editable: true + - name: Check for no warnings when building single-page docs run: ci/code_checks.sh single-docs if: ${{ steps.build.outcome == 'success' && always() }} @@ -71,14 +75,14 @@ jobs: run: ci/code_checks.sh code if: ${{ steps.build.outcome == 'success' && always() }} - - name: Run doctests - run: ci/code_checks.sh doctests - if: ${{ steps.build.outcome == 'success' && always() }} - - name: Run docstring validation run: ci/code_checks.sh docstrings if: ${{ steps.build.outcome == 'success' && always() }} + - name: Run check of documentation notebooks + run: ci/code_checks.sh notebooks + if: ${{ steps.build.outcome == 'success' && always() }} + - name: Use existing environment for type checking run: | echo $PATH >> $GITHUB_PATH @@ -87,9 +91,9 @@ jobs: if: ${{ steps.build.outcome == 'success' && always() }} - name: Typing - uses: pre-commit/action@v2.0.3 + uses: pre-commit/action@v3.0.1 with: - extra_args: --hook-stage manual --all-files + extra_args: --verbose --hook-stage manual --all-files if: ${{ steps.build.outcome == 'success' && always() }} - name: Run docstring validation script tests @@ -98,7 +102,7 @@ jobs: asv-benchmarks: name: ASV Benchmarks - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 defaults: run: shell: bash -el {0} @@ -110,7 +114,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -125,11 +129,11 @@ jobs: run: | cd asv_bench asv machine --yes - asv run --quick --dry-run --strict --durations=30 --python=same + asv run --quick --dry-run --durations=30 --python=same --show-stderr build_docker_dev_environment: name: Build Docker Dev Environment - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 defaults: run: shell: bash -el {0} @@ -144,16 +148,19 @@ jobs: run: docker image prune -f - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Build image run: docker build --pull --no-cache --tag pandas-dev-env . 
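+ # Smoke-test the built image: the step below imports pandas inside the container and prints version details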
+ - name: Show environment + run: docker run --rm pandas-dev-env python -c "import pandas as pd; print(pd.show_versions())" + requirements-dev-text-installable: name: Test install requirements-dev.txt - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 @@ -162,15 +169,15 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 - name: Setup Python id: setup_python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: - python-version: '3.8' + python-version: '3.10' cache: 'pip' cache-dependency-path: 'requirements-dev.txt' diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 457aa69fb924f..4d0066bc0b48d 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -8,13 +8,17 @@ concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} cancel-in-progress: true +permissions: + contents: read + jobs: analyze: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: actions: read contents: read security-events: write + if: github.repository_owner == 'pandas-dev' strategy: fail-fast: false @@ -23,9 +27,9 @@ jobs: - python steps: - - uses: actions/checkout@v3 - - uses: github/codeql-action/init@v2 + - uses: actions/checkout@v4 + - uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} - - uses: github/codeql-action/autobuild@v2 - - uses: github/codeql-action/analyze@v2 + - uses: github/codeql-action/autobuild@v3 + - uses: github/codeql-action/analyze@v3 diff --git a/.github/workflows/asv-bot.yml b/.github/workflows/comment-commands.yml similarity index 59% rename from .github/workflows/asv-bot.yml rename to .github/workflows/comment-commands.yml index abb19a95315b6..62956f5825782 100644 --- a/.github/workflows/asv-bot.yml +++ b/.github/workflows/comment-commands.yml @@ -1,30 +1,43 @@ -name: "ASV Bot" - +name: Comment Commands on: - issue_comment: # Pull requests are issues - types: - - created - -env: - ENV_FILE: environment.yml - COMMENT: ${{github.event.comment.body}} + issue_comment: + types: created permissions: contents: read + issues: write + pull-requests: write jobs: - autotune: - permissions: - contents: read - issues: write - pull-requests: write - name: "Run benchmarks" + issue_assign: + runs-on: ubuntu-22.04 + if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' + concurrency: + group: ${{ github.actor }}-issue-assign + steps: + - run: | + echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + preview_docs: + runs-on: ubuntu-22.04 + if: github.event.issue.pull_request && github.event.comment.body == '/preview' + concurrency: + group: ${{ github.actor }}-preview-docs + steps: + - uses: pandas-dev/github-doc-previewer@v0.3.2 + with: + previewer-server: "https://pandas.pydata.org/preview" + artifact-job: "Doc Build and Upload" + asv_run: + runs-on: ubuntu-22.04 # TODO: Support more benchmarking options later, against different branches, against self, etc - if: startsWith(github.event.comment.body, '@github-actions benchmark') - runs-on: ubuntu-latest + if: github.event.issue.pull_request && startsWith(github.event.comment.body, 
'@github-actions benchmark') defaults: run: shell: bash -el {0} + env: + ENV_FILE: environment.yml + COMMENT: ${{github.event.comment.body}} concurrency: # Set concurrency to prevent abuse(full runs are ~5.5 hours !!!) @@ -36,7 +49,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -47,7 +60,7 @@ jobs: - name: Run benchmarks id: bench - continue-on-error: true # This is a fake failure, asv will exit code 1 for regressions + continue-on-error: true # asv will exit code 1 for regressions run: | # extracting the regex, see https://stackoverflow.com/a/36798723 REGEX=$(echo "$COMMENT" | sed -n "s/^.*-b\s*\(\S*\).*$/\1/p") @@ -62,7 +75,7 @@ jobs: echo 'EOF' >> $GITHUB_ENV echo "REGEX=$REGEX" >> $GITHUB_ENV - - uses: actions/github-script@v6 + - uses: actions/github-script@v7 env: BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} REGEX: ${{env.REGEX}} diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml new file mode 100644 index 0000000000000..3d4cab7be09c5 --- /dev/null +++ b/.github/workflows/deprecation-tracking-bot.yml @@ -0,0 +1,65 @@ +# This bot updates the issue with number DEPRECATION_TRACKER_ISSUE +# with the PR number that issued the deprecation. + +# It runs on commits to main, and will trigger if the PR linked to a merged commit has the "Deprecate" label +name: Deprecations Bot + +on: + push: + branches: + - main + + +permissions: + contents: read + +jobs: + deprecation_update: + permissions: + issues: write + runs-on: ubuntu-22.04 + env: + DEPRECATION_TRACKER_ISSUE: 56596 + steps: + - uses: actions/github-script@v7 + id: update-deprecation-issue + with: + script: | + body = await github.rest.issues.get({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: ${{ env.DEPRECATION_TRACKER_ISSUE }}, + }) + body = body["data"]["body"]; + linkedPRs = await github.rest.repos.listPullRequestsAssociatedWithCommit({ + owner: context.repo.owner, + repo: context.repo.repo, + commit_sha: '${{ github.sha }}' + }) + linkedPRs = linkedPRs["data"]; + console.log(linkedPRs); + if (linkedPRs.length > 0) { + console.log("Found linked PR"); + linkedPR = linkedPRs[0] + isDeprecation = false + for (label of linkedPR["labels"]) { + if (label["name"] == "Deprecate") { + isDeprecation = true; + break; + } + } + + PR_NUMBER = linkedPR["number"]; + + body += ("\n- [ ] #" + PR_NUMBER); + if (isDeprecation) { + console.log("PR is a deprecation PR. 
Printing new body of issue"); + console.log(body); + github.rest.issues.update({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: ${{ env.DEPRECATION_TRACKER_ISSUE }}, + body: body + }) + } + } diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 626bf7828e032..294334ca1d54b 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,15 +4,18 @@ on: push: branches: - main - - 1.4.x + - 2.3.x + tags: + - '*' pull_request: branches: - main - - 1.4.x + - 2.3.x env: ENV_FILE: environment.yml PANDAS_CI: 1 + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} permissions: contents: read @@ -20,7 +23,7 @@ permissions: jobs: web_and_docs: name: Doc Build and Upload - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 @@ -33,7 +36,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -43,6 +46,13 @@ jobs: - name: Build Pandas uses: ./.github/actions/build_pandas + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 + + - name: Test website + run: python -m pytest web/ + - name: Build website run: python web/pandas_web.py web/pandas --target-path=web/build @@ -50,34 +60,40 @@ jobs: run: doc/make.py --warnings-are-errors - name: Build the interactive terminal - run: | - cd web/interactive_terminal - jupyter lite build + working-directory: web/interactive_terminal + run: jupyter lite build + + - name: Build documentation zip + run: doc/make.py zip_html - name: Install ssh key run: | mkdir -m 700 -p ~/.ssh echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa - echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts - if: github.event_name == 'push' && github.ref == 'refs/heads/main' + echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBFjYkJBk7sos+r7yATODogQc3jUdW1aascGpyOD4bohj8dWjzwLJv/OJ/fyOQ5lmj81WKDk67tGtqNJYGL9acII=" > ~/.ssh/known_hosts + if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) - name: Copy cheatsheets into site directory run: cp doc/cheatsheet/Pandas_Cheat_Sheet* web/build/ - name: Upload web - run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas + run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='benchmarks' web/build/ web@${{ secrets.server_ip }}:/var/www/html if: github.event_name == 'push' && github.ref == 'refs/heads/main' - name: Upload dev docs - run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev + run: rsync -az --delete doc/build/html/ web@${{ secrets.server_ip }}:/var/www/html/pandas-docs/dev if: github.event_name == 'push' && github.ref == 'refs/heads/main' + - name: Upload prod docs + run: rsync -az --delete doc/build/html/ web@${{ secrets.server_ip }}:/var/www/html/pandas-docs/version/${GITHUB_REF_NAME:1} + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') + - name: Move docs into site 
directory run: mv doc/build/html web/build/docs - name: Save website as an artifact - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: website path: web/build diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml deleted file mode 100644 index e9503a2486560..0000000000000 --- a/.github/workflows/macos-windows.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: Windows-MacOS - -on: - push: - branches: - - main - - 1.4.x - pull_request: - branches: - - main - - 1.4.x - paths-ignore: - - "doc/**" - -env: - PANDAS_CI: 1 - PYTEST_TARGET: pandas - PATTERN: "not slow and not db and not network and not single_cpu" - - -permissions: - contents: read - -jobs: - pytest: - defaults: - run: - shell: bash -el {0} - timeout-minutes: 120 - strategy: - matrix: - os: [macos-latest, windows-latest] - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] - fail-fast: false - runs-on: ${{ matrix.os }} - name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} - cancel-in-progress: true - env: - # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: ci/deps/${{ matrix.env_file }} - pyarrow-version: ${{ matrix.os == 'macos-latest' && '6' || '' }} - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Test - uses: ./.github/actions/run-tests diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml new file mode 100644 index 0000000000000..331af6e05b650 --- /dev/null +++ b/.github/workflows/package-checks.yml @@ -0,0 +1,81 @@ +name: Package Checks + +on: + push: + branches: + - main + - 2.3.x + pull_request: + branches: + - main + - 2.3.x + types: [ labeled, opened, synchronize, reopened ] + +permissions: + contents: read + +defaults: + run: + shell: bash -el {0} + +jobs: + pip: + if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} + runs-on: ubuntu-22.04 + strategy: + matrix: + extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "all"] + fail-fast: false + name: Install Extras - ${{ matrix.extra }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-pip-extras-${{ matrix.extra }} + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup Python + id: setup_python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Pip install with extra + run: | + python -m pip install .[${{ matrix.extra }}] -v + shell: bash -el {0} + conda_forge_recipe: + if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.10', '3.11'] + fail-fast: 
false + name: Test Conda Forge Recipe - Python ${{ matrix.python-version }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-conda-forge-recipe-${{ matrix.python-version }} + cancel-in-progress: true + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: mamba-org/setup-micromamba@v2 + with: + environment-name: recipe-test + create-args: >- + python=${{ matrix.python-version }} + boa + conda-verify + cache-downloads: true + cache-environment: true + + - name: Build conda package + run: conda mambabuild ci --no-anaconda-upload --verify --strict-verify --output --output-folder . diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml deleted file mode 100644 index d93b92a9662ec..0000000000000 --- a/.github/workflows/python-dev.yml +++ /dev/null @@ -1,79 +0,0 @@ -# This file is purposely frozen(does not run). DO NOT DELETE IT -# Unfreeze(by commentingthe if: false() condition) once the -# next Python Dev version has released beta 1 and both Cython and numpy support it -# After that Python has released, migrate the workflows to the -# posix GHA workflows and "freeze" this file by -# uncommenting the if: false() condition -# Feel free to modify this comment as necessary. - -name: Python Dev - -on: - push: - branches: - - main - - 1.4.x - pull_request: - branches: - - main - - 1.4.x - paths-ignore: - - "doc/**" - -env: - PYTEST_WORKERS: "auto" - PANDAS_CI: 1 - PATTERN: "not slow and not network and not clipboard and not single_cpu" - COVERAGE: true - PYTEST_TARGET: pandas - -permissions: - contents: read - -jobs: - build: - if: false # Comment this line out to "unfreeze" - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-latest, macOS-latest, windows-latest] - - name: actions-311-dev - timeout-minutes: 80 - - concurrency: - #https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev - cancel-in-progress: true - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Python Dev Version - uses: actions/setup-python@v3 - with: - python-version: '3.11-dev' - - - name: Install dependencies - shell: bash -el {0} - run: | - python3 -m pip install --upgrade pip setuptools wheel - python3 -m pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy - python3 -m pip install git+https://github.com/nedbat/coveragepy.git - python3 -m pip install cython python-dateutil pytz hypothesis pytest>=6.2.5 pytest-xdist pytest-cov pytest-asyncio>=0.17 - python3 -m pip list - - - name: Build Pandas - run: | - python3 setup.py build_ext -q -j2 - python3 -m pip install -e . 
--no-build-isolation --no-use-pep517 - - - name: Build Version - run: | - python3 -c "import pandas; pandas.show_versions();" - - - name: Test - uses: ./.github/actions/run-tests diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml deleted file mode 100644 index 1a06ea31ccbb8..0000000000000 --- a/.github/workflows/sdist.yml +++ /dev/null @@ -1,93 +0,0 @@ -name: sdist - -on: - push: - branches: - - main - - 1.4.x - pull_request: - branches: - - main - - 1.4.x - types: [labeled, opened, synchronize, reopened] - paths-ignore: - - "doc/**" - -permissions: - contents: read - -jobs: - build: - if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} - runs-on: ubuntu-latest - timeout-minutes: 60 - defaults: - run: - shell: bash -el {0} - - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9", "3.10"] - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{matrix.python-version}}-sdist - cancel-in-progress: true - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip setuptools wheel - - # GH 39416 - pip install numpy - - - name: Build pandas sdist - run: | - pip list - python setup.py sdist --formats=gztar - - - name: Upload sdist artifact - uses: actions/upload-artifact@v3 - with: - name: ${{matrix.python-version}}-sdist.gz - path: dist/*.gz - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: false - environment-name: pandas-sdist - extra-specs: | - python =${{ matrix.python-version }} - - - name: Install pandas from sdist - run: | - pip list - python -m pip install dist/*.gz - - - name: Force oldest supported NumPy - run: | - case "${{matrix.python-version}}" in - 3.8) - pip install numpy==1.20.3 ;; - 3.9) - pip install numpy==1.20.3 ;; - 3.10) - pip install numpy==1.21.2 ;; - esac - - - name: Import pandas - run: | - cd .. - conda list - python -c "import pandas; pandas.show_versions();" diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index 69656be18a8b1..792afe8f4faf5 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -11,9 +11,10 @@ jobs: stale: permissions: pull-requests: write - runs-on: ubuntu-latest + if: github.repository_owner == 'pandas-dev' + runs-on: ubuntu-22.04 steps: - - uses: actions/stale@v4 + - uses: actions/stale@v9 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please [update](https://pandas.pydata.org/pandas-docs/stable/development/contributing.html#updating-your-pull-request) and respond to this comment if you're still interested in working on this." 
diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml deleted file mode 100644 index a759280c74521..0000000000000 --- a/.github/workflows/ubuntu.yml +++ /dev/null @@ -1,156 +0,0 @@ -name: Ubuntu - -on: - push: - branches: - - main - - 1.4.x - pull_request: - branches: - - main - - 1.4.x - paths-ignore: - - "doc/**" - -env: - PANDAS_CI: 1 - -permissions: - contents: read - -jobs: - pytest: - runs-on: ubuntu-latest - defaults: - run: - shell: bash -el {0} - timeout-minutes: 120 - strategy: - matrix: - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml] - pattern: ["not single_cpu", "single_cpu"] - # Don't test pyarrow v2/3: Causes timeouts in read_csv engine - # even if tests are skipped/xfailed - pyarrow_version: ["5", "6", "7"] - include: - - name: "Downstream Compat" - env_file: actions-38-downstream_compat.yaml - pattern: "not slow and not network and not single_cpu" - pytest_target: "pandas/tests/test_downstream.py" - - name: "Minimum Versions" - env_file: actions-38-minimum_versions.yaml - pattern: "not slow and not network and not single_cpu" - - name: "Locale: it_IT.utf8" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - extra_apt: "language-pack-it" - lang: "it_IT.utf8" - lc_all: "it_IT.utf8" - - name: "Locale: zh_CN.utf8" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - extra_apt: "language-pack-zh-hans" - lang: "zh_CN.utf8" - lc_all: "zh_CN.utf8" - - name: "Data Manager" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - pandas_data_manager: "array" - - name: "Pypy" - env_file: actions-pypy-38.yaml - pattern: "not slow and not network and not single_cpu" - test_args: "--max-worker-restart 0" - - name: "Numpy Dev" - env_file: actions-310-numpydev.yaml - pattern: "not slow and not network and not single_cpu" - pandas_testing_mode: "deprecate" - test_args: "-W error::DeprecationWarning:numpy" - exclude: - - env_file: actions-39.yaml - pyarrow_version: "6" - - env_file: actions-39.yaml - pyarrow_version: "7" - - env_file: actions-310.yaml - pyarrow_version: "6" - - env_file: actions-310.yaml - pyarrow_version: "7" - fail-fast: false - name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }} - env: - ENV_FILE: ci/deps/${{ matrix.env_file }} - PATTERN: ${{ matrix.pattern }} - EXTRA_APT: ${{ matrix.extra_apt || '' }} - LANG: ${{ matrix.lang || '' }} - LC_ALL: ${{ matrix.lc_all || '' }} - PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }} - PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }} - TEST_ARGS: ${{ matrix.test_args || '' }} - PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }} - PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} - IS_PYPY: ${{ contains(matrix.env_file, 'pypy') }} - # TODO: re-enable coverage on pypy, its slow - COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }} - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }} - cancel-in-progress: true - - services: - mysql: - image: mysql - env: - MYSQL_ALLOW_EMPTY_PASSWORD: yes - MYSQL_DATABASE: pandas - options: >- - --health-cmd "mysqladmin ping" - --health-interval 10s - --health-timeout 5s - --health-retries 
5 - ports: - - 3306:3306 - - postgres: - image: postgres - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: pandas - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 - - moto: - image: motoserver/moto - env: - AWS_ACCESS_KEY_ID: foobar_key - AWS_SECRET_ACCESS_KEY: foobar_secret - ports: - - 5000:5000 - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Extra installs - # xsel for clipboard tests - run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: ${{ env.ENV_FILE }} - pyarrow-version: ${{ matrix.pyarrow_version }} - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Test - uses: ./.github/actions/run-tests - # TODO: Don't continue on error for PyPy - continue-on-error: ${{ env.IS_PYPY == 'true' }} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000000000..08c41a1eeb21f --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,468 @@ +name: Unit Tests + +on: + push: + branches: + - main + - 2.3.x + pull_request: + branches: + - main + - 2.3.x + paths-ignore: + - "doc/**" + - "web/**" + +permissions: + contents: read + +defaults: + run: + shell: bash -el {0} + +jobs: + ubuntu: + runs-on: ${{ matrix.platform }} + timeout-minutes: 90 + strategy: + matrix: + platform: [ubuntu-22.04, ubuntu-24.04-arm] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] + # Prevent the include jobs from overriding other jobs + pattern: [""] + pandas_future_infer_string: ["0"] + include: + - name: "Downstream Compat" + env_file: actions-311-downstream_compat.yaml + pattern: "not slow and not network and not single_cpu" + pytest_target: "pandas/tests/test_downstream.py" + platform: ubuntu-22.04 + - name: "Minimum Versions" + env_file: actions-310-minimum_versions.yaml + pattern: "not slow and not network and not single_cpu" + platform: ubuntu-22.04 + - name: "Locale: it_IT" + env_file: actions-311.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-it" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "it_IT.utf8" + lc_all: "it_IT.utf8" + # Also install it_IT (its encoding is ISO8859-1) but do not activate it. + # It will be temporarily activated during tests with locale.setlocale + extra_loc: "it_IT" + platform: ubuntu-22.04 + - name: "Locale: zh_CN" + env_file: actions-311.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-zh-hans" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "zh_CN.utf8" + lc_all: "zh_CN.utf8" + # Also install zh_CN (its encoding is gb2312) but do not activate it. 
+ # It will be temporarily activated during tests with locale.setlocale + extra_loc: "zh_CN" + platform: ubuntu-22.04 + - name: "Future infer strings" + env_file: actions-312.yaml + pandas_future_infer_string: "1" + platform: ubuntu-22.04 + - name: "Future infer strings (without pyarrow)" + env_file: actions-311.yaml + pandas_future_infer_string: "1" + platform: ubuntu-22.04 + - name: "Pypy" + env_file: actions-pypy-39.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "--max-worker-restart 0" + platform: ubuntu-22.04 + - name: "Numpy Dev" + env_file: actions-311-numpydev.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" + platform: ubuntu-22.04 + - name: "Pyarrow Nightly" + env_file: actions-311-pyarrownightly.yaml + pattern: "not slow and not network and not single_cpu" + pandas_future_infer_string: "1" + platform: ubuntu-22.04 + fail-fast: false + name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }} + env: + PATTERN: ${{ matrix.pattern }} + LANG: ${{ matrix.lang || 'C.UTF-8' }} + LC_ALL: ${{ matrix.lc_all || '' }} + PANDAS_CI: '1' + PANDAS_FUTURE_INFER_STRING: ${{ matrix.pandas_future_infer_string || '0' }} + TEST_ARGS: ${{ matrix.test_args || '' }} + PYTEST_WORKERS: 'auto' + PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + # Clipboard tests + QT_QPA_PLATFORM: offscreen + REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }} + cancel-in-progress: true + + services: + mysql: + image: mysql:9 + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres:17 + env: + PGUSER: postgres + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + moto: + image: motoserver/moto:5.0.27 + env: + AWS_ACCESS_KEY_ID: foobar_key + AWS_SECRET_ACCESS_KEY: foobar_secret + ports: + - 5000:5000 + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Extra installs + # https://pytest-qt.readthedocs.io/en/latest/troubleshooting.html#github-actions-azure-pipelines-travis-ci-and-gitlab-ci-cd + run: sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 ${{ matrix.extra_apt || ''}} + + - name: Generate extra locales + # These extra locales will be available for locale.setlocale() calls in tests + run: sudo locale-gen ${{ matrix.extra_loc }} + if: ${{ matrix.extra_loc }} + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ci/deps/${{ matrix.env_file }} + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + # TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge + if: ${{ matrix.name != 'Pypy' }} + + - name: Test (not single_cpu) + uses: ./.github/actions/run-tests + if: ${{ matrix.name != 'Pypy' }} + env: + # Set pattern to not single_cpu if not already set + PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || 
matrix.pattern }} + + - name: Test (single_cpu) + uses: ./.github/actions/run-tests + env: + PATTERN: 'single_cpu' + PYTEST_WORKERS: 0 + if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} + + macos-windows: + timeout-minutes: 90 + strategy: + matrix: + # Note: Don't use macOS latest since macos 14 appears to be arm64 only + os: [macos-13, macos-14, windows-latest] + env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml] + fail-fast: false + runs-on: ${{ matrix.os }} + name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} + cancel-in-progress: true + env: + PANDAS_CI: 1 + PYTEST_TARGET: pandas + PATTERN: "not slow and not db and not network and not single_cpu" + PYTEST_WORKERS: 'auto' + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ci/deps/${{ matrix.env_file }} + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Test + uses: ./.github/actions/run-tests + + Linux-32-bit: + runs-on: ubuntu-22.04 + container: + image: quay.io/pypa/manylinux2014_i686 + options: --platform linux/386 + steps: + - name: Checkout pandas Repo + # actions/checkout does not work since it requires node + run: | + git config --global --add safe.directory $PWD + + if [ $GITHUB_EVENT_NAME != pull_request ]; then + git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git reset --hard $GITHUB_SHA + else + git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git fetch origin $GITHUB_REF:my_ref_name + git checkout $GITHUB_BASE_REF + git -c user.email="you@example.com" merge --no-commit my_ref_name + fi + - name: Build environment and Run Tests + # https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388 + run: | + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install numpy -Csetup-args="-Dallow-noblas=true" + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 + python -m pip install --no-cache-dir --no-build-isolation -e . 
-Csetup-args="--werror" + python -m pip list --no-cache-dir + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit + cancel-in-progress: true + + Linux-Musl: + runs-on: ubuntu-22.04 + container: + image: quay.io/pypa/musllinux_1_2_x86_64 + steps: + - name: Checkout pandas Repo + # actions/checkout does not work since it requires node + run: | + git config --global --add safe.directory $PWD + + if [ $GITHUB_EVENT_NAME != pull_request ]; then + git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git reset --hard $GITHUB_SHA + else + git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git fetch origin $GITHUB_REF:my_ref_name + git checkout $GITHUB_BASE_REF + git -c user.email="you@example.com" merge --no-commit my_ref_name + fi + - name: Configure System Packages + run: | + apk update + apk add musl-locales + - name: Build environment + run: | + /opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0 + python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror" + python -m pip list --no-cache-dir + + - name: Run Tests + run: | + . ~/virtualenvs/pandas-dev/bin/activate + PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-musl + cancel-in-progress: true + + python-dev: + # This job may or may not run depending on the state of the next + # unreleased Python version. DO NOT DELETE IT. + # + # In general, this will remain frozen (present, but not running) until: + # - The next unreleased Python version has released beta 1 + # - This version should be available on GitHub Actions. + # - Our required build/runtime dependencies (numpy, Cython, python-dateutil) + # support that unreleased Python version. + # To unfreeze, comment out the ``if: false`` condition, and make sure you update + # the name of the workflow and Python version in actions/setup-python ``python-version:`` + # + # After it has been unfrozen, this file should remain unfrozen (present, and running) until: + # - The next Python version has been officially released. + # OR + # - Most/All of our optional dependencies support the next Python version AND + # - The next Python version has released an rc (we are guaranteed a stable ABI). + # To freeze this file, uncomment the ``if: false`` condition, and migrate the jobs + # to the corresponding posix/windows-macos/sdist etc. workflows. + # Feel free to modify this comment as necessary.
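The freeze/unfreeze procedure described in the comment above comes down to toggling a single job-level condition. A minimal sketch of the mechanism, for illustration (the job and step names are hypothetical; only the ``if: false`` idiom comes from the diff):

jobs:
  frozen-example:
    # While this condition is present, GitHub Actions skips the job entirely,
    # yet the job stays defined in the workflow file, i.e. it is "frozen".
    if: false
    runs-on: ubuntu-22.04
    steps:
      # Commenting out the "if: false" line above "unfreezes" the job.
      - run: echo "job unfrozen"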
+ # if: false # Uncomment this to freeze the workflow, comment it to unfreeze + defaults: + run: + shell: bash -eou pipefail {0} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + # Separate out macOS 13 and 14, since macOS 14 is arm64 only + os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest] + + timeout-minutes: 90 + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-python-dev + cancel-in-progress: true + + env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + PYTEST_TARGET: pandas + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python Dev Version + uses: actions/setup-python@v5 + with: + python-version: '3.13-dev' + + - name: Build Environment + run: | + python --version + python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy + python -m pip install versioneer[toml] python-dateutil tzdata cython hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror" + python -m pip list + + - name: Run Tests + uses: ./.github/actions/run-tests + + python-freethreading: + defaults: + run: + shell: bash -eou pipefail {0} + runs-on: ubuntu-22.04 + + timeout-minutes: 90 + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-python-freethreading-dev + cancel-in-progress: true + + env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + PYTEST_TARGET: pandas + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python Free-threading Version + uses: deadsnakes/action@v3.2.0 + with: + python-version: 3.13-dev + nogil: true + + - name: Build Environment + run: | + python --version + python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1 + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython + python -m pip install versioneer[toml] python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov + python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror" + python -m pip list + + - name: Run Tests + uses: ./.github/actions/run-tests + + # NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml + emscripten: + # Note: the Python version, Emscripten toolchain version are determined + # by the Pyodide version. 
The appropriate versions can be found in the + # Pyodide repodata.json "info" field, or in the Makefile.envs file: + # https://github.com/pyodide/pyodide/blob/stable/Makefile.envs#L2 + # The Node.js version can be determined via Pyodide: + # https://pyodide.org/en/stable/usage/index.html#node-js + name: Pyodide build + runs-on: ubuntu-22.04 + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-wasm + cancel-in-progress: true + steps: + - name: Checkout pandas Repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python for pyodide-build + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Set up Emscripten toolchain + uses: mymindstorm/setup-emsdk@v14 + with: + version: '3.1.58' + actions-cache-folder: emsdk-cache + + - name: Install pyodide-build + run: pip install "pyodide-build>=0.29.2" + + - name: Build pandas for Pyodide + run: | + pyodide build + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Set up Pyodide virtual environment + env: + pyodide-version: '0.27.1' + run: | + pyodide xbuildenv install ${{ env.pyodide-version }} + pyodide venv .venv-pyodide + source .venv-pyodide/bin/activate + pip install dist/*.whl + + - name: Test pandas for Pyodide + env: + PANDAS_CI: 1 + run: | + source .venv-pyodide/bin/activate + pip install pytest hypothesis + # do not import pandas from the checked out repo + cd .. + python -c 'import pandas as pd; pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db"])' diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000000000..2dcc79085734b --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,201 @@ +# Workflow to build wheels for upload to PyPI. +# Inspired by numpy's cibuildwheel config https://github.com/numpy/numpy/blob/main/.github/workflows/wheels.yml +# +# In an attempt to save CI resources, wheel builds do +# not run on each push but only weekly and for releases. +# Wheel builds can be triggered from the Actions page +# (if you have the permissions) on a commit to main. +# +# Alternatively, you can add labels to the pull request in order to trigger wheel +# builds. +# The label(s) that trigger builds are: +# - Build +name: Wheel builder + +on: + schedule: + # 3:27 UTC every day + - cron: "27 3 * * *" + push: + pull_request: + types: [labeled, opened, synchronize, reopened] + paths-ignore: + - "doc/**" + - "web/**" + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + build_sdist: + name: Build sdist + if: >- + (github.event_name == 'schedule') || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'Build')) || + (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! 
endsWith(github.ref, 'dev0'))) + runs-on: ubuntu-22.04 + env: + IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} + IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} + outputs: + sdist_file: ${{ steps.save-path.outputs.sdist_name }} + steps: + - name: Checkout pandas + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Build sdist + run: | + python -m pip install build + python -m build --sdist + + - uses: actions/upload-artifact@v4 + with: + name: sdist + path: ./dist/* + + - name: Sanity check sdist files + run: | + ls ./dist + + - name: Output sdist name + id: save-path + shell: bash -el {0} + run: echo "sdist_name=$(ls ./dist)" >> "$GITHUB_OUTPUT" + + build_wheels: + needs: build_sdist + name: Build wheel for ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + if: >- + (github.event_name == 'schedule') || + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && + contains(github.event.pull_request.labels.*.name, 'Build')) || + (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) + runs-on: ${{ matrix.buildplat[0] }} + strategy: + fail-fast: false + matrix: + # GitHub Actions doesn't support pairing matrix values together, let's improvise + # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 + buildplat: + - [ubuntu-22.04, manylinux_x86_64] + - [ubuntu-22.04, musllinux_x86_64] + - [ubuntu-24.04-arm, manylinux_aarch64] + - [macos-13, macosx_x86_64] + # Note: M1 images on Github Actions start from macOS 14 + - [macos-14, macosx_arm64] + - [windows-2022, win_amd64] + # TODO: support PyPy? + python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] + include: + # TODO: Remove this plus installing build deps in cibw_before_build.sh + # after pandas can be built with a released NumPy/Cython + - python: ["cp313t", "3.13"] + cibw_build_frontend: 'pip; args: --no-build-isolation' + # Build Pyodide wheels and upload them to Anaconda.org + # NOTE: this job is similar to the one in unit-tests.yml except for the fact + # that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup. + - buildplat: [ubuntu-22.04, pyodide_wasm32] + python: ["cp312", "3.12"] + cibw_build_frontend: 'build' + + env: + IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} + IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} + steps: + - name: Checkout pandas + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + # TODO: Build wheels from sdist again + # There's some sort of weird race condition? 
+ # within Github that makes the sdist be missing files + + # We need to build wheels from the sdist since the sdist + # removes unnecessary files from the release + - name: Download sdist (not macOS) + #if: ${{ matrix.buildplat[1] != 'macosx_*' }} + uses: actions/download-artifact@v4 + with: + name: sdist + path: ./dist + + - name: Output sdist name (macOS) + id: save-path + shell: bash -el {0} + run: echo "sdist_name=$(ls ./dist)" >> "$GITHUB_ENV" + + # Python version used to build sdist doesn't matter + # wheel will be built from sdist with the correct version + - name: Unzip sdist (macOS) + if: ${{ startsWith(matrix.buildplat[1], 'macosx') }} + run: | + tar -xzf ./dist/${{ env.sdist_name }} -C ./dist + + - name: Output sdist name (macOS) + id: save-path2 + shell: bash -el {0} + run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" + + - name: Build wheels + uses: pypa/cibuildwheel@v2.23.1 + with: + package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} + env: + CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }} + CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} + + - name: Set up Python + uses: mamba-org/setup-micromamba@v2 + with: + environment-name: wheel-env + # Use a fixed Python, since we might have an unreleased Python not + # yet present on conda-forge + create-args: >- + python=3.11 + anaconda-client + wheel + cache-downloads: true + cache-environment: true + + - name: Validate wheel RECORD + shell: bash -el {0} + run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done + + - uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + path: ./wheelhouse/*.whl + + - name: Upload wheels & sdist + if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} + shell: bash -el {0} + env: + PANDAS_STAGING_UPLOAD_TOKEN: ${{ secrets.PANDAS_STAGING_UPLOAD_TOKEN }} + PANDAS_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.PANDAS_NIGHTLY_UPLOAD_TOKEN }} + # trigger an upload to + # https://anaconda.org/scientific-python-nightly-wheels/pandas + # for cron jobs or "Run workflow" (restricted to main branch). 
+ # Tags will upload to + # https://anaconda.org/multibuild-wheels-staging/pandas + # The tokens were originally generated at anaconda.org + run: | + source ci/upload_wheels.sh + set_upload_vars + upload_wheels diff --git a/.gitignore b/.gitignore index 07b1f056d511b..d951f3fb9cbad 100644 --- a/.gitignore +++ b/.gitignore @@ -36,7 +36,11 @@ *.py[ocd] *.so .build_cache_dir +.mesonpy-native-file.ini MANIFEST +compile_commands.json +debug +.debug # Python files # ################ @@ -53,6 +57,9 @@ dist # type checkers pandas/py.typed +# pyenv +.python-version + # tox testing tool .tox # rope @@ -67,11 +74,14 @@ coverage.xml coverage_html_report .mypy_cache *.pytest_cache +.ruff_cache # hypothesis test database .hypothesis/ __pycache__ # pytest-monkeytype monkeytype.sqlite3 +# meson editable install folder +.mesonpy # OS generated files # @@ -95,10 +105,11 @@ scikits # Generated Sources # ##################### !skts.c -!np_datetime.c -!np_datetime_strings.c *.c *.cpp +!pandas/_libs/src/**/*.c +!pandas/_libs/src/**/*.h +!pandas/_libs/include/**/*.h # Unit / Performance Testing # ############################## @@ -126,3 +137,7 @@ doc/source/savefig/ # Interactive terminal generated files # ######################################## .jupyterlite.doit.db + +# Pyodide/WASM related files # +############################## +/.pyodide-xbuildenv-* diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000000000..5bf028750f30f --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,60 @@ +# Building pandas on init +# Might delegate this later to prebuild with Q2 improvements on gitpod +# https://www.gitpod.io/docs/config-start-tasks/#configuring-the-terminal +# ------------------------------------------------------------------------- + +# images for gitpod pandas are in https://hub.docker.com/r/pandas/pandas-gitpod/tags +# we're using the Dockerfile in the base of the repo +image: + file: Dockerfile +tasks: + - name: Prepare development environment + init: | + mkdir -p .vscode + cp gitpod/settings.json .vscode/settings.json + git fetch --tags + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true + pre-commit install --install-hooks + command: | + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true + echo "✨ Pre-build complete! You can close this terminal ✨ " + +# -------------------------------------------------------- +# exposing ports for liveserve +ports: + - port: 5500 + onOpen: notify + +# -------------------------------------------------------- +# some useful extensions to have +vscode: + extensions: + - ms-python.python + - yzhang.markdown-all-in-one + - eamodio.gitlens + - lextudio.restructuredtext + - ritwickdey.liveserver + # add or remove what you think is generally useful to most contributors + # avoid adding too many. 
They each open a pop-up window + +# -------------------------------------------------------- +# Using prebuilds for the container +# With this configuration the prebuild will happen on push to main +github: + prebuilds: + # enable for main/default branch + main: true + # enable for other branches (defaults to false) + branches: false + # enable for pull requests coming from this repo (defaults to true) + pullRequests: false + # enable for pull requests coming from forks (defaults to false) + pullRequestsFromForks: false + # add a check to pull requests (defaults to true) + addCheck: false + # add a "Review in Gitpod" button as a comment to pull requests (defaults to false) + addComment: false + # add a "Review in Gitpod" button to the pull request's description (defaults to false) + addBadge: false + # add a label once the prebuild is ready to pull requests (defaults to false) + addLabel: false diff --git a/.pep8speaks.yml b/.pep8speaks.yml deleted file mode 100644 index 5a83727ddf5f8..0000000000000 --- a/.pep8speaks.yml +++ /dev/null @@ -1,4 +0,0 @@ -# File : .pep8speaks.yml - -scanner: - diff_only: True # If True, errors caused by only the patch are shown diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dbddba57ef21c..09bfda1755e03 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,87 +1,116 @@ minimum_pre_commit_version: 2.15.0 exclude: ^LICENSES/|\.(html|csv|svg)$ -# reserve "manual" for mypy and pyright -default_stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, post-checkout, post-commit, post-merge, post-rewrite] +# reserve "manual" for relatively slow hooks which we still want to run in CI +default_stages: [ + pre-commit, + pre-merge-commit, + pre-push, + prepare-commit-msg, + commit-msg, + post-checkout, + post-commit, + post-merge, + post-rewrite +] ci: autofix_prs: false + autoupdate_schedule: monthly + # manual stage hooks + skip: [pyright, mypy] repos: -- repo: https://github.com/MarcoGorelli/absolufy-imports - rev: v0.3.1 +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.9 hooks: - - id: absolufy-imports - files: ^pandas/ + - id: ruff + args: [--exit-non-zero-on-fix] + exclude: ^pandas/tests/frame/test_query_eval.py + - id: ruff + # TODO: remove autofix-only rules when they are checked by ruff + name: ruff-selected-autofixes + alias: ruff-selected-autofixes + files: ^pandas + exclude: ^pandas/tests + args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] + - id: ruff-format + exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.4' + rev: 'v2.14' hooks: - id: vulture entry: python scripts/run_vulture.py pass_filenames: true require_serial: false -- repo: https://github.com/python/black - rev: 22.6.0 - hooks: - - id: black - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 + rev: v2.4.1 hooks: - id: codespell - types_or: [python, rst, markdown] + types_or: [python, rst, markdown, cython, c] + additional_dependencies: [tomli] +- repo: https://github.com/MarcoGorelli/cython-lint + rev: v0.16.6 + hooks: + - id: cython-lint + - id: double-quote-cython-strings - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v5.0.0 hooks: - - id: debug-statements + - id: check-case-conflict + - id: check-toml + - id: check-xml + - id: check-yaml + exclude: ^ci/meta.yaml$ - id: end-of-file-fixer exclude: \.txt$ - stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, post-checkout,
post-commit, post-merge, post-rewrite] + - id: mixed-line-ending + args: [--fix=auto] + exclude: ^pandas/tests/io/parser/data/utf16_ex.txt$ + - id: fix-byte-order-marker + - id: fix-encoding-pragma + args: [--remove] - id: trailing-whitespace - stages: [commit, merge-commit, push, prepare-commit-msg, commit-msg, post-checkout, post-commit, post-merge, post-rewrite] -- repo: https://github.com/cpplint/cpplint - rev: 1.6.0 - hooks: - - id: cpplint - # We don't lint all C files because we don't want to lint any that are built - # from Cython files nor do we want to lint C files that we didn't modify for - # this particular codebase (e.g. src/headers, src/klib). However, - # we can lint all header files since they aren't "generated" like C files are. - exclude: ^pandas/_libs/src/(klib|headers)/ - args: [--quiet, '--extensions=c,h', '--headers=h', --recursive, '--filter=-readability/casting,-runtime/int,-build/include_subdir'] -- repo: https://github.com/PyCQA/flake8 - rev: 4.0.1 - hooks: - - id: flake8 - additional_dependencies: &flake8_dependencies - - flake8==4.0.1 - - flake8-comprehensions==3.7.0 - - flake8-bugbear==21.3.2 - - pandas-dev-flaker==0.5.0 + args: [--markdown-linebreak-ext=md] - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + rev: 6.0.1 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.34.0 + rev: v3.19.1 hooks: - id: pyupgrade - args: [--py38-plus] + args: [--py310-plus] - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.9.0 + rev: v1.10.0 hooks: - id: rst-backticks - id: rst-directive-colons types: [text] # overwrite types: [rst] types_or: [python, rst] - id: rst-inline-touching-normal + exclude: ^pandas/tests/frame/test_query_eval.py types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.6.1 + rev: v1.0.0 hooks: - id: sphinx-lint -- repo: https://github.com/asottile/yesqa - rev: v1.3.0 + args: ["--enable", "all", "--disable", "line-too-long"] +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v19.1.7 + hooks: + - id: clang-format + files: ^pandas/_libs/src|^pandas/_libs/include + args: [-i] + types_or: [c, c++] +- repo: https://github.com/trim21/pre-commit-mirror-meson + rev: v1.7.0 hooks: - - id: yesqa - additional_dependencies: *flake8_dependencies + - id: meson-fmt + args: ['--inplace'] +- repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck + args: ["--severity=warning"] - repo: local hooks: - id: pyright @@ -93,11 +122,11 @@ repos: types: [python] stages: [manual] additional_dependencies: &pyright_dependencies - - pyright@1.1.264 - - id: pyright_reportGeneralTypeIssues + - pyright@1.1.383 + - id: pyright # note: assumes python env is setup and activated name: pyright reportGeneralTypeIssues - entry: pyright --skipunannotated -p pyright_reportGeneralTypeIssues.json + entry: pyright -p pyright_reportGeneralTypeIssues.json --level warning language: node pass_filenames: false types: [python] @@ -111,7 +140,7 @@ repos: pass_filenames: false types: [python] stages: [manual] - - id: stubtest + - id: mypy # note: assumes python env is setup and activated # note: requires pandas dev to be installed name: mypy (stubtest) @@ -121,22 +150,26 @@ repos: types: [pyi] args: [scripts/run_stubtest.py] stages: [manual] - - id: flake8-rst - name: flake8-rst - description: Run flake8 on code snippets in docstrings or RST files + - id: inconsistent-namespace-usage + name: 'Check for inconsistent use of pandas namespace' + 
entry: python scripts/check_for_inconsistent_pandas_namespace.py + exclude: ^pandas/core/interchange/ language: python - entry: flake8-rst - types: [rst] - args: [--filename=*.rst] - additional_dependencies: [flake8-rst==0.7.0, flake8==3.7.9] + types: [python] - id: unwanted-patterns name: Unwanted patterns language: pygrep entry: | (?x) - # outdated annotation syntax, missing error codes + # outdated annotation syntax \#\ type:\ (?!ignore) - |\#\ type:\s?ignore(?!\[) + + # foo.__class__ instead of type(foo) + |\.__class__ + + # Numpy + |from\ numpy\ import\ random + |from\ numpy\.random\ import # Incorrect code-block / IPython directives |\.\.\ code-block\ :: @@ -146,12 +179,11 @@ repos: # Check for deprecated messages without sphinx directive |(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.) + + # builtin filter function + |(?<!def)[\(\s]filter\( - - id: cython-casting - name: Check Cython casting is `<type>obj`, not `<type> obj` - language: pygrep - entry: '[a-zA-Z0-9*]> ' - files: (\.pyx|\.pxi.in)$ + exclude: ^doc/source/development/code_style\.rst # contains examples of patterns to avoid - id: incorrect-backticks name: Check for backticks incorrectly rendering because of missing spaces language: pygrep @@ -164,59 +196,84 @@ repos: entry: 'np\.random\.seed' files: ^asv_bench/benchmarks exclude: ^asv_bench/benchmarks/pandas_vb_common\.py - - id: np-testing-array-equal - name: Check for usage of numpy testing or array_equal + - id: unwanted-patterns-in-tests + name: Unwanted patterns in tests language: pygrep - entry: '(numpy|np)(\.testing|\.array_equal)' + entry: | + (?x) + # imports from pandas._testing instead of `import pandas._testing as tm` + from\ pandas\._testing\ import + |from\ pandas\ import\ _testing\ as\ tm + + # pandas.testing instead of tm + |pd\.testing\. + + # pd.api.types instead of from pandas.api.types import ... + |(pd|pandas)\.api\.types\.
+ + # np.array_equal + |(numpy|np)\.array_equal + + # pytest raises without context + |\s\ pytest.raises + + # Unseeded numpy default_rng + |default_rng\(\) files: ^pandas/tests/ - types: [python] - - id: invalid-ea-testing - name: Check for invalid EA testing + types_or: [python, cython, rst] + - id: unwanted-patterns-in-cython + name: Unwanted patterns in Cython code language: pygrep - entry: 'tm\.assert_(series|frame)_equal' - files: ^pandas/tests/extension/base - types: [python] - exclude: ^pandas/tests/extension/base/base\.py + entry: | + (?x) + # `<type>obj` as opposed to `<type> obj` + [a-zA-Z0-9*]>[ ] + types: [cython] - id: pip-to-conda name: Generate pip dependency from conda - description: This hook checks if the conda environment.yml and requirements-dev.txt are equal language: python entry: python scripts/generate_pip_deps_from_conda.py files: ^(environment.yml|requirements-dev.txt)$ pass_filenames: false - additional_dependencies: [pyyaml, toml] - - id: sync-flake8-versions - name: Check flake8 version is synced across flake8, yesqa, and environment.yml - language: python - entry: python scripts/sync_flake8_versions.py - files: ^(\.pre-commit-config\.yaml|environment\.yml)$ - pass_filenames: false - additional_dependencies: [pyyaml] + additional_dependencies: [tomli, pyyaml] - id: title-capitalization name: Validate correct capitalization among titles in documentation entry: python scripts/validate_rst_title_capitalization.py language: python types: [rst] files: ^doc/source/(development|reference)/ - - id: use-pd_array-in-core - name: Import pandas.array as pd_array in core + - id: unwanted-patterns-private-function-across-module + name: Check for use of private functions across modules language: python - entry: python scripts/use_pd_array_in_core.py - files: ^pandas/core/ - exclude: ^pandas/core/api\.py$ + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" types: [python] - - id: use-io-common-urlopen - name: Use pandas.io.common.urlopen instead of urllib.request.urlopen + exclude: ^(asv_bench|pandas/tests|doc)/ + - id: unwanted-patterns-private-import-across-module + name: Check for import of private attributes across modules language: python - entry: python scripts/use_io_common_urlopen.py - files: ^pandas/ - exclude: ^pandas/tests/ + entry: python scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" types: [python] - - id: no-bool-in-core-generic - name: Use bool_t instead of bool in pandas/core/generic.py - entry: python scripts/no_bool_in_generic.py + exclude: | + (?x) + ^(asv_bench|pandas/tests|doc)/ + |scripts/validate_min_versions_in_sync\.py$ + - id: unwanted-patterns-strings-with-misplaced-whitespace + name: Check for strings with misplaced spaces language: python - files: ^pandas/core/generic\.py$ + entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" + types_or: [python, cython] + - id: unwanted-patterns-nodefault-used-not-only-for-typing + name: Check that `pandas._libs.lib.NoDefault` is used only for typing + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="nodefault_used_not_only_for_typing" + types: [python] + - id: no-return-exception + name: Use raise instead of return for exceptions + language: pygrep + entry: 'return [A-Za-z]+(Error|Exit|Interrupt|Exception|Iteration)' + files: ^pandas/ + types: [python] + exclude: ^pandas/tests/ - id: pandas-errors-documented name: Ensure pandas errors
are documented in doc/source/reference/testing.rst entry: python scripts/pandas_errors_documented.py @@ -230,26 +287,29 @@ repos: types: [yaml] - id: validate-min-versions-in-sync name: Check minimum version of dependencies are aligned - entry: python scripts/validate_min_versions_in_sync.py + entry: python -m scripts.validate_min_versions_in_sync language: python files: ^(ci/deps/actions-.*-minimum_versions\.yaml|pandas/compat/_optional\.py)$ - - id: flake8-pyi - name: flake8-pyi - entry: flake8 --extend-ignore=E301,E302,E305,E701,E704 - types: [pyi] + additional_dependencies: [tomli, pyyaml] + pass_filenames: false + - id: validate-errors-locations + name: Validate errors locations + description: Validate errors are in appropriate locations. + entry: python scripts/validate_exception_location.py language: python - additional_dependencies: - - flake8==4.0.1 - - flake8-pyi==22.7.0 - - id: future-annotations - name: import annotations from __future__ - entry: 'from __future__ import annotations' - language: pygrep - args: [--negate] files: ^pandas/ + exclude: ^(pandas/_libs/|pandas/tests/|pandas/errors/__init__.py$|pandas/_version.py) types: [python] - exclude: | - (?x) - /(__init__\.py)|(api\.py)|(_version\.py)|(testing\.py)|(conftest\.py)$ - |/tests/ - |/_testing/ + - id: check-test-naming + name: check that test names start with 'test' + entry: python -m scripts.check_test_naming + types: [python] + files: ^pandas/tests + language: python + - id: sort-whatsnew-items + name: sort whatsnew entries alphabetically + entry: python -m scripts.sort_whatsnew_note + types: [rst] + language: python + files: ^doc/source/whatsnew/v + exclude: ^doc/source/whatsnew/v(0|1|2\.0\.0) diff --git a/CITATION.cff b/CITATION.cff index 0161dfa92fdef..11f45b0d87ec7 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -3,8 +3,50 @@ title: 'pandas-dev/pandas: Pandas' message: 'If you use this software, please cite it as below.' authors: - name: "The pandas development team" + website: "https://pandas.pydata.org/about/team.html" +abstract: "Pandas provides powerful data structures for data analysis, time series, and statistics."
+doi: 10.5281/zenodo.3509134 license: BSD-3-Clause license-url: "https://github.com/pandas-dev/pandas/blob/main/LICENSE" repository-code: "https://github.com/pandas-dev/pandas" +keywords: + - python + - data science + - flexible + - pandas + - alignment + - data analysis type: software -url: "https://github.com/pandas-dev/pandas" +url: "https://pandas.pydata.org/" +references: + - type: article + authors: + - given-names: Wes + family-names: McKinney + affiliation: AQR Capital Management, LLC + email: wesmckinn@gmail.com + title: Data Structures for Statistical Computing in Python + doi: 10.25080/Majora-92bf1922-00a + license: CC-BY-3.0 + start: 56 + end: 61 + year: 2010 + collection-title: Proceedings of the 9th Python in Science Conference + collection-doi: 10.25080/Majora-92bf1922-012 + collection-type: proceedings + editors: + - given-names: Stéfan + name-particle: van der + family-names: Walt + - given-names: Jarrod + family-names: Millman + conference: + name: 9th Python in Science Conference (SciPy 2010) + city: Austin, TX + country: US + date-start: "2010-06-28" + date-end: "2010-07-03" + keywords: + - data structure + - statistics + - R diff --git a/Dockerfile b/Dockerfile index 650ba14271092..4090a4adb1af8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,53 +1,17 @@ -FROM quay.io/condaforge/miniforge3 +FROM python:3.10.8 +WORKDIR /home/pandas -# if you forked pandas, you can pass in your own GitHub username to use your fork -# i.e. gh_username=myname -ARG gh_username=pandas-dev -ARG pandas_home="/home/pandas" +RUN apt-get update && apt-get -y upgrade +RUN apt-get install -y build-essential bash-completion -# Avoid warnings by switching to noninteractive -ENV DEBIAN_FRONTEND=noninteractive +# hdf5 needed for pytables installation +# libgles2-mesa needed for pytest-qt +RUN apt-get install -y libhdf5-dev libgles2-mesa-dev -# Configure apt and install packages -RUN apt-get update \ - && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \ - # - # Install tzdata and configure timezone (fix for tests which try to read from "/etc/localtime") - && apt-get -y install tzdata \ - && ln -fs /usr/share/zoneinfo/Etc/UTC /etc/localtime \ - && dpkg-reconfigure -f noninteractive tzdata \ - # - # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed - && apt-get -y install git iproute2 procps iproute2 lsb-release \ - # - # cleanup - && apt-get autoremove -y \ - && apt-get clean -y \ - && rm -rf /var/lib/apt/lists/* +RUN python -m pip install --upgrade pip +COPY requirements-dev.txt /tmp +RUN python -m pip install -r /tmp/requirements-dev.txt +RUN git config --global --add safe.directory /home/pandas -# Switch back to dialog for any ad-hoc use of apt-get -ENV DEBIAN_FRONTEND=dialog - -# Clone pandas repo -RUN mkdir "$pandas_home" \ - && git clone "https://github.com/$gh_username/pandas.git" "$pandas_home" \ - && cd "$pandas_home" \ - && git remote add upstream "https://github.com/pandas-dev/pandas.git" \ - && git pull upstream main - -# Because it is surprisingly difficult to activate a conda environment inside a DockerFile -# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89), -# we just update the base/root one from the 'environment.yml' file instead of creating a new one. -# -# Set up environment -RUN conda install -y mamba -RUN mamba env update -n base -f "$pandas_home/environment.yml" - -# Build C extensions and pandas -SHELL ["/bin/bash", "-c"] -RUN . 
/opt/conda/etc/profile.d/conda.sh \ - && conda activate base \ - && cd "$pandas_home" \ - && export \ - && python setup.py build_ext -j 4 \ - && python -m pip install --no-build-isolation -e . +ENV SHELL="/bin/bash" +CMD ["/bin/bash"] diff --git a/LICENSE b/LICENSE index d4e49a140f1cb..c343da2ebe870 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ BSD 3-Clause License Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. -Copyright (c) 2011-2022, Open source contributors. +Copyright (c) 2011-2025, Open source contributors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/LICENSES/BOTTLENECK_LICENCE b/LICENSES/BOTTLENECK_LICENCE new file mode 100644 index 0000000000000..f4bdbb1647ee6 --- /dev/null +++ b/LICENSES/BOTTLENECK_LICENCE @@ -0,0 +1,25 @@ +Copyright (c) 2010-2019 Keith Goodman +Copyright (c) 2019 Bottleneck Developers +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/DATEUTIL_LICENSE b/LICENSES/DATEUTIL_LICENSE index 6053d35cfc60b..1e65815cf0b31 100644 --- a/LICENSES/DATEUTIL_LICENSE +++ b/LICENSES/DATEUTIL_LICENSE @@ -51,4 +51,4 @@ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -The above BSD License Applies to all code, even that also covered by Apache 2.0. +The above BSD License Applies to all code, even that also covered by Apache 2.0. \ No newline at end of file diff --git a/LICENSES/KLIB_LICENSE b/LICENSES/KLIB_LICENSE index 0a996fae3360f..2de13e402643b 100644 --- a/LICENSES/KLIB_LICENSE +++ b/LICENSES/KLIB_LICENSE @@ -20,4 +20,4 @@ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. 
\ No newline at end of file diff --git a/LICENSES/MUSL_LICENSE b/LICENSES/MUSL_LICENSE index a8833d4bc4744..5ff71b5f5a7fe 100644 --- a/LICENSES/MUSL_LICENSE +++ b/LICENSES/MUSL_LICENSE @@ -1,7 +1,7 @@ musl as a whole is licensed under the following standard MIT license: ---------------------------------------------------------------------- -Copyright © 2005-2014 Rich Felker, et al. +Copyright © 2005-2020 Rich Felker, et al. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -25,37 +25,88 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Authors/contributors include: +A. Wilcox +Ada Worcester +Alex Dowad +Alex Suykov +Alexander Monakov +Andre McCurdy +Andrew Kelley Anthony G. Basile +Aric Belsito Arvid Picciani +Bartosz Brachaczek +Benjamin Peterson Bobby Bingham Boris Brezillon Brent Cook Chris Spiegel Clément Vasseur +Daniel Micay +Daniel Sabogal +Daurnimator +David Carlier +David Edelsohn +Denys Vlasenko +Dmitry Ivanov +Dmitry V. Levin +Drew DeVault Emil Renner Berthing +Fangrui Song +Felix Fietkau +Felix Janda +Gianluca Anzolin +Hauke Mehrtens +He X Hiltjo Posthuma Isaac Dunham +Jaydeep Patil Jens Gustedt Jeremy Huntwork +Jo-Philipp Wich +Joakim Sindholt John Spencer +Julien Ramseier Justin Cormack +Kaarle Ritvanen +Khem Raj +Kylie McClain +Leah Neukirchen Luca Barbato Luka Perkov M Farkas-Dyck (Strake) +Mahesh Bodapati +Markus Wichmann +Masanori Ogino +Michael Clark Michael Forney +Mikhail Kremnyov +Natanael Copa Nicholas J. Kain orc Pascal Cuoq +Patrick Oppenlander +Petr Hosek +Petr Skocik Pierre Carrier +Reini Urban Rich Felker Richard Pennington +Ryan Fairfax +Samuel Holland +Segev Finer +Shiz sin Solar Designer Stefan Kristiansson +Stefan O'Rear Szabolcs Nagy Timo Teräs +Trutz Behn Valentin Ochs +Will Dietz William Haddon +William Pitcock Portions of this software are derived from third-party works licensed under terms compatible with the above MIT license: @@ -71,18 +122,22 @@ Copyright © 1993,2004 Sun Microsystems or Copyright © 2003-2011 David Schultz or Copyright © 2003-2009 Steven G. Kargl or Copyright © 2003-2009 Bruce D. Evans or -Copyright © 2008 Stephen L. Moshier +Copyright © 2008 Stephen L. Moshier or +Copyright © 2017-2018 Arm Limited and labelled as such in comments in the individual source files. All have been licensed under extremely permissive terms. -The ARM memcpy code (src/string/armel/memcpy.s) is Copyright © 2008 +The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008 The Android Open Source Project and is licensed under a two-clause BSD license. It was taken from Bionic libc, used on Android. -The implementation of DES for crypt (src/misc/crypt_des.c) is +The AArch64 memcpy and memset code (src/string/aarch64/*) are +Copyright © 1999-2019, Arm Limited. + +The implementation of DES for crypt (src/crypt/crypt_des.c) is Copyright © 1994 David Burren. It is licensed under a BSD license. -The implementation of blowfish crypt (src/misc/crypt_blowfish.c) was +The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was originally written by Solar Designer and placed into the public domain. The code also comes with a fallback permissive license for use in jurisdictions that may not recognize the public domain. @@ -90,22 +145,17 @@ in jurisdictions that may not recognize the public domain. The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011 Valentin Ochs and is licensed under an MIT-style license. 
-The BSD PRNG implementation (src/prng/random.c) and XSI search API -(src/search/*.c) functions are Copyright © 2011 Szabolcs Nagy and -licensed under following terms: "Permission to use, copy, modify, -and/or distribute this code for any purpose with or without fee is -hereby granted. There is no warranty." - -The x86_64 port was written by Nicholas J. Kain. Several files (crt) -were released into the public domain; others are licensed under the -standard MIT license terms at the top of this file. See individual -files for their copyright status. +The x86_64 port was written by Nicholas J. Kain and is licensed under +the standard MIT terms. The mips and microblaze ports were originally written by Richard Pennington for use in the ellcc project. The original code was adapted by Rich Felker for build system and code conventions during upstream integration. It is licensed under the standard MIT terms. +The mips64 port was contributed by Imagination Technologies and is +licensed under the standard MIT terms. + The powerpc port was also originally written by Richard Pennington, and later supplemented and integrated by John Spencer. It is licensed under the standard MIT terms. @@ -118,15 +168,26 @@ can be found in the git version control history of the project. The omission of copyright and license comments in each file is in the interest of source tree size. -All public header files (include/* and arch/*/bits/*) should be -treated as Public Domain as they intentionally contain no content -which can be covered by copyright. Some source modules may fall in -this category as well. If you believe that a file is so trivial that -it should be in the Public Domain, please contact the authors and -request an explicit statement releasing it from copyright. +In addition, permission is hereby granted for all public header files +(include/* and arch/*/bits/*) and crt files intended to be linked into +applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit +the copyright notice and permission notice otherwise required by the +license, and to use these files without any requirement of +attribution. These files include substantial contributions from: + +Bobby Bingham +John Spencer +Nicholas J. Kain +Rich Felker +Richard Pennington +Stefan Kristiansson +Szabolcs Nagy -The following files are trivial, believed not to be copyrightable in -the first place, and hereby explicitly released to the Public Domain: +all of whom have explicitly granted such permission. -All public headers: include/*, arch/*/bits/* -Startup files: crt/* +This file previously contained text expressing a belief that most of +the files covered by the above exception were sufficiently trivial not +to be subject to copyright, resulting in confusion over whether it +negated the permissions granted in the license. In the spirit of +permissive licensing, and of not having licensing issues being an +obstacle to adoption, that text has been removed. \ No newline at end of file diff --git a/LICENSES/NUMPY_LICENSE b/LICENSES/NUMPY_LICENSE index 7e972cff80759..f2d647bf0bc48 100644 --- a/LICENSES/NUMPY_LICENSE +++ b/LICENSES/NUMPY_LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2005-2011, NumPy Developers. +Copyright (c) 2005-2023, NumPy Developers. All rights reserved. 
Redistribution and use in source and binary forms, with or without @@ -27,4 +27,4 @@ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/OTHER b/LICENSES/OTHER deleted file mode 100644 index 7446d68eb43a6..0000000000000 --- a/LICENSES/OTHER +++ /dev/null @@ -1,75 +0,0 @@ -Bottleneck license ------------------- - -Copyright (c) 2010-2012 Archipel Asset Management AB. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - -google-api-python-client license --------------------------------- - -Copyright (C) 2012 Google Inc. -All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -Pyperclip v1.3 license ----------------------- - -Copyright (c) 2010, Albert Sweigart -All rights reserved. - -BSD-style license: - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the pyperclip nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY Albert Sweigart "AS IS" AND ANY -EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL Albert Sweigart BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSES/PACKAGING_LICENSE b/LICENSES/PACKAGING_LICENSE index 4216ea1ce2379..2bfcd5297c470 100644 --- a/LICENSES/PACKAGING_LICENSE +++ b/LICENSES/PACKAGING_LICENSE @@ -199,4 +199,4 @@ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/PSF_LICENSE b/LICENSES/PSF_LICENSE index 5cdb01e8d24af..f26bcf4d2de6e 100644 --- a/LICENSES/PSF_LICENSE +++ b/LICENSES/PSF_LICENSE @@ -2,25 +2,24 @@ A. HISTORY OF THE SOFTWARE ========================== Python was created in the early 1990s by Guido van Rossum at Stichting -Mathematisch Centrum (CWI, see http://www.cwi.nl) in the Netherlands +Mathematisch Centrum (CWI, see https://www.cwi.nl) in the Netherlands as a successor of a language called ABC. Guido remains Python's principal author, although it includes many contributions from others. In 1995, Guido continued his work on Python at the Corporation for -National Research Initiatives (CNRI, see http://www.cnri.reston.va.us) +National Research Initiatives (CNRI, see https://www.cnri.reston.va.us) in Reston, Virginia where he released several versions of the software. In May 2000, Guido and the Python core development team moved to BeOpen.com to form the BeOpen PythonLabs team. In October of the same -year, the PythonLabs team moved to Digital Creations (now Zope -Corporation, see http://www.zope.com). In 2001, the Python Software -Foundation (PSF, see http://www.python.org/psf/) was formed, a -non-profit organization created specifically to own Python-related -Intellectual Property. Zope Corporation is a sponsoring member of -the PSF. - -All Python releases are Open Source (see http://www.opensource.org for +year, the PythonLabs team moved to Digital Creations, which became +Zope Corporation. In 2001, the Python Software Foundation (PSF, see +https://www.python.org/psf/) was formed, a non-profit organization +created specifically to own Python-related Intellectual Property. +Zope Corporation was a sponsoring member of the PSF. + +All Python releases are Open Source (see https://opensource.org for the Open Source Definition). Historically, most, but not all, Python releases have also been GPL-compatible; the table below summarizes the various releases. @@ -36,34 +35,9 @@ the various releases. 
2.1 2.0+1.6.1 2001 PSF no 2.0.1 2.0+1.6.1 2001 PSF yes 2.1.1 2.1+2.0.1 2001 PSF yes - 2.2 2.1.1 2001 PSF yes 2.1.2 2.1.1 2002 PSF yes 2.1.3 2.1.2 2002 PSF yes - 2.2.1 2.2 2002 PSF yes - 2.2.2 2.2.1 2002 PSF yes - 2.2.3 2.2.2 2003 PSF yes - 2.3 2.2.2 2002-2003 PSF yes - 2.3.1 2.3 2002-2003 PSF yes - 2.3.2 2.3.1 2002-2003 PSF yes - 2.3.3 2.3.2 2002-2003 PSF yes - 2.3.4 2.3.3 2004 PSF yes - 2.3.5 2.3.4 2005 PSF yes - 2.4 2.3 2004 PSF yes - 2.4.1 2.4 2005 PSF yes - 2.4.2 2.4.1 2005 PSF yes - 2.4.3 2.4.2 2006 PSF yes - 2.4.4 2.4.3 2006 PSF yes - 2.5 2.4 2006 PSF yes - 2.5.1 2.5 2007 PSF yes - 2.5.2 2.5.1 2008 PSF yes - 2.5.3 2.5.2 2008 PSF yes - 2.6 2.5 2008 PSF yes - 2.6.1 2.6 2008 PSF yes - 2.6.2 2.6.1 2009 PSF yes - 2.6.3 2.6.2 2009 PSF yes - 2.6.4 2.6.3 2009 PSF yes - 2.6.5 2.6.4 2010 PSF yes - 2.7 2.6 2010 PSF yes + 2.2 and above 2.1.1 2001-now PSF yes Footnotes: @@ -85,6 +59,17 @@ direction to make these releases possible. B. TERMS AND CONDITIONS FOR ACCESSING OR OTHERWISE USING PYTHON =============================================================== +Python software and documentation are licensed under the +Python Software Foundation License Version 2. + +Starting with Python 3.8.6, examples, recipes, and other code in +the documentation are dual licensed under the PSF License Version 2 +and the Zero-Clause BSD license. + +Some software incorporated into Python is under different licenses. +The licenses are listed with code falling under that license. + + PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 -------------------------------------------- @@ -98,9 +83,10 @@ grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use Python alone or in any derivative version, provided, however, that PSF's License Agreement and PSF's notice of copyright, -i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 -Python Software Foundation; All Rights Reserved" are retained in Python alone or -in any derivative version prepared by Licensee. +i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative version +prepared by Licensee. 3. In the event Licensee prepares a derivative work that is based on or incorporates Python or any part thereof, and wants to make @@ -205,9 +191,9 @@ version prepared by Licensee. Alternately, in lieu of CNRI's License Agreement, Licensee may substitute the following text (omitting the quotes): "Python 1.6.1 is made available subject to the terms and conditions in CNRI's License Agreement. This Agreement together with -Python 1.6.1 may be located on the Internet using the following +Python 1.6.1 may be located on the internet using the following unique, persistent identifier (known as a handle): 1895.22/1013. This -Agreement may also be obtained from a proxy server on the Internet +Agreement may also be obtained from a proxy server on the internet using the following URL: http://hdl.handle.net/1895.22/1013". 3. 
In the event Licensee prepares a derivative work that is based on @@ -277,3 +263,17 @@ FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +ZERO-CLAUSE BSD LICENSE FOR CODE IN THE PYTHON DOCUMENTATION +---------------------------------------------------------------------- + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. diff --git a/LICENSES/PYPERCLIP_LICENSE b/LICENSES/PYPERCLIP_LICENSE new file mode 100644 index 0000000000000..07cc746cd5ad6 --- /dev/null +++ b/LICENSES/PYPERCLIP_LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2014, Al Sweigart +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the {organization} nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/LICENSES/PYUPGRADE_LICENSE b/LICENSES/PYUPGRADE_LICENSE index 522fbe20b8991..edeac73dade04 100644 --- a/LICENSES/PYUPGRADE_LICENSE +++ b/LICENSES/PYUPGRADE_LICENSE @@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. +THE SOFTWARE. 
\ No newline at end of file diff --git a/LICENSES/SAS7BDAT_LICENSE b/LICENSES/SAS7BDAT_LICENSE index 8fbf194013e93..94b7fe934e85c 100644 --- a/LICENSES/SAS7BDAT_LICENSE +++ b/LICENSES/SAS7BDAT_LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2015 Jared Hobbs +Copyright (c) 2015-2019 Jared Hobbs Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. +SOFTWARE. \ No newline at end of file diff --git a/LICENSES/SCIPY_LICENSE b/LICENSES/SCIPY_LICENSE deleted file mode 100644 index d887ce5f9890f..0000000000000 --- a/LICENSES/SCIPY_LICENSE +++ /dev/null @@ -1,31 +0,0 @@ -Copyright (c) 2001, 2002 Enthought, Inc. -All rights reserved. - -Copyright (c) 2003-2012 SciPy Developers. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - a. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - b. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - c. Neither the name of Enthought nor the names of the SciPy Developers - may be used to endorse or promote products derived from this software - without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -DAMAGE. - diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE index a905fb017d813..58fc9dfc0a35b 100644 --- a/LICENSES/ULTRAJSON_LICENSE +++ b/LICENSES/ULTRAJSON_LICENSE @@ -1,21 +1,22 @@ -Copyright (c) 2011-2013, ESN Social Software AB and Jonas Tarnstrom +Developed by ESN, an Electronic Arts Inc. studio. +Copyright (c) 2014, Electronic Arts Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- * Neither the name of the ESN Social Software AB nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of ESN, Electronic Arts Inc. nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE +DISCLAIMED. IN NO EVENT SHALL ELECTRONIC ARTS INC. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND @@ -23,12 +24,91 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +---- Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. -Numeric decoder derived from TCL library -http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms + Copyright 2005, 2006, 2007 + Nick Galbreath -- nickg [at] modp [dot] com + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + Neither the name of the modp.com nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + This is the standard "new" BSD license: + http://www.opensource.org/licenses/bsd-license.php + +https://github.com/client9/stringencoders/blob/cfd5c1507325ae497ea9bacdacba12c0ffd79d30/COPYING + +---- + +Numeric decoder derived from from TCL library +https://opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. * Copyright (c) 1994 Sun Microsystems, Inc. + + This software is copyrighted by the Regents of the University of + California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState + Corporation and other parties. The following terms apply to all files + associated with the software unless explicitly disclaimed in + individual files. + + The authors hereby grant permission to use, copy, modify, distribute, + and license this software and its documentation for any purpose, provided + that existing copyright notices are retained in all copies and that this + notice is included verbatim in any distributions. No written agreement, + license, or royalty fee is required for any of the authorized uses. + Modifications to this software may be copyrighted by their authors + and need not follow the licensing terms described here, provided that + the new terms are clearly indicated on the first page of each file where + they apply. + + IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY + FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES + ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY + DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + + THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE + IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE + NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR + MODIFICATIONS. + + GOVERNMENT USE: If you are acquiring this software on behalf of the + U.S. government, the Government shall have only "Restricted Rights" + in the software and related documentation as defined in the Federal + Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you + are acquiring the software on behalf of the Department of Defense, the + software shall be classified as "Commercial Computer Software" and the + Government shall have only "Restricted Rights" as defined in Clause + 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the + authors grant the U.S. Government and others acting in its behalf + permission to use and distribute the software in accordance with the + terms specified in this license. \ No newline at end of file diff --git a/LICENSES/XARRAY_LICENSE b/LICENSES/XARRAY_LICENSE deleted file mode 100644 index 6bafeb9d3d80e..0000000000000 --- a/LICENSES/XARRAY_LICENSE +++ /dev/null @@ -1,195 +0,0 @@ -Copyright 2014-2019, xarray Developers - --------------------------------------------------------------------------------- - -Apache License -Version 2.0, January 2004 -http://www.apache.org/licenses/ - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. - -"License" shall mean the terms and conditions for use, reproduction, and -distribution as defined by Sections 1 through 9 of this document. - -"Licensor" shall mean the copyright owner or entity authorized by the copyright -owner that is granting the License. 
- -"Legal Entity" shall mean the union of the acting entity and all other entities -that control, are controlled by, or are under common control with that entity. -For the purposes of this definition, "control" means (i) the power, direct or -indirect, to cause the direction or management of such entity, whether by -contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the -outstanding shares, or (iii) beneficial ownership of such entity. - -"You" (or "Your") shall mean an individual or Legal Entity exercising -permissions granted by this License. - -"Source" form shall mean the preferred form for making modifications, including -but not limited to software source code, documentation source, and configuration -files. - -"Object" form shall mean any form resulting from mechanical transformation or -translation of a Source form, including but not limited to compiled object code, -generated documentation, and conversions to other media types. - -"Work" shall mean the work of authorship, whether in Source or Object form, made -available under the License, as indicated by a copyright notice that is included -in or attached to the work (an example is provided in the Appendix below). - -"Derivative Works" shall mean any work, whether in Source or Object form, that -is based on (or derived from) the Work and for which the editorial revisions, -annotations, elaborations, or other modifications represent, as a whole, an -original work of authorship. For the purposes of this License, Derivative Works -shall not include works that remain separable from, or merely link (or bind by -name) to the interfaces of, the Work and Derivative Works thereof. - -"Contribution" shall mean any work of authorship, including the original version -of the Work and any modifications or additions to that Work or Derivative Works -thereof, that is intentionally submitted to Licensor for inclusion in the Work -by the copyright owner or by an individual or Legal Entity authorized to submit -on behalf of the copyright owner. For the purposes of this definition, -"submitted" means any form of electronic, verbal, or written communication sent -to the Licensor or its representatives, including but not limited to -communication on electronic mailing lists, source code control systems, and -issue tracking systems that are managed by, or on behalf of, the Licensor for -the purpose of discussing and improving the Work, but excluding communication -that is conspicuously marked or otherwise designated in writing by the copyright -owner as "Not a Contribution." - -"Contributor" shall mean Licensor and any individual or Legal Entity on behalf -of whom a Contribution has been received by Licensor and subsequently -incorporated within the Work. - -2. Grant of Copyright License. - -Subject to the terms and conditions of this License, each Contributor hereby -grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, -irrevocable copyright license to reproduce, prepare Derivative Works of, -publicly display, publicly perform, sublicense, and distribute the Work and such -Derivative Works in Source or Object form. - -3. Grant of Patent License. 
- -Subject to the terms and conditions of this License, each Contributor hereby -grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, -irrevocable (except as stated in this section) patent license to make, have -made, use, offer to sell, sell, import, and otherwise transfer the Work, where -such license applies only to those patent claims licensable by such Contributor -that are necessarily infringed by their Contribution(s) alone or by combination -of their Contribution(s) with the Work to which such Contribution(s) was -submitted. If You institute patent litigation against any entity (including a -cross-claim or counterclaim in a lawsuit) alleging that the Work or a -Contribution incorporated within the Work constitutes direct or contributory -patent infringement, then any patent licenses granted to You under this License -for that Work shall terminate as of the date such litigation is filed. - -4. Redistribution. - -You may reproduce and distribute copies of the Work or Derivative Works thereof -in any medium, with or without modifications, and in Source or Object form, -provided that You meet the following conditions: - -You must give any other recipients of the Work or Derivative Works a copy of -this License; and -You must cause any modified files to carry prominent notices stating that You -changed the files; and -You must retain, in the Source form of any Derivative Works that You distribute, -all copyright, patent, trademark, and attribution notices from the Source form -of the Work, excluding those notices that do not pertain to any part of the -Derivative Works; and -If the Work includes a "NOTICE" text file as part of its distribution, then any -Derivative Works that You distribute must include a readable copy of the -attribution notices contained within such NOTICE file, excluding those notices -that do not pertain to any part of the Derivative Works, in at least one of the -following places: within a NOTICE text file distributed as part of the -Derivative Works; within the Source form or documentation, if provided along -with the Derivative Works; or, within a display generated by the Derivative -Works, if and wherever such third-party notices normally appear. The contents of -the NOTICE file are for informational purposes only and do not modify the -License. You may add Your own attribution notices within Derivative Works that -You distribute, alongside or as an addendum to the NOTICE text from the Work, -provided that such additional attribution notices cannot be construed as -modifying the License. -You may add Your own copyright statement to Your modifications and may provide -additional or different license terms and conditions for use, reproduction, or -distribution of Your modifications, or for any such Derivative Works as a whole, -provided Your use, reproduction, and distribution of the Work otherwise complies -with the conditions stated in this License. - -5. Submission of Contributions. - -Unless You explicitly state otherwise, any Contribution intentionally submitted -for inclusion in the Work by You to the Licensor shall be under the terms and -conditions of this License, without any additional terms or conditions. -Notwithstanding the above, nothing herein shall supersede or modify the terms of -any separate license agreement you may have executed with Licensor regarding -such Contributions. - -6. Trademarks. 
- -This License does not grant permission to use the trade names, trademarks, -service marks, or product names of the Licensor, except as required for -reasonable and customary use in describing the origin of the Work and -reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. - -Unless required by applicable law or agreed to in writing, Licensor provides the -Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, -including, without limitation, any warranties or conditions of TITLE, -NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are -solely responsible for determining the appropriateness of using or -redistributing the Work and assume any risks associated with Your exercise of -permissions under this License. - -8. Limitation of Liability. - -In no event and under no legal theory, whether in tort (including negligence), -contract, or otherwise, unless required by applicable law (such as deliberate -and grossly negligent acts) or agreed to in writing, shall any Contributor be -liable to You for damages, including any direct, indirect, special, incidental, -or consequential damages of any character arising as a result of this License or -out of the use or inability to use the Work (including but not limited to -damages for loss of goodwill, work stoppage, computer failure or malfunction, or -any and all other commercial damages or losses), even if such Contributor has -been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. - -While redistributing the Work or Derivative Works thereof, You may choose to -offer, and charge a fee for, acceptance of support, warranty, indemnity, or -other liability obligations and/or rights consistent with this License. However, -in accepting such obligations, You may act only on Your own behalf and on Your -sole responsibility, not on behalf of any other Contributor, and only if You -agree to indemnify, defend, and hold each Contributor harmless for any liability -incurred by, or claims asserted against, such Contributor by reason of your -accepting any such warranty or additional liability. - -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work - -To apply the Apache License to your work, attach the following boilerplate -notice, with the fields enclosed by brackets "[]" replaced with your own -identifying information. (Don't include the brackets!) The text should be -enclosed in the appropriate comment syntax for the file format. We also -recommend that a file or class name and description of purpose be included on -the same "printed page" as the copyright notice for easier identification within -third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in index d2b1b8cb887bc..c59151f340545 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,3 @@ -include RELEASE.md -include versioneer.py - graft doc prune doc/build @@ -10,6 +7,7 @@ graft pandas global-exclude *.bz2 global-exclude *.csv +global-exclude *.data global-exclude *.dta global-exclude *.feather global-exclude *.tar @@ -18,9 +16,12 @@ global-exclude *.h5 global-exclude *.html global-exclude *.json global-exclude *.jsonl +global-exclude *.kml global-exclude *.msgpack global-exclude *.pdf +global-exclude *.parquet global-exclude *.pickle +global-exclude *.pkl global-exclude *.png global-exclude *.pptx global-exclude *.ods @@ -29,12 +30,15 @@ global-exclude *.orc global-exclude *.sas7bdat global-exclude *.sav global-exclude *.so +global-exclude *.txt global-exclude *.xls global-exclude *.xlsb global-exclude *.xlsm global-exclude *.xlsx global-exclude *.xpt global-exclude *.cpt +global-exclude *.xml +global-exclude *.xsl global-exclude *.xz global-exclude *.zip global-exclude *.zst @@ -57,4 +61,9 @@ prune pandas/tests/io/parser/data # Selectively re-add *.cxx files that were excluded above graft pandas/_libs/src -graft pandas/_libs/tslibs/src +graft pandas/_libs/include + +# Include cibw script in sdist since it's needed for building wheels +include scripts/cibw_before_build.sh +include scripts/cibw_before_build_windows.sh +include scripts/cibw_before_test_windows.sh diff --git a/Makefile b/Makefile deleted file mode 100644 index c0aa685ed47ac..0000000000000 --- a/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -.PHONY : develop build clean clean_pyc doc lint-diff black test-scripts - -all: develop - -clean: - -python setup.py clean - -clean_pyc: - -find . -name '*.py[co]' -exec rm {} \; - -build: clean_pyc - python setup.py build_ext - -lint-diff: - git diff upstream/main --name-only -- "*.py" | xargs flake8 - -black: - black . - -develop: build - python -m pip install --no-build-isolation -e . - -doc: - -rm -rf doc/build doc/source/generated - cd doc; \ - python make.py clean; \ - python make.py html - -test-scripts: - pytest scripts diff --git a/README.md b/README.md index aaf63ead9c416..1a273fdb896c5 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,18 @@ -
-  <img src="https://pandas.pydata.org/static/img/pandas.svg"><br>
-</div>
+ + + Pandas Logo + ----------------- # pandas: powerful Python data analysis toolkit -[![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/) -[![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/) -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) -[![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/) -[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE) -[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=main)](https://codecov.io/gh/pandas-dev/pandas) -[![Downloads](https://static.pepy.tech/personalized-badge/pandas?period=month&units=international_system&left_color=black&right_color=orange&left_text=PyPI%20downloads%20per%20month)](https://pepy.tech/project/pandas) -[![Gitter](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas) -[![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) + +| | | +| --- | --- | +| Testing | [![CI - Test](https://github.com/pandas-dev/pandas/actions/workflows/unit-tests.yml/badge.svg)](https://github.com/pandas-dev/pandas/actions/workflows/unit-tests.yml) [![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=main)](https://codecov.io/gh/pandas-dev/pandas) | +| Package | [![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/) [![PyPI Downloads](https://img.shields.io/pypi/dm/pandas.svg?label=PyPI%20downloads)](https://pypi.org/project/pandas/) [![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/conda-forge/pandas) [![Conda Downloads](https://img.shields.io/conda/dn/conda-forge/pandas.svg?label=Conda%20downloads)](https://anaconda.org/conda-forge/pandas) | +| Meta | [![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](https://numfocus.org) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134) [![License - BSD 3-Clause](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE) [![Slack](https://img.shields.io/badge/join_Slack-information-brightgreen.svg?logo=slack)](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack) | + ## What is it? @@ -27,6 +24,19 @@ the broader goal of becoming **the most powerful and flexible open source data analysis / manipulation tool available in any language**. It is already well on its way towards this goal. 
+## Table of Contents + +- [Main Features](#main-features) +- [Where to get it](#where-to-get-it) +- [Dependencies](#dependencies) +- [Installation from sources](#installation-from-sources) +- [License](#license) +- [Documentation](#documentation) +- [Background](#background) +- [Getting Help](#getting-help) +- [Discussion and Development](#discussion-and-development) +- [Contributing to pandas](#contributing-to-pandas) + ## Main Features Here are just a few of the things that pandas does well: @@ -86,11 +96,11 @@ The source code is currently hosted on GitHub at: https://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python -Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/). +Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://anaconda.org/conda-forge/pandas). ```sh # conda -conda install pandas +conda install -c conda-forge pandas ``` ```sh @@ -98,6 +108,10 @@ conda install pandas pip install pandas ``` +The list of changes to pandas between each release can be found +[here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full +details, see the commit logs at https://github.com/pandas-dev/pandas. + ## Dependencies - [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org) - [python-dateutil - Provides powerful extensions to the standard datetime module](https://dateutil.readthedocs.io/en/stable/index.html) @@ -117,31 +131,23 @@ In the `pandas` directory (same one where you found this file after cloning the git repo), execute: ```sh -python setup.py install +pip install . ``` or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_install/#install-editable): ```sh -python -m pip install -e . --no-build-isolation --no-use-pep517 -``` - -If you have `make`, you can also use `make develop` to run the same command. - -or alternatively - -```sh -python setup.py develop +python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true ``` -See the full instructions for [installing from source](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-from-source). +See the full instructions for [installing from source](https://pandas.pydata.org/docs/dev/development/contributing_environment.html). ## License [BSD 3](LICENSE) ## Documentation -The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable +The official documentation is hosted on [PyData.org](https://pandas.pydata.org/pandas-docs/stable/). ## Background Work on ``pandas`` started at [AQR](https://www.aqr.com/) (a quantitative hedge fund) in 2008 and @@ -153,9 +159,17 @@ For usage questions, the best place to go to is [StackOverflow](https://stackove Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata). ## Discussion and Development -Most development discussions take place on GitHub in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. +Most development discussions take place on GitHub in this repo, via the [GitHub issue tracker](https://github.com/pandas-dev/pandas/issues). 
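After either of the source-install commands shown in the installation hunks above, it is worth confirming that the development build is the one Python actually imports. A minimal sketch using pandas' own reporting utilities (`pd.__version__` and `pd.show_versions()` are existing APIs; the `.dev` comment describes typical, not guaranteed, output):

```python
# Sanity-check a from-source install: import pandas and report
# the versions of pandas and its installed dependencies.
import pandas as pd

print(pd.__version__)  # editable builds usually carry a ".dev" suffix
pd.show_versions()     # dumps pandas, Python, and dependency versions
```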
+ +Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Slack channel](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack) is available for quick development related questions. + +There are also frequent [community meetings](https://pandas.pydata.org/docs/dev/development/community.html#community-meeting) for project maintainers open to the community as well as monthly [new contributor meetings](https://pandas.pydata.org/docs/dev/development/community.html#new-contributor-meeting) to help support new contributors. -## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) +Additional information on the communication channels can be found on the [contributor community](https://pandas.pydata.org/docs/development/community.html) page. + +## Contributing to pandas + +[![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. @@ -167,6 +181,10 @@ You can also triage issues which may include reproducing bug reports, or asking Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it! -Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). +Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Slack](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack). As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/.github/blob/master/CODE_OF_CONDUCT.md) + +
+ +[Go to Top](#table-of-contents) diff --git a/RELEASE.md b/RELEASE.md deleted file mode 100644 index 344a097a3e81e..0000000000000 --- a/RELEASE.md +++ /dev/null @@ -1,6 +0,0 @@ -Release Notes -============= - -The list of changes to pandas between each release can be found -[here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full -details, see the commit logs at https://github.com/pandas-dev/pandas. diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 46640505a4c84..30c692115eab1 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -29,7 +29,7 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - "pythons": ["3.8"], + "pythons": ["3.10"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty @@ -41,8 +41,8 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). "matrix": { - "numpy": [], - "Cython": ["0.29.30"], + "pip+build": [], + "Cython": ["3.0"], "matplotlib": [], "sqlalchemy": [], "scipy": [], @@ -54,11 +54,13 @@ "openpyxl": [], "xlsxwriter": [], "xlrd": [], - "xlwt": [], "odfpy": [], "jinja2": [], + "meson": [], + "meson-python": [], + "python-build": [], }, - "conda_channels": ["defaults", "conda-forge"], + "conda_channels": ["conda-forge"], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. @@ -126,6 +128,5 @@ "regression_thresholds": { }, "build_command": - ["python setup.py build -j4", - "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"], + ["python -m build -Cbuilddir=builddir --wheel --outdir {build_cache_dir} {build_dir}"] } diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 0008a589ca71f..933e8fbc175d8 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -4,8 +4,6 @@ import pandas as pd -from .pandas_vb_common import tm - for imp in ["pandas.util", "pandas.tools.hashing"]: try: hashing = import_module(imp) @@ -15,15 +13,15 @@ class Factorize: - params = [ [True, False], [True, False], [ - "int", - "uint", - "float", + "int64", + "uint64", + "float64", "object", + "object_str", "datetime64[ns]", "datetime64[ns, tz]", "Int64", @@ -35,27 +33,27 @@ class Factorize: def setup(self, unique, sort, dtype): N = 10**5 - string_index = tm.makeStringIndex(N) - string_arrow = None - if dtype == "string[pyarrow]": - try: - string_arrow = pd.array(string_index, dtype="string[pyarrow]") - except ImportError: - raise NotImplementedError - - data = { - "int": pd.Index(np.arange(N), dtype="int64"), - "uint": pd.Index(np.arange(N), dtype="uint64"), - "float": pd.Index(np.random.randn(N), dtype="float64"), - "object": string_index, - "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), - "datetime64[ns, tz]": pd.date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" - ), - "Int64": pd.array(np.arange(N), dtype="Int64"), - "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"), - "string[pyarrow]": string_arrow, - }[dtype] + + if dtype in ["int64", "uint64", "Int64", "object"]: + data = pd.Index(np.arange(N), dtype=dtype) + elif dtype == "float64": + data = pd.Index(np.random.randn(N), dtype=dtype) + elif dtype == "boolean": + data = 
pd.array(np.random.randint(0, 2, N), dtype=dtype) + elif dtype == "datetime64[ns]": + data = pd.date_range("2011-01-01", freq="h", periods=N) + elif dtype == "datetime64[ns, tz]": + data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo") + elif dtype == "object_str": + data = pd.Index([f"i-{i}" for i in range(N)], dtype=object) + elif dtype == "string[pyarrow]": + data = pd.array( + pd.Index([f"i-{i}" for i in range(N)], dtype=object), + dtype="string[pyarrow]", + ) + else: + raise NotImplementedError + if not unique: data = data.repeat(5) self.data = data @@ -63,28 +61,43 @@ def setup(self, unique, sort, dtype): def time_factorize(self, unique, sort, dtype): pd.factorize(self.data, sort=sort) + def peakmem_factorize(self, unique, sort, dtype): + pd.factorize(self.data, sort=sort) -class Duplicated: +class Duplicated: params = [ [True, False], ["first", "last", False], - ["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"], + [ + "int64", + "uint64", + "float64", + "string", + "datetime64[ns]", + "datetime64[ns, tz]", + "timestamp[ms][pyarrow]", + "duration[s][pyarrow]", + ], ] param_names = ["unique", "keep", "dtype"] def setup(self, unique, keep, dtype): N = 10**5 - data = { - "int": pd.Index(np.arange(N), dtype="int64"), - "uint": pd.Index(np.arange(N), dtype="uint64"), - "float": pd.Index(np.random.randn(N), dtype="float64"), - "string": tm.makeStringIndex(N), - "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), - "datetime64[ns, tz]": pd.date_range( - "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" - ), - }[dtype] + if dtype in ["int64", "uint64"]: + data = pd.Index(np.arange(N), dtype=dtype) + elif dtype == "float64": + data = pd.Index(np.random.randn(N), dtype="float64") + elif dtype == "string": + data = pd.Index([f"i-{i}" for i in range(N)], dtype=object) + elif dtype == "datetime64[ns]": + data = pd.date_range("2011-01-01", freq="h", periods=N) + elif dtype == "datetime64[ns, tz]": + data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo") + elif dtype in ["timestamp[ms][pyarrow]", "duration[s][pyarrow]"]: + data = pd.Index(np.arange(N), dtype=dtype) + else: + raise NotImplementedError if not unique: data = data.repeat(5) self.idx = data @@ -95,6 +108,28 @@ def time_duplicated(self, unique, keep, dtype): self.idx.duplicated(keep=keep) +class DuplicatedMaskedArray: + params = [ + [True, False], + ["first", "last", False], + ["Int64", "Float64"], + ] + param_names = ["unique", "keep", "dtype"] + + def setup(self, unique, keep, dtype): + N = 10**5 + data = pd.Series(np.arange(N), dtype=dtype) + data[list(range(1, N, 100))] = pd.NA + if not unique: + data = data.repeat(5) + self.ser = data + # cache is_unique + self.ser.is_unique + + def time_duplicated(self, unique, keep, dtype): + self.ser.duplicated(keep=keep) + + class Hashing: def setup_cache(self): N = 10**5 @@ -102,7 +137,9 @@ def setup_cache(self): df = pd.DataFrame( { "strings": pd.Series( - tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N)) + pd.Index([f"i-{i}" for i in range(10000)], dtype=object).take( + np.random.randint(0, 10000, size=N) + ) ), "floats": np.random.randn(N), "ints": np.arange(N), @@ -140,21 +177,22 @@ class Quantile: params = [ [0, 0.5, 1], ["linear", "nearest", "lower", "higher", "midpoint"], - ["float", "int", "uint"], + ["float64", "int64", "uint64"], ] param_names = ["quantile", "interpolation", "dtype"] def setup(self, quantile, interpolation, dtype): N = 10**5 - data = { - "int": np.arange(N), - "uint": 
np.arange(N).astype(np.uint64), - "float": np.random.randn(N), - } - self.idx = pd.Series(data[dtype].repeat(5)) + if dtype in ["int64", "uint64"]: + data = np.arange(N, dtype=dtype) + elif dtype == "float64": + data = np.random.randn(N) + else: + raise NotImplementedError + self.ser = pd.Series(data.repeat(5)) def time_quantile(self, quantile, interpolation, dtype): - self.idx.quantile(quantile, interpolation=interpolation) + self.ser.quantile(quantile, interpolation=interpolation) class SortIntegerArray: diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 16d90b9d23741..a17732c70c2c7 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -8,11 +8,8 @@ date_range, ) -from ..pandas_vb_common import tm - class IsIn: - params = [ "int64", "uint64", @@ -61,9 +58,12 @@ def setup(self, dtype): elif dtype in ["str", "string[python]", "string[pyarrow]"]: try: - self.series = Series(tm.makeStringIndex(N), dtype=dtype) - except ImportError: - raise NotImplementedError + self.series = Series( + Index([f"i-{i}" for i in range(N)], dtype=object)._values, + dtype=dtype, + ) + except ImportError as err: + raise NotImplementedError from err self.values = list(self.series[:2]) else: @@ -183,7 +183,6 @@ def time_isin(self, dtype, M, offset_factor): class IsInFloat64: - params = [ [np.float64, "Float64"], ["many_different_values", "few_different_values", "only_nans_values"], @@ -249,7 +248,7 @@ def setup(self, series_type, vals_type): elif series_type == "long": ser_vals = np.arange(N_many) elif series_type == "long_floats": - ser_vals = np.arange(N_many, dtype=np.float_) + ser_vals = np.arange(N_many, dtype=np.float64) self.series = Series(ser_vals).astype(object) @@ -260,7 +259,7 @@ def setup(self, series_type, vals_type): elif vals_type == "long": values = np.arange(N_many) elif vals_type == "long_floats": - values = np.arange(N_many, dtype=np.float_) + values = np.arange(N_many, dtype=np.float64) self.values = values.astype(object) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 496db66c78569..6b1f75187f887 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -6,13 +6,12 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, Timestamp, date_range, to_timedelta, ) -import pandas._testing as tm -from pandas.core.algorithms import checked_add_with_arr from .pandas_vb_common import numeric_dtypes @@ -106,6 +105,10 @@ def time_frame_op_with_series_axis0(self, opname): def time_frame_op_with_series_axis1(self, opname): getattr(operator, opname)(self.df, self.ser) + # exclude comparisons from the params for time_frame_op_with_series_axis1 + # since they do not do alignment so raise + time_frame_op_with_series_axis1.params = [params[0][6:]] + class FrameWithFrameWide: # Many-columns, mixed dtypes @@ -169,7 +172,6 @@ def time_op_same_blocks(self, op, shape): class Ops: - params = [[True, False], ["default", 1]] param_names = ["use_numexpr", "threads"] @@ -253,21 +255,24 @@ def time_frame_series_dot(self): class Timeseries: - params = [None, "US/Eastern"] param_names = ["tz"] def setup(self, tz): N = 10**6 halfway = (N // 2) - 1 - self.s = Series(date_range("20010101", periods=N, freq="T", tz=tz)) + self.s = Series(date_range("20010101", periods=N, freq="min", tz=tz)) self.ts = self.s[halfway] self.s2 = Series(date_range("20010101", periods=N, freq="s", tz=tz)) + self.ts_different_reso = Timestamp("2001-01-02", tz=tz) def 
time_series_timestamp_compare(self, tz): self.s <= self.ts + def time_series_timestamp_different_reso_compare(self, tz): + self.s <= self.ts_different_reso + def time_timestamp_series_compare(self, tz): self.ts >= self.s @@ -312,14 +317,15 @@ def time_categorical_op(self, op): class IndexArithmetic: - params = ["float", "int"] param_names = ["dtype"] def setup(self, dtype): N = 10**6 - indexes = {"int": "makeIntIndex", "float": "makeFloatIndex"} - self.index = getattr(tm, indexes[dtype])(N) + if dtype == "float": + self.index = Index(np.arange(N), dtype=np.float64) + elif dtype == "int": + self.index = Index(np.arange(N), dtype=np.int64) def time_add(self, dtype): self.index + 2 @@ -382,43 +388,6 @@ def time_add_timedeltas(self, df): df["timedelta"] + df["timedelta"] -class AddOverflowScalar: - - params = [1, -1, 0] - param_names = ["scalar"] - - def setup(self, scalar): - N = 10**6 - self.arr = np.arange(N) - - def time_add_overflow_scalar(self, scalar): - checked_add_with_arr(self.arr, scalar) - - -class AddOverflowArray: - def setup(self): - N = 10**6 - self.arr = np.arange(N) - self.arr_rev = np.arange(-N, 0) - self.arr_mixed = np.array([1, -1]).repeat(N / 2) - self.arr_nan_1 = np.random.choice([True, False], size=N) - self.arr_nan_2 = np.random.choice([True, False], size=N) - - def time_add_overflow_arr_rev(self): - checked_add_with_arr(self.arr, self.arr_rev) - - def time_add_overflow_arr_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) - - def time_add_overflow_b_mask_nan(self): - checked_add_with_arr(self.arr, self.arr_mixed, b_mask=self.arr_nan_1) - - def time_add_overflow_both_arg_nan(self): - checked_add_with_arr( - self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, b_mask=self.arr_nan_2 - ) - - hcal = pd.tseries.holiday.USFederalHolidayCalendar() # These offsets currently raise a NotImplementedError with .apply_index() non_apply = [ @@ -451,13 +420,12 @@ def time_add_overflow_both_arg_nan(self): class OffsetArrayArithmetic: - params = offsets param_names = ["offset"] def setup(self, offset): N = 10000 - rng = date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="min") self.rng = rng self.ser = Series(rng) @@ -476,7 +444,7 @@ class ApplyIndex: def setup(self, offset): N = 10000 - rng = date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="min") self.rng = rng def time_apply_index(self, offset): @@ -488,7 +456,7 @@ class BinaryOpsMultiIndex: param_names = ["func"] def setup(self, func): - array = date_range("20200101 00:00", "20200102 0:00", freq="S") + array = date_range("20200101 00:00", "20200102 0:00", freq="s") level_0_names = [str(i) for i in range(30)] index = pd.MultiIndex.from_product([level_0_names, array]) diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py index b58200911749e..953af5c868356 100644 --- a/asv_bench/benchmarks/array.py +++ b/asv_bench/benchmarks/array.py @@ -2,8 +2,6 @@ import pandas as pd -from .pandas_vb_common import tm - class BooleanArray: def setup(self): @@ -32,9 +30,10 @@ def time_from_float_array(self): class IntegerArray: def setup(self): - self.values_integer = np.array([1, 0, 1, 0]) - self.data = np.array([1, 2, 3, 4], dtype="int64") - self.mask = np.array([False, False, True, False]) + N = 250_000 + self.values_integer = np.tile(np.array([1, 0, 1, 0]), N) + self.data = np.tile(np.array([1, 2, 3, 4], dtype="int64"), N) + self.mask = np.tile(np.array([False, False, True, False]), N) 
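The constructor timed just below takes the two arrays built in `setup`: an integer values array and a boolean mask in which `True` marks a missing entry. A small standalone sketch of the same pattern, shrunk from the benchmark's tiled arrays for illustration:

```python
import numpy as np
import pandas as pd

data = np.array([1, 2, 3, 4], dtype="int64")
mask = np.array([False, False, True, False])  # True flags a missing slot

arr = pd.arrays.IntegerArray(data, mask)
print(arr)  # the masked position surfaces as <NA>; dtype is Int64
```

`pd.array(values, dtype="Int64")` reaches the same result through inference, which is the path `time_from_integer_array` measures.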
def time_constructor(self): pd.arrays.IntegerArray(self.data, self.mask) @@ -43,17 +42,43 @@ def time_from_integer_array(self): pd.array(self.values_integer, dtype="Int64") -class ArrowStringArray: +class IntervalArray: + def setup(self): + N = 10_000 + self.tuples = [(i, i + 1) for i in range(N)] + + def time_from_tuples(self): + pd.arrays.IntervalArray.from_tuples(self.tuples) + + +class StringArray: + def setup(self): + N = 100_000 + values = np.array([str(i) for i in range(N)], dtype=object) + self.values_obj = np.array(values, dtype="object") + self.values_str = np.array(values, dtype="U") + self.values_list = values.tolist() + + def time_from_np_object_array(self): + pd.array(self.values_obj, dtype="string") + def time_from_np_str_array(self): + pd.array(self.values_str, dtype="string") + + def time_from_list(self): + pd.array(self.values_list, dtype="string") + + +class ArrowStringArray: params = [False, True] param_names = ["multiple_chunks"] def setup(self, multiple_chunks): try: import pyarrow as pa - except ImportError: - raise NotImplementedError - strings = tm.rands_array(3, 10_000) + except ImportError as err: + raise NotImplementedError from err + strings = np.array([str(i) for i in range(10_000)], dtype=object) if multiple_chunks: chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)] self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks)) @@ -65,8 +90,51 @@ def time_setitem(self, multiple_chunks): self.array[i] = "foo" def time_setitem_list(self, multiple_chunks): - indexer = list(range(0, 50)) + list(range(-50, 0)) + indexer = list(range(50)) + list(range(-1000, 0, 50)) self.array[indexer] = ["foo"] * len(indexer) def time_setitem_slice(self, multiple_chunks): self.array[::10] = "foo" + + def time_setitem_null_slice(self, multiple_chunks): + self.array[:] = "foo" + + def time_tolist(self, multiple_chunks): + self.array.tolist() + + +class ArrowExtensionArray: + params = [ + [ + "boolean[pyarrow]", + "float64[pyarrow]", + "int64[pyarrow]", + "string[pyarrow]", + "timestamp[ns][pyarrow]", + ], + [False, True], + ] + param_names = ["dtype", "hasna"] + + def setup(self, dtype, hasna): + N = 100_000 + if dtype == "boolean[pyarrow]": + data = np.random.choice([True, False], N, replace=True) + elif dtype == "float64[pyarrow]": + data = np.random.randn(N) + elif dtype == "int64[pyarrow]": + data = np.arange(N) + elif dtype == "string[pyarrow]": + data = np.array([str(i) for i in range(N)], dtype=object) + elif dtype == "timestamp[ns][pyarrow]": + data = pd.date_range("2000-01-01", freq="s", periods=N) + else: + raise NotImplementedError + + arr = pd.array(data, dtype=dtype) + if hasna: + arr[::2] = pd.NA + self.arr = arr + + def time_to_numpy(self, dtype, hasna): + self.arr.to_numpy() diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index d4366c42f96aa..2a004113d1b91 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -15,14 +15,13 @@ def setup(self): self.cur_index = self.df.index def time_get_index(self): - self.foo = self.df.index + self.df.index def time_set_index(self): self.df.index = self.cur_index class SeriesArrayAttribute: - params = [["numeric", "object", "category", "datetime64", "datetime64tz"]] param_names = ["dtype"] diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index ff0b3b2fb651d..7d5b250c7b157 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -6,8 +6,6 @@ import 
pandas as pd -from .pandas_vb_common import tm - try: from pandas.api.types import union_categoricals except ImportError: @@ -26,7 +24,7 @@ def setup(self): self.codes = np.tile(range(len(self.categories)), N) self.datetimes = pd.Series( - pd.date_range("1995-01-01 00:00:00", periods=N / 10, freq="s") + pd.date_range("1995-01-01 00:00:00", periods=N // 10, freq="s") ) self.datetimes_with_nat = self.datetimes.copy() self.datetimes_with_nat.iloc[-1] = pd.NaT @@ -42,7 +40,8 @@ def time_regular(self): pd.Categorical(self.values, self.categories) def time_fastpath(self): - pd.Categorical(self.codes, self.cat_idx, fastpath=True) + dtype = pd.CategoricalDtype(categories=self.cat_idx) + pd.Categorical._simple_new(self.codes, dtype) def time_datetimes(self): pd.Categorical(self.datetimes) @@ -89,7 +88,7 @@ def setup(self): ) for col in ("int", "float", "timestamp"): - self.df[col + "_as_str"] = self.df[col].astype(str) + self.df[f"{col}_as_str"] = self.df[col].astype(str) for col in self.df.columns: self.df[col] = self.df[col].astype("category") @@ -143,7 +142,6 @@ def time_concat_non_overlapping_index(self): class ValueCounts: - params = [True, False] param_names = ["dropna"] @@ -189,7 +187,7 @@ def setup(self): N = 10**5 ncats = 15 - self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) + self.s_str = pd.Series(np.random.randint(0, ncats, size=N).astype(str)) self.s_str_cat = pd.Series(self.s_str, dtype="category") with warnings.catch_warnings(record=True): str_cat_type = pd.CategoricalDtype(set(self.s_str), ordered=True) @@ -242,7 +240,7 @@ def time_categorical_series_is_monotonic_decreasing(self): class Contains: def setup(self): N = 10**5 - self.ci = tm.makeCategoricalIndex(N) + self.ci = pd.CategoricalIndex(np.arange(N)) self.c = self.ci.values self.key = self.ci.categories[0] @@ -254,25 +252,22 @@ def time_categorical_contains(self): class CategoricalSlicing: - params = ["monotonic_incr", "monotonic_decr", "non_monotonic"] param_names = ["index"] def setup(self, index): N = 10**6 categories = ["a", "b", "c"] - values = [0] * N + [1] * N + [2] * N if index == "monotonic_incr": - self.data = pd.Categorical.from_codes(values, categories=categories) + codes = np.repeat([0, 1, 2], N) elif index == "monotonic_decr": - self.data = pd.Categorical.from_codes( - list(reversed(values)), categories=categories - ) + codes = np.repeat([2, 1, 0], N) elif index == "non_monotonic": - self.data = pd.Categorical.from_codes([0, 1, 2] * N, categories=categories) + codes = np.tile([0, 1, 2], N) else: raise ValueError(f"Invalid index param: {index}") + self.data = pd.Categorical.from_codes(codes, categories=categories) self.scalar = 10000 self.list = list(range(10000)) self.cat_scalar = "b" @@ -328,7 +323,7 @@ def time_sort_values(self): class SearchSorted: def setup(self): N = 10**5 - self.ci = tm.makeCategoricalIndex(N).sort_values() + self.ci = pd.CategoricalIndex(np.arange(N)).sort_values() self.c = self.ci.values self.key = self.ci.categories[1] diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index ef8b16f376d6a..77c9faf3d3a87 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -6,10 +6,9 @@ MultiIndex, Series, Timestamp, + date_range, ) -from .pandas_vb_common import tm - def no_change(arr): return arr @@ -48,7 +47,6 @@ def list_of_lists_with_none(arr): class SeriesConstructors: - param_names = ["data_fmt", "with_index", "dtype"] params = [ [ @@ -115,10 +113,33 @@ def time_dtindex_from_index_with_series(self): class 
MultiIndexConstructor: def setup(self): N = 10**4 - self.iterables = [tm.makeStringIndex(N), range(20)] + self.iterables = [Index([f"i-{i}" for i in range(N)], dtype=object), range(20)] def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) +class DatetimeIndexConstructor: + def setup(self): + N = 20_000 + dti = date_range("1900-01-01", periods=N) + + self.list_of_timestamps = dti.tolist() + self.list_of_dates = dti.date.tolist() + self.list_of_datetimes = dti.to_pydatetime().tolist() + self.list_of_str = dti.strftime("%Y-%m-%d").tolist() + + def time_from_list_of_timestamps(self): + DatetimeIndex(self.list_of_timestamps) + + def time_from_list_of_dates(self): + DatetimeIndex(self.list_of_dates) + + def time_from_list_of_datetimes(self): + DatetimeIndex(self.list_of_datetimes) + + def time_from_list_of_str(self): + DatetimeIndex(self.list_of_str) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 55f6be848aa13..7f3429b5e3882 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -3,7 +3,10 @@ import numpy as np import pandas as pd -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm from pandas.api.types import ( is_extension_array_dtype, @@ -24,7 +27,7 @@ class Dtypes: - params = _dtypes + list(map(lambda dt: dt.name, _dtypes)) + params = _dtypes + [dt.name for dt in _dtypes] param_names = ["dtype"] def time_pandas_dtype(self, dtype): @@ -49,7 +52,6 @@ def time_pandas_dtype_invalid(self, dtype): class SelectDtypes: - try: params = [ tm.ALL_INT_NUMPY_DTYPES @@ -74,8 +76,8 @@ class SelectDtypes: def setup(self, dtype): N, K = 5000, 50 - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) + self.index = Index([f"i-{i}" for i in range(N)], dtype=object) + self.columns = Index([f"i-{i}" for i in range(K)], dtype=object) def create_df(data): return DataFrame(data, index=self.index, columns=self.columns) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index b5442531e748a..656d16a910a9f 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -9,7 +9,6 @@ class Eval: - params = [["numexpr", "python"], [1, "all"]] param_names = ["engine", "threads"] @@ -45,7 +44,7 @@ class Query: def setup(self): N = 10**6 halfway = (N // 2) - 1 - index = pd.date_range("20010101", periods=N, freq="T") + index = pd.date_range("20010101", periods=N, freq="min") s = pd.Series(index) self.ts = s.iloc[halfway] self.df = pd.DataFrame({"a": np.random.randn(N), "dates": index}, index=index) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 20c0c0ea2f6fe..f938f7eb0d951 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -12,8 +12,6 @@ date_range, ) -from .pandas_vb_common import tm - try: from pandas.tseries.offsets import ( Hour, @@ -30,8 +28,8 @@ class FromDicts: def setup(self): N, K = 5000, 50 - self.index = tm.makeStringIndex(N) - self.columns = tm.makeStringIndex(K) + self.index = pd.Index([f"i-{i}" for i in range(N)], dtype=object) + self.columns = pd.Index([f"i-{i}" for i in range(K)], dtype=object) frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) self.data = frame.to_dict() self.dict_list = frame.to_dict(orient="records") @@ -74,7 +72,6 @@ def time_mi_series(self): class FromDictwithTimestamp: - params = [Nano(1), Hour(1)] 
param_names = ["offset"] @@ -89,7 +86,6 @@ def time_dict_with_timestamp_offsets(self, offset): class FromRecords: - params = [None, 1000] param_names = ["nrows"] @@ -116,7 +112,6 @@ def time_frame_from_ndarray(self): class FromLists: - goal_time = 0.2 def setup(self): @@ -129,7 +124,6 @@ def time_frame_from_lists(self): class FromRange: - goal_time = 0.2 def setup(self): @@ -162,7 +156,6 @@ def time_frame_from_scalar_ea_float64_na(self): class FromArrays: - goal_time = 0.2 def setup(self): @@ -205,21 +198,4 @@ def time_frame_from_arrays_sparse(self): ) -class From3rdParty: - # GH#44616 - - def setup(self): - try: - import torch - except ImportError: - raise NotImplementedError - - row = 700000 - col = 64 - self.val_tensor = torch.randn(row, col) - - def time_from_torch(self): - DataFrame(self.val_tensor) - - from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index a28e20a636ce2..6a2ab24df26fe 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + Index, MultiIndex, NaT, Series, @@ -14,7 +15,54 @@ timedelta_range, ) -from .pandas_vb_common import tm + +class AsType: + params = [ + [ + # from_dtype == to_dtype + ("Float64", "Float64"), + ("float64[pyarrow]", "float64[pyarrow]"), + # from non-EA to EA + ("float64", "Float64"), + ("float64", "float64[pyarrow]"), + # from EA to non-EA + ("Float64", "float64"), + ("float64[pyarrow]", "float64"), + # from EA to EA + ("Int64", "Float64"), + ("int64[pyarrow]", "float64[pyarrow]"), + ], + [False, True], + ] + param_names = ["from_to_dtypes", "copy"] + + def setup(self, from_to_dtypes, copy): + from_dtype = from_to_dtypes[0] + if from_dtype in ("float64", "Float64", "float64[pyarrow]"): + data = np.random.randn(100, 100) + elif from_dtype in ("int64", "Int64", "int64[pyarrow]"): + data = np.random.randint(0, 1000, (100, 100)) + else: + raise NotImplementedError + self.df = DataFrame(data, dtype=from_dtype) + + def time_astype(self, from_to_dtypes, copy): + self.df.astype(from_to_dtypes[1], copy=copy) + + +class Clip: + params = [ + ["float64", "Float64", "float64[pyarrow]"], + ] + param_names = ["dtype"] + + def setup(self, dtype): + data = np.random.randn(100_000, 10) + df = DataFrame(data, dtype=dtype) + self.df = df + + def time_clip(self, dtype): + self.df.clip(-1.0, 1.0) class GetNumericData: @@ -28,26 +76,6 @@ def time_frame_get_numeric_data(self): self.df._get_numeric_data() -class Lookup: - def setup(self): - self.df = DataFrame(np.random.randn(10000, 8), columns=list("abcdefgh")) - self.df["foo"] = "bar" - self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = list(self.df.columns) * 100 - self.row_labels_all = np.array( - list(self.df.index) * len(self.df.columns), dtype="object" - ) - self.col_labels_all = np.array( - list(self.df.columns) * len(self.df.index), dtype="object" - ) - - def time_frame_fancy_lookup(self): - self.df.lookup(self.row_labels, self.col_labels) - - def time_frame_fancy_lookup_all(self): - self.df.lookup(self.row_labels_all, self.col_labels_all) - - class Reindex: def setup(self): N = 10**3 @@ -131,12 +159,6 @@ def setup(self): def time_items(self): # (monitor no-copying behaviour) - if hasattr(self.df, "_item_cache"): - self.df._item_cache.clear() - for name, col in self.df.items(): - pass - - def time_items_cached(self): for name, col in self.df.items(): pass @@ -391,10 +413,8 @@ def time_isnull_obj(self): class 
Fillna:
-
     params = (
         [True, False],
-        ["pad", "bfill"],
         [
             "float64",
             "float32",
@@ -406,15 +426,15 @@ class Fillna:
             "timedelta64[ns]",
         ],
     )
-    param_names = ["inplace", "method", "dtype"]
+    param_names = ["inplace", "dtype"]
 
-    def setup(self, inplace, method, dtype):
+    def setup(self, inplace, dtype):
         N, M = 10000, 100
         if dtype in ("datetime64[ns]", "datetime64[ns, tz]", "timedelta64[ns]"):
             data = {
-                "datetime64[ns]": date_range("2011-01-01", freq="H", periods=N),
+                "datetime64[ns]": date_range("2011-01-01", freq="h", periods=N),
                 "datetime64[ns, tz]": date_range(
-                    "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
+                    "2011-01-01", freq="h", periods=N, tz="Asia/Tokyo"
                 ),
                 "timedelta64[ns]": timedelta_range(start="1 day", periods=N, freq="1D"),
             }
@@ -426,13 +446,19 @@ def setup(self, inplace, method, dtype):
         if dtype == "Int64":
             values = values.round()
         self.df = DataFrame(values, dtype=dtype)
+        self.fill_values = self.df.iloc[self.df.first_valid_index()].to_dict()
 
-    def time_frame_fillna(self, inplace, method, dtype):
-        self.df.fillna(inplace=inplace, method=method)
+    def time_fillna(self, inplace, dtype):
+        self.df.fillna(value=self.fill_values, inplace=inplace)
 
+    def time_ffill(self, inplace, dtype):
+        self.df.ffill(inplace=inplace)
 
-class Dropna:
+    def time_bfill(self, inplace, dtype):
+        self.df.bfill(inplace=inplace)
+
+
+class Dropna:
     params = (["all", "any"], [0, 1])
     param_names = ["how", "axis"]
 
@@ -451,8 +477,23 @@ def time_dropna_axis_mixed_dtypes(self, how, axis):
         self.df_mixed.dropna(how=how, axis=axis)
 
 
-class Count:
+class Isna:
+    params = ["float64", "Float64", "float64[pyarrow]"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        data = np.random.randn(10000, 1000)
+        # all-na columns
+        data[:, 600:800] = np.nan
+        # partial-na columns
+        data[4000:5000, 800:1000] = np.nan
+        self.df = DataFrame(data, dtype=dtype)
+
+    def time_isna(self, dtype):
+        self.df.isna()
+
+
+class Count:
     params = [0, 1]
     param_names = ["axis"]
 
@@ -464,20 +505,11 @@ def setup(self, axis):
         self.df_mixed = self.df.copy()
         self.df_mixed["foo"] = "bar"
 
-        self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index])
-        self.df.columns = MultiIndex.from_arrays([self.df.columns, self.df.columns])
-        self.df_mixed.index = MultiIndex.from_arrays(
-            [self.df_mixed.index, self.df_mixed.index]
-        )
-        self.df_mixed.columns = MultiIndex.from_arrays(
-            [self.df_mixed.columns, self.df_mixed.columns]
-        )
+    def time_count(self, axis):
+        self.df.count(axis=axis)
 
-    def time_count_level_multi(self, axis):
-        self.df.count(axis=axis, level=1)
-
-    def time_count_level_mixed_dtypes_multi(self, axis):
-        self.df_mixed.count(axis=axis, level=1)
+    def time_count_mixed_dtypes(self, axis):
+        self.df_mixed.count(axis=axis)
 
 
 class Apply:
@@ -497,8 +529,8 @@ def time_apply_axis_1(self):
     def time_apply_lambda_mean(self):
         self.df.apply(lambda x: x.mean())
 
-    def time_apply_np_mean(self):
-        self.df.apply(np.mean)
+    def time_apply_str_mean(self):
+        self.df.apply("mean")
 
     def time_apply_pass_thru(self):
         self.df.apply(lambda x: x)
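For readers following the Fillna rework above: pandas 2.x deprecates the fillna(method=...) spelling in favor of the dedicated ffill()/bfill() methods, which is why the benchmark now times them as separate cases. A minimal editorial sketch of the equivalence (not part of the patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1.0, np.nan, np.nan, 4.0]})
    ffilled = df.ffill()                  # replaces df.fillna(method="pad")
    bfilled = df.bfill()                  # replaces df.fillna(method="bfill")
    filled = df.fillna(value={"a": 0.0})  # value-based fillna is unchanged
    print(ffilled["a"].tolist())          # [1.0, 1.0, 1.0, 4.0]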
@@ -551,16 +583,10 @@ def time_frame_object_unequal(self):
 
 class Interpolate:
-
-    params = [None, "infer"]
-    param_names = ["downcast"]
-
-    def setup(self, downcast):
+    def setup(self):
         N = 10000
         # this is the worst case, where every column has NaNs.
         arr = np.random.randn(N, 100)
-        # NB: we need to set values in array, not in df.values, otherwise
-        # the benchmark will be misleading for ArrayManager
         arr[::2] = np.nan
 
         self.df = DataFrame(arr)
@@ -576,11 +602,11 @@ def setup(self, downcast):
         self.df2.loc[1::5, "A"] = np.nan
         self.df2.loc[1::5, "C"] = np.nan
 
-    def time_interpolate(self, downcast):
-        self.df.interpolate(downcast=downcast)
+    def time_interpolate(self):
+        self.df.interpolate()
 
-    def time_interpolate_some_good(self, downcast):
-        self.df2.interpolate(downcast=downcast)
+    def time_interpolate_some_good(self):
+        self.df2.interpolate()
 
 
 class Shift:
@@ -605,7 +631,8 @@ def time_frame_nunique(self):
 
 class SeriesNuniqueWithNan:
     def setup(self):
-        self.ser = Series(100000 * (100 * [np.nan] + list(range(100)))).astype(float)
+        values = 100 * [np.nan] + list(range(100))
+        self.ser = Series(np.tile(values, 10000), dtype=float)
 
     def time_series_nunique_nan(self):
         self.ser.nunique()
@@ -614,7 +641,7 @@ def time_series_nunique_nan(self):
 class Duplicated:
     def setup(self):
         n = 1 << 20
-        t = date_range("2015-01-01", freq="S", periods=(n // 64))
+        t = date_range("2015-01-01", freq="s", periods=(n // 64))
         xs = np.random.randn(n // 64).round(2)
         self.df = DataFrame(
             {
@@ -636,7 +663,6 @@ def time_frame_duplicated_subset(self):
 
 
 class XS:
-
     params = [0, 1]
     param_names = ["axis"]
 
@@ -649,7 +675,6 @@ def time_frame_xs(self, axis):
 
 
 class SortValues:
-
     params = [True, False]
     param_names = ["ascending"]
 
@@ -660,24 +685,37 @@ def time_frame_sort_values(self, ascending):
         self.df.sort_values(by="A", ascending=ascending)
 
 
-class SortIndexByColumns:
-    def setup(self):
+class SortMultiKey:
+    params = [True, False]
+    param_names = ["monotonic"]
+
+    def setup(self, monotonic):
         N = 10000
         K = 10
-        self.df = DataFrame(
+        df = DataFrame(
             {
-                "key1": tm.makeStringIndex(N).values.repeat(K),
-                "key2": tm.makeStringIndex(N).values.repeat(K),
+                "key1": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(
+                    K
+                ),
+                "key2": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(
+                    K
+                ),
                 "value": np.random.randn(N * K),
             }
         )
+        if monotonic:
+            df = df.sort_values(["key1", "key2"])
+        self.df_by_columns = df
+        self.df_by_index = df.set_index(["key1", "key2"])
 
-    def time_frame_sort_values_by_columns(self):
-        self.df.sort_values(by=["key1", "key2"])
+    def time_sort_values(self, monotonic):
+        self.df_by_columns.sort_values(by=["key1", "key2"])
 
+    def time_sort_index(self, monotonic):
+        self.df_by_index.sort_index()
 
-class Quantile:
+class Quantile:
     params = [0, 1]
     param_names = ["axis"]
 
@@ -717,7 +755,6 @@ def time_info(self):
 
 class NSort:
-
     params = ["first", "last", "all"]
     param_names = ["keep"]
 
@@ -767,4 +804,86 @@ def time_memory_usage_object_dtype(self):
         self.df2.memory_usage(deep=True)
 
 
+class Round:
+    def setup(self):
+        self.df = DataFrame(np.random.randn(10000, 10))
+        self.df_t = self.df.transpose(copy=True)
+
+    def time_round(self):
+        self.df.round()
+
+    def time_round_transposed(self):
+        self.df_t.round()
+
+    def peakmem_round(self):
+        self.df.round()
+
+    def peakmem_round_transposed(self):
+        self.df_t.round()
+
+
+class Where:
+    params = (
+        [True, False],
+        ["float64", "Float64", "float64[pyarrow]"],
+    )
+    param_names = ["inplace", "dtype"]
+
+    def setup(self, inplace, dtype):
+        self.df = DataFrame(np.random.randn(100_000, 10), dtype=dtype)
+        self.mask = self.df < 0
+
+    def time_where(self, inplace, dtype):
+        self.df.where(self.mask, other=0.0, inplace=inplace)
+
+
+class FindValidIndex:
+    param_names = ["dtype"]
+    params = [
+        ["float", "Float64",
"float64[pyarrow]"], + ] + + def setup(self, dtype): + df = DataFrame( + np.random.randn(100000, 2), + columns=list("AB"), + dtype=dtype, + ) + df.iloc[:100, 0] = None + df.iloc[:200, 1] = None + df.iloc[-100:, 0] = None + df.iloc[-200:, 1] = None + self.df = df + + def time_first_valid_index(self, dtype): + self.df.first_valid_index() + + def time_last_valid_index(self, dtype): + self.df.last_valid_index() + + +class Update: + def setup(self): + rng = np.random.default_rng() + self.df = DataFrame(rng.uniform(size=(1_000_000, 10))) + + idx = rng.choice(range(1_000_000), size=1_000_000, replace=False) + self.df_random = DataFrame(self.df, index=idx) + + idx = rng.choice(range(1_000_000), size=100_000, replace=False) + cols = rng.choice(range(10), size=2, replace=False) + self.df_sample = DataFrame( + rng.uniform(size=(100_000, 2)), index=idx, columns=cols + ) + + def time_to_update_big_frame_small_arg(self): + self.df.update(self.df_sample) + + def time_to_update_random_indices(self): + self.df_random.update(self.df_sample) + + def time_to_update_small_frame_big_arg(self): + self.df_sample.update(self.df) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 31654a5c75617..a0c4189c72d0e 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + Index, Series, date_range, factorize, @@ -12,8 +13,6 @@ ) from pandas.core.algorithms import take_nd -from .pandas_vb_common import tm - try: from pandas import ( rolling_kurt, @@ -34,7 +33,6 @@ except ImportError: from pandas import algos - from .pandas_vb_common import BaseIO # isort:skip @@ -87,12 +85,10 @@ def inner(*args, **kwargs): class ParallelGroupbyMethods: - params = ([2, 4, 8], ["count", "last", "max", "mean", "min", "prod", "sum", "var"]) param_names = ["threads", "method"] def setup(self, threads, method): - N = 10**6 ngroups = 10**3 df = DataFrame( @@ -119,12 +115,10 @@ def time_loop(self, threads, method): class ParallelGroups: - params = [2, 4, 8] param_names = ["threads"] def setup(self, threads): - size = 2**22 ngroups = 10**3 data = Series(np.random.randint(0, ngroups, size=size)) @@ -140,12 +134,10 @@ def time_get_groups(self, threads): class ParallelTake1D: - params = ["int64", "float64"] param_names = ["dtype"] def setup(self, dtype): - N = 10**6 df = DataFrame({"col": np.arange(N, dtype=dtype)}) indexer = np.arange(100, len(df) - 100) @@ -167,7 +159,6 @@ class ParallelKth: repeat = 5 def setup(self): - N = 10**7 k = 5 * 10**5 kwargs_list = [{"arr": np.random.randn(N)}, {"arr": np.random.randn(N)}] @@ -184,9 +175,8 @@ def time_kth_smallest(self): class ParallelDatetimeFields: def setup(self): - N = 10**6 - self.dti = date_range("1900-01-01", periods=N, freq="T") + self.dti = date_range("1900-01-01", periods=N, freq="min") self.period = self.dti.to_period("D") def time_datetime_field_year(self): @@ -220,7 +210,7 @@ def run(dti): def time_datetime_to_period(self): @test_parallel(num_threads=2) def run(dti): - dti.to_period("S") + dti.to_period("s") run(self.dti) @@ -233,12 +223,10 @@ def run(period): class ParallelRolling: - params = ["median", "mean", "min", "max", "var", "skew", "kurt", "std"] param_names = ["method"] def setup(self, method): - win = 100 arr = np.random.rand(100000) if hasattr(DataFrame, "rolling"): @@ -274,28 +262,28 @@ def time_rolling(self, method): class ParallelReadCSV(BaseIO): - number = 1 repeat = 5 params = ["float", "object", "datetime"] param_names = 
["dtype"] def setup(self, dtype): - rows = 10000 cols = 50 - data = { - "float": DataFrame(np.random.randn(rows, cols)), - "datetime": DataFrame( + if dtype == "float": + df = DataFrame(np.random.randn(rows, cols)) + elif dtype == "datetime": + df = DataFrame( np.random.randn(rows, cols), index=date_range("1/1/2000", periods=rows) - ), - "object": DataFrame( + ) + elif dtype == "object": + df = DataFrame( "foo", index=range(rows), columns=["object%03d" for _ in range(5)] - ), - } + ) + else: + raise NotImplementedError self.fname = f"__test_{dtype}__.csv" - df = data[dtype] df.to_csv(self.fname) @test_parallel(num_threads=2) @@ -309,15 +297,13 @@ def time_read_csv(self, dtype): class ParallelFactorize: - number = 1 repeat = 5 params = [2, 4, 8] param_names = ["threads"] def setup(self, threads): - - strings = tm.makeStringIndex(100000) + strings = Index([f"i-{i}" for i in range(100000)], dtype=object) @test_parallel(num_threads=threads) def parallel(): diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 2de1f25fceace..19c556dfe9d1f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -5,6 +5,7 @@ import numpy as np from pandas import ( + NA, Categorical, DataFrame, Index, @@ -13,10 +14,9 @@ Timestamp, date_range, period_range, + to_timedelta, ) -from .pandas_vb_common import tm - method_blocklist = { "object": { "diff", @@ -34,7 +34,6 @@ "pct_change", "min", "var", - "mad", "describe", "std", "quantile", @@ -51,12 +50,45 @@ "cummax", "pct_change", "var", - "mad", "describe", "std", }, } +# These aggregations don't have a kernel implemented for them yet +_numba_unsupported_methods = [ + "all", + "any", + "bfill", + "count", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "describe", + "diff", + "ffill", + "first", + "head", + "idxmax", + "idxmin", + "last", + "median", + "nunique", + "pct_change", + "prod", + "quantile", + "rank", + "sem", + "shift", + "size", + "skew", + "tail", + "unique", + "value_counts", +] + class ApplyDictReturn: def setup(self): @@ -70,7 +102,6 @@ def time_groupby_apply_dict_return(self): class Apply: - param_names = ["factor"] params = [4, 5] @@ -125,7 +156,6 @@ def time_groupby_apply_non_unique_unsorted_index(self): class Groups: - param_names = ["key"] params = ["int64_small", "int64_large", "object_small", "object_large"] @@ -135,10 +165,14 @@ def setup_cache(self): "int64_small": Series(np.random.randint(0, 100, size=size)), "int64_large": Series(np.random.randint(0, 10000, size=size)), "object_small": Series( - tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size)) + Index([f"i-{i}" for i in range(100)], dtype=object).take( + np.random.randint(0, 100, size=size) + ) ), "object_large": Series( - tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size)) + Index([f"i-{i}" for i in range(10000)], dtype=object).take( + np.random.randint(0, 10000, size=size) + ) ), } return data @@ -154,7 +188,6 @@ def time_series_indices(self, data, key): class GroupManyLabels: - params = [1, 1000] param_names = ["ncols"] @@ -169,7 +202,6 @@ def time_sum(self, ncols): class Nth: - param_names = ["dtype"] params = ["float32", "float64", "datetime", "object"] @@ -208,7 +240,7 @@ def time_series_nth(self, dtype): class DateAttributes: def setup(self): - rng = date_range("1/1/2000", "12/31/2005", freq="H") + rng = date_range("1/1/2000", "12/31/2005", freq="h") self.year, self.month, self.day = rng.year, rng.month, rng.day self.ts = Series(np.random.randn(len(rng)), index=rng) @@ 
@@ -301,16 +333,11 @@ def time_different_str_functions(self, df):
             {"value1": "mean", "value2": "var", "value3": "sum"}
         )
 
-    def time_different_numpy_functions(self, df):
-        df.groupby(["key1", "key2"]).agg(
-            {"value1": np.mean, "value2": np.var, "value3": np.sum}
-        )
+    def time_different_str_functions_multicol(self, df):
+        df.groupby(["key1", "key2"]).agg(["sum", "min", "max"])
 
-    def time_different_python_functions_multicol(self, df):
-        df.groupby(["key1", "key2"]).agg([sum, min, max])
-
-    def time_different_python_functions_singlecol(self, df):
-        df.groupby("key1").agg([sum, min, max])
+    def time_different_str_functions_singlecol(self, df):
+        df.groupby("key1").agg({"value1": "mean", "value2": "var", "value3": "sum"})
 
 
 class GroupStrings:
@@ -353,8 +380,8 @@ def time_cython_sum(self, df):
     def time_col_select_lambda_sum(self, df):
         df.groupby(["key1", "key2"])["data1"].agg(lambda x: x.values.sum())
 
-    def time_col_select_numpy_sum(self, df):
-        df.groupby(["key1", "key2"])["data1"].agg(np.sum)
+    def time_col_select_str_sum(self, df):
+        df.groupby(["key1", "key2"])["data1"].agg("sum")
 
 
 class Size:
@@ -380,7 +407,7 @@ def time_multi_size(self):
         self.df.groupby(["key1", "key2"]).size()
 
     def time_category_size(self):
-        self.draws.groupby(self.cats).size()
+        self.draws.groupby(self.cats, observed=True).size()
 
 
 class Shift:
@@ -395,7 +422,7 @@ def time_fill_value(self):
         self.df.groupby("g").shift(fill_value=99)
 
 
-class FillNA:
+class Fillna:
     def setup(self):
         N = 100
         self.df = DataFrame(
@@ -403,20 +430,19 @@ def setup(self):
         ).set_index("group")
 
     def time_df_ffill(self):
-        self.df.groupby("group").fillna(method="ffill")
+        self.df.groupby("group").ffill()
 
     def time_df_bfill(self):
-        self.df.groupby("group").fillna(method="bfill")
+        self.df.groupby("group").bfill()
 
     def time_srs_ffill(self):
-        self.df.groupby("group")["value"].fillna(method="ffill")
+        self.df.groupby("group")["value"].ffill()
 
     def time_srs_bfill(self):
-        self.df.groupby("group")["value"].fillna(method="bfill")
+        self.df.groupby("group")["value"].bfill()
 
 
 class GroupByMethods:
-
-    param_names = ["dtype", "method", "application", "ncols"]
+    param_names = ["dtype", "method", "application", "ncols", "engine"]
     params = [
         ["int", "int16", "float", "object", "datetime", "uint"],
@@ -436,7 +462,6 @@ class GroupByMethods:
             "first",
             "head",
             "last",
-            "mad",
             "max",
             "min",
             "median",
@@ -459,9 +484,10 @@ class GroupByMethods:
         ],
         ["direct", "transformation"],
         [1, 5],
+        ["cython", "numba"],
     ]
 
-    def setup(self, dtype, method, application, ncols):
+    def setup(self, dtype, method, application, ncols, engine):
         if method in method_blocklist.get(dtype, {}):
             raise NotImplementedError  # skip benchmark
@@ -480,9 +506,21 @@ def setup(self, dtype, method, application, ncols):
             # DataFrameGroupBy doesn't have these methods
             raise NotImplementedError
 
+        # Numba currently doesn't support
+        # multiple transform functions or strs for transform,
+        # grouping on multiple columns
+        # and we lack kernels for a bunch of methods
+        if (
+            (engine == "numba" and method in _numba_unsupported_methods)
+            or ncols > 1
+            or application == "transformation"
+            or dtype == "datetime"
+        ):
+            raise NotImplementedError
+
         if method == "describe":
             ngroups = 20
-        elif method in ["mad", "skew"]:
+        elif method == "skew":
             ngroups = 100
         else:
             ngroups = 1000
@@ -511,17 +549,30 @@ def setup(self, dtype, method, application, ncols):
         if len(cols) == 1:
             cols = cols[0]
 
+        # Not everything supports the engine keyword yet
+        kwargs = {}
+        if engine == "numba":
+            kwargs["engine"] = engine
+
         if application == "transformation":
-            self.as_group_method = lambda:
df.groupby("key")[cols].transform(method) - self.as_field_method = lambda: df.groupby(cols)["key"].transform(method) + self.as_group_method = lambda: df.groupby("key")[cols].transform( + method, **kwargs + ) + self.as_field_method = lambda: df.groupby(cols)["key"].transform( + method, **kwargs + ) else: - self.as_group_method = getattr(df.groupby("key")[cols], method) - self.as_field_method = getattr(df.groupby(cols)["key"], method) + self.as_group_method = partial( + getattr(df.groupby("key")[cols], method), **kwargs + ) + self.as_field_method = partial( + getattr(df.groupby(cols)["key"], method), **kwargs + ) - def time_dtype_as_group(self, dtype, method, application, ncols): + def time_dtype_as_group(self, dtype, method, application, ncols, engine): self.as_group_method() - def time_dtype_as_field(self, dtype, method, application, ncols): + def time_dtype_as_field(self, dtype, method, application, ncols, engine): self.as_field_method() @@ -540,6 +591,8 @@ class GroupByCythonAgg: "prod", "min", "max", + "idxmin", + "idxmax", "mean", "median", "var", @@ -560,32 +613,92 @@ def time_frame_agg(self, dtype, method): self.df.groupby("key").agg(method) -class Cumulative: +class GroupByNumbaAgg(GroupByCythonAgg): + """ + Benchmarks specifically targeting our numba aggregation algorithms + (using a big enough dataframe with simple key, so a large part of the + time is actually spent in the grouped aggregation). + """ + + def setup(self, dtype, method): + if method in _numba_unsupported_methods: + raise NotImplementedError + super().setup(dtype, method) + + def time_frame_agg(self, dtype, method): + self.df.groupby("key").agg(method, engine="numba") + + +class GroupByCythonAggEaDtypes: + """ + Benchmarks specifically targeting our cython aggregation algorithms + (using a big enough dataframe with simple key, so a large part of the + time is actually spent in the grouped aggregation). 
+ """ + param_names = ["dtype", "method"] + params = [ + ["Float64", "Int64", "Int32"], + [ + "sum", + "prod", + "min", + "max", + "mean", + "median", + "var", + "first", + "last", + "any", + "all", + ], + ] + + def setup(self, dtype, method): + N = 1_000_000 + df = DataFrame( + np.random.randint(0, high=100, size=(N, 10)), + columns=list("abcdefghij"), + dtype=dtype, + ) + df.loc[list(range(1, N, 5)), list("abcdefghij")] = NA + df["key"] = np.random.randint(0, 100, size=N) + self.df = df + + def time_frame_agg(self, dtype, method): + self.df.groupby("key").agg(method) + + +class Cumulative: + param_names = ["dtype", "method", "with_nans"] params = [ ["float64", "int64", "Float64", "Int64"], ["cummin", "cummax", "cumsum"], + [True, False], ] - def setup(self, dtype, method): + def setup(self, dtype, method, with_nans): + if with_nans and dtype == "int64": + raise NotImplementedError("Construction of df would raise") + N = 500_000 - vals = np.random.randint(-10, 10, (N, 5)) - null_vals = vals.astype(float, copy=True) - null_vals[::2, :] = np.nan - null_vals[::3, :] = np.nan - df = DataFrame(vals, columns=list("abcde"), dtype=dtype) - null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) keys = np.random.randint(0, 100, size=N) - df["key"] = keys - null_df["key"] = keys - self.df = df - self.null_df = null_df + vals = np.random.randint(-10, 10, (N, 5)) - def time_frame_transform(self, dtype, method): - self.df.groupby("key").transform(method) + if with_nans: + null_vals = vals.astype(float, copy=True) + null_vals[::2, :] = np.nan + null_vals[::3, :] = np.nan + df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) + df["key"] = keys + self.df = df + else: + df = DataFrame(vals, columns=list("abcde")).astype(dtype, copy=False) + df["key"] = keys + self.df = df - def time_frame_transform_many_nulls(self, dtype, method): - self.null_df.groupby("key").transform(method) + def time_frame_transform(self, dtype, method, with_nans): + self.df.groupby("key").transform(method) class RankWithTies: @@ -601,7 +714,7 @@ def setup(self, dtype, tie_method): if dtype == "datetime64": data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype) else: - data = np.array([1] * N, dtype=dtype) + data = np.ones(N, dtype=dtype) self.df = DataFrame({"values": data, "key": ["foo"] * N}) def time_rank_ties(self, dtype, tie_method): @@ -628,12 +741,8 @@ class String: ["str", "string[python]"], [ "sum", - "prod", "min", "max", - "mean", - "median", - "var", "first", "last", "any", @@ -644,7 +753,7 @@ class String: def setup(self, dtype, method): cols = list("abcdefghjkl") self.df = DataFrame( - np.random.randint(0, 100, size=(1_000_000, len(cols))), + np.random.randint(0, 100, size=(10_000, len(cols))), columns=cols, dtype=dtype, ) @@ -654,7 +763,10 @@ def time_str_func(self, dtype, method): class Categories: - def setup(self): + params = [True, False] + param_names = ["observed"] + + def setup(self, observed): N = 10**5 arr = np.random.random(N) data = {"a": Categorical(np.random.randint(10000, size=N)), "b": arr} @@ -672,23 +784,68 @@ def setup(self): } self.df_extra_cat = DataFrame(data) + def time_groupby_sort(self, observed): + self.df.groupby("a", observed=observed)["b"].count() + + def time_groupby_nosort(self, observed): + self.df.groupby("a", observed=observed, sort=False)["b"].count() + + def time_groupby_ordered_sort(self, observed): + self.df_ordered.groupby("a", observed=observed)["b"].count() + + def time_groupby_ordered_nosort(self, observed): + self.df_ordered.groupby("a", 
observed=observed, sort=False)["b"].count() + + def time_groupby_extra_cat_sort(self, observed): + self.df_extra_cat.groupby("a", observed=observed)["b"].count() + + def time_groupby_extra_cat_nosort(self, observed): + self.df_extra_cat.groupby("a", observed=observed, sort=False)["b"].count() + + +class MultipleCategories: + def setup(self): + N = 10**3 + arr = np.random.random(N) + data = { + "a1": Categorical(np.random.randint(10000, size=N)), + "a2": Categorical(np.random.randint(10000, size=N)), + "b": arr, + } + self.df = DataFrame(data) + data = { + "a1": Categorical(np.random.randint(10000, size=N), ordered=True), + "a2": Categorical(np.random.randint(10000, size=N), ordered=True), + "b": arr, + } + self.df_ordered = DataFrame(data) + data = { + "a1": Categorical(np.random.randint(100, size=N), categories=np.arange(N)), + "a2": Categorical(np.random.randint(100, size=N), categories=np.arange(N)), + "b": arr, + } + self.df_extra_cat = DataFrame(data) + def time_groupby_sort(self): - self.df.groupby("a")["b"].count() + self.df.groupby(["a1", "a2"], observed=False)["b"].count() def time_groupby_nosort(self): - self.df.groupby("a", sort=False)["b"].count() + self.df.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() def time_groupby_ordered_sort(self): - self.df_ordered.groupby("a")["b"].count() + self.df_ordered.groupby(["a1", "a2"], observed=False)["b"].count() def time_groupby_ordered_nosort(self): - self.df_ordered.groupby("a", sort=False)["b"].count() + self.df_ordered.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() def time_groupby_extra_cat_sort(self): - self.df_extra_cat.groupby("a")["b"].count() + self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].count() def time_groupby_extra_cat_nosort(self): - self.df_extra_cat.groupby("a", sort=False)["b"].count() + self.df_extra_cat.groupby(["a1", "a2"], observed=False, sort=False)["b"].count() + + def time_groupby_transform(self): + self.df_extra_cat.groupby(["a1", "a2"], observed=False)["b"].cumsum() class Datelike: @@ -734,12 +891,29 @@ def time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() +class SumTimeDelta: + # GH 20660 + def setup(self): + N = 10**4 + self.df = DataFrame( + np.random.randint(1000, 100000, (N, 100)), + index=np.random.randint(200, size=(N,)), + ).astype("timedelta64[ns]") + self.df_int = self.df.copy().astype("int64") + + def time_groupby_sum_timedelta(self): + self.df.groupby(lambda x: x).sum() + + def time_groupby_sum_int(self): + self.df_int.groupby(lambda x: x).sum() + + class Transform: def setup(self): n1 = 400 n2 = 250 index = MultiIndex( - levels=[np.arange(n1), tm.makeStringIndex(n2)], + levels=[np.arange(n1), Index([f"i-{i}" for i in range(n2)], dtype=object)], codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1], names=["lev1", "lev2"], ) @@ -778,8 +952,8 @@ def setup(self): def time_transform_lambda_max(self): self.df.groupby(level="lev1").transform(lambda x: max(x)) - def time_transform_ufunc_max(self): - self.df.groupby(level="lev1").transform(np.max) + def time_transform_str_max(self): + self.df.groupby(level="lev1").transform("max") def time_transform_lambda_max_tall(self): self.df_tall.groupby(level=0).transform(lambda x: np.max(x, axis=0)) @@ -810,7 +984,7 @@ def setup(self): self.df = DataFrame({"signal": np.random.rand(N)}) def time_transform_mean(self): - self.df["signal"].groupby(self.g).transform(np.mean) + self.df["signal"].groupby(self.g).transform("mean") class TransformNaN: @@ -826,7 +1000,6 @@ def time_first(self): class 
TransformEngine: - param_names = ["parallel"] params = [[True, False]] @@ -869,7 +1042,6 @@ def function(values): class AggEngine: - param_names = ["parallel"] params = [[True, False]] @@ -949,4 +1121,31 @@ def time_sample_weights(self): self.df.groupby(self.groups).sample(n=1, weights=self.weights) +class Resample: + # GH 28635 + def setup(self): + num_timedeltas = 20_000 + num_groups = 3 + + index = MultiIndex.from_product( + [ + np.arange(num_groups), + to_timedelta(np.arange(num_timedeltas), unit="s"), + ], + names=["groups", "timedeltas"], + ) + data = np.random.randint(0, 1000, size=(len(index))) + + self.df = DataFrame(data, index=index).reset_index("timedeltas") + self.df_multiindex = DataFrame(data, index=index) + + def time_resample(self): + self.df.groupby(level="groups").resample("10s", on="timedeltas").mean() + + def time_resample_multiindex(self): + self.df_multiindex.groupby(level="groups").resample( + "10s", level="timedeltas" + ).mean() + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py index d9a291dc27125..d2c5b4dfbef70 100644 --- a/asv_bench/benchmarks/hash_functions.py +++ b/asv_bench/benchmarks/hash_functions.py @@ -39,17 +39,31 @@ def time_unique(self, exponent): pd.unique(self.a2) -class NumericSeriesIndexing: +class Unique: + params = ["Int64", "Float64"] + param_names = ["dtype"] + + def setup(self, dtype): + self.ser = pd.Series(([1, pd.NA, 2] + list(range(100_000))) * 3, dtype=dtype) + self.ser_unique = pd.Series(list(range(300_000)) + [pd.NA], dtype=dtype) + + def time_unique_with_duplicates(self, exponent): + pd.unique(self.ser) + + def time_unique(self, exponent): + pd.unique(self.ser_unique) + +class NumericSeriesIndexing: params = [ - (pd.Int64Index, pd.UInt64Index, pd.Float64Index), + (np.int64, np.uint64, np.float64), (10**4, 10**5, 5 * 10**5, 10**6, 5 * 10**6), ] - param_names = ["index_dtype", "N"] + param_names = ["dtype", "N"] - def setup(self, index, N): - vals = np.array(list(range(55)) + [54] + list(range(55, N - 1))) - indices = index(vals) + def setup(self, dtype, N): + vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype) + indices = pd.Index(vals) self.data = pd.Series(np.arange(N), index=indices) def time_loc_slice(self, index, N): @@ -58,17 +72,16 @@ def time_loc_slice(self, index, N): class NumericSeriesIndexingShuffled: - params = [ - (pd.Int64Index, pd.UInt64Index, pd.Float64Index), + (np.int64, np.uint64, np.float64), (10**4, 10**5, 5 * 10**5, 10**6, 5 * 10**6), ] - param_names = ["index_dtype", "N"] + param_names = ["dtype", "N"] - def setup(self, index, N): - vals = np.array(list(range(55)) + [54] + list(range(55, N - 1))) + def setup(self, dtype, N): + vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype) np.random.shuffle(vals) - indices = index(vals) + indices = pd.Index(vals) self.data = pd.Series(np.arange(N), index=indices) def time_loc_slice(self, index, N): diff --git a/asv_bench/benchmarks/index_cached_properties.py b/asv_bench/benchmarks/index_cached_properties.py index 1a88bb7eef37a..d21bbe15c4cc8 100644 --- a/asv_bench/benchmarks/index_cached_properties.py +++ b/asv_bench/benchmarks/index_cached_properties.py @@ -25,14 +25,14 @@ def setup(self, index_type): N = 10**5 if index_type == "MultiIndex": self.idx = pd.MultiIndex.from_product( - [pd.date_range("1/1/2000", freq="T", periods=N // 2), ["a", "b"]] + [pd.date_range("1/1/2000", freq="min", periods=N // 2), ["a", "b"]] ) 
elif index_type == "DatetimeIndex": - self.idx = pd.date_range("1/1/2000", freq="T", periods=N) + self.idx = pd.date_range("1/1/2000", freq="min", periods=N) elif index_type == "Int64Index": - self.idx = pd.Index(range(N)) + self.idx = pd.Index(range(N), dtype="int64") elif index_type == "PeriodIndex": - self.idx = pd.period_range("1/1/2000", freq="T", periods=N) + self.idx = pd.period_range("1/1/2000", freq="min", periods=N) elif index_type == "RangeIndex": self.idx = pd.RangeIndex(start=0, stop=N) elif index_type == "IntervalIndex": @@ -40,9 +40,9 @@ def setup(self, index_type): elif index_type == "TimedeltaIndex": self.idx = pd.TimedeltaIndex(range(N)) elif index_type == "Float64Index": - self.idx = pd.Float64Index(range(N)) + self.idx = pd.Index(range(N), dtype="float64") elif index_type == "UInt64Index": - self.idx = pd.UInt64Index(range(N)) + self.idx = pd.Index(range(N), dtype="uint64") elif index_type == "CategoricalIndex": self.idx = pd.CategoricalIndex(range(N), range(N)) else: @@ -70,6 +70,3 @@ def time_engine(self, index_type): def time_inferred_type(self, index_type): self.idx.inferred_type - - def time_is_all_dates(self, index_type): - self.idx.is_all_dates diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index dab33f02c2cd9..9c1e9656503f7 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -4,7 +4,6 @@ from pandas import ( DatetimeIndex, - Float64Index, Index, IntervalIndex, MultiIndex, @@ -13,34 +12,41 @@ date_range, ) -from .pandas_vb_common import tm - class SetOperations: - params = ( - ["datetime", "date_string", "int", "strings"], + ["monotonic", "non_monotonic"], + ["datetime", "date_string", "int", "strings", "ea_int"], ["intersection", "union", "symmetric_difference"], ) - param_names = ["dtype", "method"] + param_names = ["index_structure", "dtype", "method"] - def setup(self, dtype, method): + def setup(self, index_structure, dtype, method): N = 10**5 - dates_left = date_range("1/1/2000", periods=N, freq="T") + dates_left = date_range("1/1/2000", periods=N, freq="min") fmt = "%Y-%m-%d %H:%M:%S" date_str_left = Index(dates_left.strftime(fmt)) int_left = Index(np.arange(N)) - str_left = tm.makeStringIndex(N) + ea_int_left = Index(np.arange(N), dtype="Int64") + str_left = Index([f"i-{i}" for i in range(N)], dtype=object) + data = { - "datetime": {"left": dates_left, "right": dates_left[:-1]}, - "date_string": {"left": date_str_left, "right": date_str_left[:-1]}, - "int": {"left": int_left, "right": int_left[:-1]}, - "strings": {"left": str_left, "right": str_left[:-1]}, + "datetime": dates_left, + "date_string": date_str_left, + "int": int_left, + "strings": str_left, + "ea_int": ea_int_left, } + + if index_structure == "non_monotonic": + data = {k: mi[::-1] for k, mi in data.items()} + + data = {k: {"left": idx, "right": idx[:-1]} for k, idx in data.items()} + self.left = data[dtype]["left"] self.right = data[dtype]["right"] - def time_operation(self, dtype, method): + def time_operation(self, index_structure, dtype, method): getattr(self.left, method)(self.right) @@ -55,6 +61,15 @@ def time_datetime_difference_disjoint(self): self.datetime_left.difference(self.datetime_right) +class UnionWithDuplicates: + def setup(self): + self.left = Index(np.repeat(np.arange(1000), 100)) + self.right = Index(np.tile(np.arange(500, 1500), 50)) + + def time_union_with_duplicates(self): + self.left.union(self.right) + + class Range: def setup(self): self.idx_inc = RangeIndex(start=0, stop=10**6, 
step=3) @@ -107,7 +122,6 @@ def time_non_object_equals_multiindex(self): class IndexAppend: def setup(self): - N = 10_000 self.range_idx = RangeIndex(0, 100) self.int_idx = self.range_idx.astype(int) @@ -122,10 +136,14 @@ def setup(self): self.int_idxs.append(i_idx) o_idx = i_idx.astype(str) self.object_idxs.append(o_idx) + self.same_range_idx = [self.range_idx] * N def time_append_range_list(self): self.range_idx.append(self.range_idxs) + def time_append_range_list_same(self): + self.range_idx.append(self.same_range_idx) + def time_append_int_list(self): self.int_idx.append(self.int_idxs) @@ -134,21 +152,23 @@ def time_append_obj_list(self): class Indexing: - params = ["String", "Float", "Int"] param_names = ["dtype"] def setup(self, dtype): N = 10**6 - self.idx = getattr(tm, f"make{dtype}Index")(N) + if dtype == "String": + self.idx = Index([f"i-{i}" for i in range(N)], dtype=object) + elif dtype == "Float": + self.idx = Index(np.arange(N), dtype=np.float64) + elif dtype == "Int": + self.idx = Index(np.arange(N), dtype=np.int64) self.array_mask = (np.arange(N) % 3) == 0 self.series_mask = Series(self.array_mask) self.sorted = self.idx.sort_values() half = N // 2 self.non_unique = self.idx[:half].append(self.idx[:half]) - self.non_unique_sorted = ( - self.sorted[:half].append(self.sorted[:half]).sort_values() - ) + self.non_unique_sorted = self.sorted[:half].repeat(2) self.key = self.sorted[N // 4] def time_boolean_array(self, dtype): @@ -183,8 +203,8 @@ class Float64IndexMethod: # GH 13166 def setup(self): N = 100_000 - a = np.arange(N) - self.ind = Float64Index(a * 4.8000000418824129e-08) + a = np.arange(N, dtype=np.float64) + self.ind = Index(a * 4.8000000418824129e-08) def time_get_loc(self): self.ind.get_loc(0) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 69e3d166943a8..b2495356f134c 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -3,44 +3,40 @@ lower-level methods directly on Index and subclasses, see index_object.py, indexing_engine.py, and index_cached.py """ -import itertools -import string + +from datetime import datetime import warnings import numpy as np from pandas import ( + NA, CategoricalIndex, DataFrame, - Float64Index, - Int64Index, + Index, IntervalIndex, MultiIndex, Series, - UInt64Index, concat, date_range, option_context, period_range, ) -from .pandas_vb_common import tm - class NumericSeriesIndexing: - params = [ - (Int64Index, UInt64Index, Float64Index), + (np.int64, np.uint64, np.float64), ("unique_monotonic_inc", "nonunique_monotonic_inc"), ] - param_names = ["index_dtype", "index_structure"] + param_names = ["dtype", "index_structure"] - def setup(self, index, index_structure): + def setup(self, dtype, index_structure): N = 10**6 indices = { - "unique_monotonic_inc": index(range(N)), - "nonunique_monotonic_inc": index( - list(range(55)) + [54] + list(range(55, N - 1)) + "unique_monotonic_inc": Index(range(N), dtype=dtype), + "nonunique_monotonic_inc": Index( + list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype ), } self.data = Series(np.random.rand(N), index=indices[index_structure]) @@ -87,8 +83,35 @@ def time_loc_slice(self, index, index_structure): self.data.loc[:800000] -class NonNumericSeriesIndexing: +class NumericMaskedIndexing: + monotonic_list = list(range(10**6)) + non_monotonic_list = list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1)) + + params = [ + ("Int64", "UInt64", "Float64"), + (True, False), + ] + param_names = ["dtype", "monotonic"] + + 
def setup(self, dtype, monotonic):
+        indices = {
+            True: Index(self.monotonic_list, dtype=dtype),
+            False: Index(self.non_monotonic_list, dtype=dtype).append(
+                Index([NA], dtype=dtype)
+            ),
+        }
+        self.data = indices[monotonic]
+        self.indexer = np.arange(300, 1_000)
+        self.data_dups = self.data.append(self.data)
+
+    def time_get_indexer(self, dtype, monotonic):
+        self.data.get_indexer(self.indexer)
+
+    def time_get_indexer_dups(self, dtype, monotonic):
+        self.data_dups.get_indexer_for(self.indexer)
+
+
+class NonNumericSeriesIndexing:
     params = [
         ("string", "datetime", "period"),
         ("unique_monotonic_inc", "nonunique_monotonic_inc", "non_monotonic"),
@@ -98,7 +121,7 @@ class NonNumericSeriesIndexing:
     def setup(self, index, index_structure):
         N = 10**6
         if index == "string":
-            index = tm.makeStringIndex(N)
+            index = Index([f"i-{i}" for i in range(N)], dtype=object)
         elif index == "datetime":
             index = date_range("1900", periods=N, freq="s")
         elif index == "period":
@@ -130,8 +153,8 @@ def time_getitem_list_like(self, index, index_structure):
 
 class DataFrameStringIndexing:
     def setup(self):
-        index = tm.makeStringIndex(1000)
-        columns = tm.makeStringIndex(30)
+        index = Index([f"i-{i}" for i in range(1000)], dtype=object)
+        columns = Index([f"i-{i}" for i in range(30)], dtype=object)
         with warnings.catch_warnings(record=True):
             self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns)
         self.idx_scalar = index[100]
@@ -143,6 +166,12 @@ def setup(self):
     def time_loc(self):
         self.df.loc[self.idx_scalar, self.col_scalar]
 
+    def time_at(self):
+        self.df.at[self.idx_scalar, self.col_scalar]
+
+    def time_at_setitem(self):
+        self.df.at[self.idx_scalar, self.col_scalar] = 0.0
+
     def time_getitem_scalar(self):
         self.df[self.col_scalar][self.idx_scalar]
 
@@ -157,19 +186,18 @@ def time_boolean_rows_boolean(self):
 
 class DataFrameNumericIndexing:
-
     params = [
-        (Int64Index, UInt64Index, Float64Index),
+        (np.int64, np.uint64, np.float64),
         ("unique_monotonic_inc", "nonunique_monotonic_inc"),
     ]
-    param_names = ["index_dtype", "index_structure"]
+    param_names = ["dtype", "index_structure"]
 
-    def setup(self, index, index_structure):
+    def setup(self, dtype, index_structure):
         N = 10**5
         indices = {
-            "unique_monotonic_inc": index(range(N)),
-            "nonunique_monotonic_inc": index(
-                list(range(55)) + [54] + list(range(55, N - 1))
+            "unique_monotonic_inc": Index(range(N), dtype=dtype),
+            "nonunique_monotonic_inc": Index(
+                list(range(55)) + [54] + list(range(55, N - 1)), dtype=dtype
             ),
         }
         self.idx_dupe = np.array(range(30)) * 99
@@ -194,15 +222,14 @@ def time_bool_indexer(self, index, index_structure):
 
 class Take:
-
     params = ["int", "datetime"]
     param_names = ["index"]
 
     def setup(self, index):
         N = 100000
         indexes = {
-            "int": Int64Index(np.arange(N)),
-            "datetime": date_range("2011-01-01", freq="S", periods=N),
+            "int": Index(np.arange(N), dtype=np.int64),
+            "datetime": date_range("2011-01-01", freq="s", periods=N),
         }
         index = indexes[index]
         self.s = Series(np.random.rand(N), index=index)
@@ -213,7 +240,6 @@ def time_take(self, index):
 
 class MultiIndexing:
-
     params = [True, False]
     param_names = ["unique_levels"]
 
@@ -277,6 +303,10 @@ def time_loc_null_slice_plus_slice(self, unique_levels):
         target = (self.tgt_null_slice, self.tgt_slice)
         self.df.loc[target, :]
 
+    def time_loc_multiindex(self, unique_levels):
+        target = self.df.index[::10]
+        self.df.loc[target]
+
     def time_xs_level_0(self, unique_levels):
         target = self.tgt_scalar
         self.df.xs(target, level=0)
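For context on the new time_at/time_at_setitem benchmarks above: .at is the scalar-only fast path, while .loc runs the same labels through the general indexing machinery. An editorial illustration mirroring the benchmark's setup (not taken from the patch):

    import numpy as np
    import pandas as pd

    index = pd.Index([f"i-{i}" for i in range(1000)], dtype=object)
    columns = pd.Index([f"i-{i}" for i in range(30)], dtype=object)
    df = pd.DataFrame(np.random.randn(1000, 30), index=index, columns=columns)

    df.at["i-100", "i-3"] = 0.0           # scalar write (time_at_setitem)
    assert df.at["i-100", "i-3"] == 0.0   # scalar read (time_at)
    assert df.loc["i-100", "i-3"] == 0.0  # .loc accepts the same labels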
- params = ["monotonic_incr", "monotonic_decr", "non_monotonic"] param_names = ["index"] @@ -355,15 +384,13 @@ def setup(self, index): "non_monotonic": CategoricalIndex(list("abc" * N)), } self.data = indices[index] - self.data_unique = CategoricalIndex( - ["".join(perm) for perm in itertools.permutations(string.printable, 3)] - ) + self.data_unique = CategoricalIndex([str(i) for i in range(N * 3)]) self.int_scalar = 10000 self.int_list = list(range(10000)) self.cat_scalar = "b" - self.cat_list = ["a", "c"] + self.cat_list = ["1", "3"] def time_getitem_scalar(self, index): self.data[self.int_scalar] @@ -439,7 +466,7 @@ def time_loc_row(self, unique_cols): class AssignTimeseriesIndex: def setup(self): N = 100000 - idx = date_range("1/1/2000", periods=N, freq="H") + idx = date_range("1/1/2000", periods=N, freq="h") self.df = DataFrame(np.random.randn(N, 1), columns=["A"], index=idx) def time_frame_assign_timeseries_index(self): @@ -476,8 +503,32 @@ def time_assign_list_of_columns_concat(self): concat([self.df, df], axis=1) -class ChainIndexing: +class Setitem: + def setup(self): + N = 500_000 + cols = 500 + self.df = DataFrame(np.random.rand(N, cols)) + + def time_setitem(self): + self.df[100] = 100 + + def time_setitem_list(self): + self.df[[100, 200, 300]] = 100 + + +class SetitemObjectDtype: + # GH#19299 + def setup(self): + N = 1000 + cols = 500 + self.df = DataFrame(index=range(N), columns=range(cols), dtype=object) + + def time_setitem_object_dtype(self): + self.df.loc[0, 1] = 1.0 + + +class ChainIndexing: params = [None, "warn"] param_names = ["mode"] @@ -494,4 +545,18 @@ def time_chained_indexing(self, mode): df2["C"] = 1.0 +class Block: + def setup(self): + self.df = DataFrame( + False, + columns=np.arange(500).astype(str), + index=date_range("2010-01-01", "2011-01-01"), + ) + + def time_test(self): + start = datetime(2010, 5, 1) + end = datetime(2010, 9, 1) + self.df.loc[start:end, :] = True + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py index 0c6cb89f49da1..5e3c593e269cb 100644 --- a/asv_bench/benchmarks/indexing_engines.py +++ b/asv_bench/benchmarks/indexing_engines.py @@ -1,5 +1,8 @@ """ -Benchmarks in this file depend exclusively on code in _libs/ +Benchmarks in this file depend mostly on code in _libs/ + +We have to created masked arrays to test the masked engine though. The +array is unpacked on the Cython level. If a PR does not edit anything in _libs, it is very unlikely that benchmarks in this file will be affected. 
@@ -9,6 +12,8 @@
 
 from pandas._libs import index as libindex
 
+from pandas.core.arrays import BaseMaskedArray
+
 
 def _get_numeric_engines():
     engine_names = [
@@ -30,8 +35,27 @@ def _get_numeric_engines():
     ]
 
 
-class NumericEngineIndexing:
+def _get_masked_engines():
+    engine_names = [
+        ("MaskedInt64Engine", "Int64"),
+        ("MaskedInt32Engine", "Int32"),
+        ("MaskedInt16Engine", "Int16"),
+        ("MaskedInt8Engine", "Int8"),
+        ("MaskedUInt64Engine", "UInt64"),
+        ("MaskedUInt32Engine", "UInt32"),
+        ("MaskedUInt16Engine", "UInt16"),
+        ("MaskedUInt8Engine", "UInt8"),
+        ("MaskedFloat64Engine", "Float64"),
+        ("MaskedFloat32Engine", "Float32"),
+    ]
+    return [
+        (getattr(libindex, engine_name), dtype)
+        for engine_name, dtype in engine_names
+        if hasattr(libindex, engine_name)
+    ]
+
 
+class NumericEngineIndexing:
     params = [
         _get_numeric_engines(),
         ["monotonic_incr", "monotonic_decr", "non_monotonic"],
@@ -47,14 +71,12 @@ def setup(self, engine_and_dtype, index_type, unique, N):
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype)
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
         elif index_type == "monotonic_decr":
             if unique:
                 arr = np.arange(N * 3, dtype=dtype)[::-1]
             else:
-                values = list([1] * N + [2] * N + [3] * N)
-                arr = np.array(values, dtype=dtype)[::-1]
+                arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
         else:
             assert index_type == "non_monotonic"
             if unique:
@@ -62,10 +84,10 @@ def setup(self, engine_and_dtype, index_type, unique, N):
                 arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
                 arr[N:] = np.arange(N * 2, dtype=dtype)
             else:
-                arr = np.array([1, 2, 3] * N, dtype=dtype)
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
 
         self.data = engine(arr)
-        # code belows avoids populating the mapping etc. while timing.
+        # code below avoids populating the mapping etc. while timing.
         self.data.get_loc(2)
 
         self.key_middle = arr[len(arr) // 2]
@@ -80,8 +102,60 @@ def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
         self.data.get_loc(self.key_middle)
 
 
-class ObjectEngineIndexing:
+class MaskedNumericEngineIndexing:
+    params = [
+        _get_masked_engines(),
+        ["monotonic_incr", "monotonic_decr", "non_monotonic"],
+        [True, False],
+        [10**5, 2 * 10**6],  # 2e6 is above SIZE_CUTOFF
+    ]
+    param_names = ["engine_and_dtype", "index_type", "unique", "N"]
+
+    def setup(self, engine_and_dtype, index_type, unique, N):
+        engine, dtype = engine_and_dtype
+        dtype = dtype.lower()
+
+        if index_type == "monotonic_incr":
+            if unique:
+                arr = np.arange(N * 3, dtype=dtype)
+            else:
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
+            mask = np.zeros(N * 3, dtype=np.bool_)
+        elif index_type == "monotonic_decr":
+            if unique:
+                arr = np.arange(N * 3, dtype=dtype)[::-1]
+            else:
+                arr = np.array([3, 2, 1], dtype=dtype).repeat(N)
+            mask = np.zeros(N * 3, dtype=np.bool_)
+        else:
+            assert index_type == "non_monotonic"
+            if unique:
+                arr = np.zeros(N * 3, dtype=dtype)
+                arr[:N] = np.arange(N * 2, N * 3, dtype=dtype)
+                arr[N:] = np.arange(N * 2, dtype=dtype)
+
+            else:
+                arr = np.array([1, 2, 3], dtype=dtype).repeat(N)
+            mask = np.zeros(N * 3, dtype=np.bool_)
+            mask[-1] = True
+
+        self.data = engine(BaseMaskedArray(arr, mask))
+        # code below avoids populating the mapping etc. while timing.
+ self.data.get_loc(2) + + self.key_middle = arr[len(arr) // 2] + self.key_early = arr[2] + + def time_get_loc(self, engine_and_dtype, index_type, unique, N): + self.data.get_loc(self.key_early) + + def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N): + # searchsorted performance may be different near the middle of a range + # vs near an endpoint + self.data.get_loc(self.key_middle) + + +class ObjectEngineIndexing: params = [("monotonic_incr", "monotonic_decr", "non_monotonic")] param_names = ["index_type"] @@ -95,7 +169,7 @@ def setup(self, index_type): }[index_type] self.data = libindex.ObjectEngine(arr) - # code belows avoids populating the mapping etc. while timing. + # code below avoids populating the mapping etc. while timing. self.data.get_loc("b") def time_get_loc(self, index_type): diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 0bbb599f2b045..ce3935d2cd0ac 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -9,6 +9,7 @@ import numpy as np from pandas import ( + Index, NaT, Series, date_range, @@ -17,35 +18,27 @@ to_timedelta, ) -from .pandas_vb_common import ( - lib, - tm, -) +from .pandas_vb_common import lib class ToNumeric: - - params = ["ignore", "coerce"] - param_names = ["errors"] - - def setup(self, errors): + def setup(self): N = 10000 self.float = Series(np.random.randn(N)) self.numstr = self.float.astype("str") - self.str = Series(tm.makeStringIndex(N)) + self.str = Series(Index([f"i-{i}" for i in range(N)], dtype=object)) - def time_from_float(self, errors): - to_numeric(self.float, errors=errors) + def time_from_float(self): + to_numeric(self.float, errors="coerce") - def time_from_numeric_str(self, errors): - to_numeric(self.numstr, errors=errors) + def time_from_numeric_str(self): + to_numeric(self.numstr, errors="coerce") - def time_from_str(self, errors): - to_numeric(self.str, errors=errors) + def time_from_str(self): + to_numeric(self.str, errors="coerce") class ToNumericDowncast: - param_names = ["dtype", "downcast"] params = [ [ @@ -153,7 +146,6 @@ def time_format_YYYYMMDD(self): class ToDatetimeCacheSmallCount: - params = ([True, False], [50, 500, 5000, 100000]) param_names = ["cache", "count"] @@ -167,7 +159,7 @@ def time_unique_date_strings(self, cache, count): class ToDatetimeISO8601: def setup(self): - rng = date_range(start="1/1/2000", periods=20000, freq="H") + rng = date_range(start="1/1/2000", periods=20000, freq="h") self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() self.strings_nosep = rng.strftime("%Y%m%d %H:%M:%S").tolist() self.strings_tz_space = [ @@ -192,7 +184,7 @@ def time_iso8601_tz_spaceformat(self): def time_iso8601_infer_zero_tz_fromat(self): # GH 41047 - to_datetime(self.strings_zero_tz, infer_datetime_format=True) + to_datetime(self.strings_zero_tz) class ToDatetimeNONISO8601: @@ -208,7 +200,7 @@ def time_same_offset(self): to_datetime(self.same_offset) def time_different_offset(self): - to_datetime(self.diff_offset) + to_datetime(self.diff_offset, utc=True) class ToDatetimeFormatQuarters: @@ -223,7 +215,7 @@ class ToDatetimeFormat: def setup(self): N = 100000 self.s = Series(["19MAY11", "19MAY11:00:00:00"] * N) - self.s2 = self.s.str.replace(":\\S+$", "") + self.s2 = self.s.str.replace(":\\S+$", "", regex=True) self.same_offset = ["10/11/2018 00:00:00.045-07:00"] * N self.diff_offset = [ @@ -239,9 +231,6 @@ def time_no_exact(self): def time_same_offset(self): to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z") - def 
time_different_offset(self): - to_datetime(self.diff_offset, format="%m/%d/%Y %H:%M:%S.%f%z") - def time_same_offset_to_utc(self): to_datetime(self.same_offset, format="%m/%d/%Y %H:%M:%S.%f%z", utc=True) @@ -250,7 +239,6 @@ def time_different_offset_to_utc(self): class ToDatetimeCache: - params = [True, False] param_names = ["cache"] @@ -277,16 +265,6 @@ def time_dup_string_tzoffset_dates(self, cache): to_datetime(self.dup_string_with_tz, cache=cache) -# GH 43901 -class ToDatetimeInferDatetimeFormat: - def setup(self): - rng = date_range(start="1/1/2000", periods=100000, freq="H") - self.strings = rng.strftime("%Y-%m-%d %H:%M:%S").tolist() - - def time_infer_datetime_format(self): - to_datetime(self.strings, infer_datetime_format=True) - - class ToTimedelta: def setup(self): self.ints = np.random.randint(0, 60, size=10000) @@ -307,17 +285,13 @@ def time_convert_string_seconds(self): class ToTimedeltaErrors: - - params = ["coerce", "ignore"] - param_names = ["errors"] - - def setup(self, errors): + def setup(self): ints = np.random.randint(0, 60, size=10000) self.arr = [f"{i} days" for i in ints] self.arr[-1] = "apple" - def time_convert(self, errors): - to_timedelta(self.arr, errors=errors) + def time_convert(self): + to_timedelta(self.arr, errors="coerce") from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 10aef954a3475..3a15f754ae523 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -10,20 +10,18 @@ from pandas import ( Categorical, DataFrame, + Index, concat, date_range, + period_range, read_csv, to_datetime, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class ToCSV(BaseIO): - fname = "__test__.csv" params = ["wide", "long", "mixed"] param_names = ["kind"] @@ -56,7 +54,6 @@ def time_frame(self, kind): class ToCSVMultiIndexUnusedLevels(BaseIO): - fname = "__test__.csv" def setup(self): @@ -76,7 +73,6 @@ def time_single_index_frame(self): class ToCSVDatetime(BaseIO): - fname = "__test__.csv" def setup(self): @@ -88,11 +84,10 @@ def time_frame_date_formatting(self): class ToCSVDatetimeIndex(BaseIO): - fname = "__test__.csv" def setup(self): - rng = date_range("2000", periods=100_000, freq="S") + rng = date_range("2000", periods=100_000, freq="s") self.data = DataFrame({"a": 1}, index=rng) def time_frame_date_formatting_index(self): @@ -102,30 +97,80 @@ def time_frame_date_no_format_index(self): self.data.to_csv(self.fname) -class ToCSVDatetimeBig(BaseIO): +class ToCSVPeriod(BaseIO): + fname = "__test__.csv" + + params = ([1000, 10000], ["D", "h"]) + param_names = ["nobs", "freq"] + + def setup(self, nobs, freq): + rng = period_range(start="2000-01-01", periods=nobs, freq=freq) + self.data = DataFrame(rng) + if freq == "D": + self.default_fmt = "%Y-%m-%d" + elif freq == "h": + self.default_fmt = "%Y-%m-%d %H:00" + + def time_frame_period_formatting_default(self, nobs, freq): + self.data.to_csv(self.fname) + + def time_frame_period_formatting_default_explicit(self, nobs, freq): + self.data.to_csv(self.fname, date_format=self.default_fmt) + + def time_frame_period_formatting(self, nobs, freq): + # Nb: `date_format` is not actually taken into account here today, so the + # performance is currently identical to `time_frame_period_formatting_default` + # above. This timer is therefore expected to degrade when GH#51621 is fixed. + # (Remove this comment when GH#51621 is fixed.) 
+ self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") + + +class ToCSVPeriodIndex(BaseIO): + fname = "__test__.csv" + + params = ([1000, 10000], ["D", "h"]) + param_names = ["nobs", "freq"] + + def setup(self, nobs, freq): + rng = period_range(start="2000-01-01", periods=nobs, freq=freq) + self.data = DataFrame({"a": 1}, index=rng) + if freq == "D": + self.default_fmt = "%Y-%m-%d" + elif freq == "h": + self.default_fmt = "%Y-%m-%d %H:00" + + def time_frame_period_formatting_index(self, nobs, freq): + self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") + + def time_frame_period_formatting_index_default(self, nobs, freq): + self.data.to_csv(self.fname) + + def time_frame_period_formatting_index_default_explicit(self, nobs, freq): + self.data.to_csv(self.fname, date_format=self.default_fmt) + +class ToCSVDatetimeBig(BaseIO): fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] - param_names = ["obs"] + param_names = ["nobs"] - def setup(self, obs): + def setup(self, nobs): d = "2018-11-29" dt = "2018-11-26 11:18:27.0" self.data = DataFrame( { - "dt": [np.datetime64(dt)] * obs, - "d": [np.datetime64(d)] * obs, - "r": [np.random.uniform()] * obs, + "dt": [np.datetime64(dt)] * nobs, + "d": [np.datetime64(d)] * nobs, + "r": [np.random.uniform()] * nobs, } ) - def time_frame(self, obs): + def time_frame(self, nobs): self.data.to_csv(self.fname) class ToCSVIndexes(BaseIO): - fname = "__test__.csv" @staticmethod @@ -179,13 +224,13 @@ def data(self, stringio_object): class ReadCSVDInferDatetimeFormat(StringIORewind): + params = [None, "custom", "iso8601", "ymd"] + param_names = ["format"] - params = ([True, False], ["custom", "iso8601", "ymd"]) - param_names = ["infer_datetime_format", "format"] - - def setup(self, infer_datetime_format, format): + def setup(self, format): rng = date_range("1/1/2000", periods=1000) formats = { + None: None, "custom": "%m/%d/%Y %H:%M:%S.%f", "iso8601": "%Y-%m-%d %H:%M:%S", "ymd": "%Y%m%d", @@ -193,22 +238,20 @@ def setup(self, infer_datetime_format, format): dt_format = formats[format] self.StringIO_input = StringIO("\n".join(rng.strftime(dt_format).tolist())) - def time_read_csv(self, infer_datetime_format, format): + def time_read_csv(self, format): read_csv( self.data(self.StringIO_input), header=None, names=["foo"], parse_dates=["foo"], - infer_datetime_format=infer_datetime_format, ) class ReadCSVConcatDatetime(StringIORewind): - iso8601 = "%Y-%m-%d %H:%M:%S" def setup(self): - rng = date_range("1/1/2000", periods=50000, freq="S") + rng = date_range("1/1/2000", periods=50000, freq="s") self.StringIO_input = StringIO("\n".join(rng.strftime(self.iso8601).tolist())) def time_read_csv(self): @@ -217,12 +260,10 @@ def time_read_csv(self): header=None, names=["foo"], parse_dates=["foo"], - infer_datetime_format=False, ) class ReadCSVConcatDatetimeBadDateValue(StringIORewind): - params = (["nan", "0", ""],) param_names = ["bad_date_value"] @@ -235,19 +276,17 @@ def time_read_csv(self, bad_date_value): header=None, names=["foo", "bar"], parse_dates=["foo"], - infer_datetime_format=False, ) class ReadCSVSkipRows(BaseIO): - fname = "__test__.csv" params = ([None, 10000], ["c", "python", "pyarrow"]) param_names = ["skiprows", "engine"] def setup(self, skiprows, engine): N = 20000 - index = tm.makeStringIndex(N) + index = Index([f"i-{i}" for i in range(N)], dtype=object) df = DataFrame( { "float1": np.random.randn(N), @@ -286,7 +325,6 @@ def time_read_uint64_na_values(self): class ReadCSVThousands(BaseIO): - fname = "__test__.csv" 
params = ([",", "|"], [None, ","], ["c", "python"]) param_names = ["sep", "thousands", "engine"] @@ -299,7 +337,7 @@ def setup(self, sep, thousands, engine): if thousands is not None: fmt = f":{thousands}" fmt = "{" + fmt + "}" - df = df.applymap(lambda x: fmt.format(x)) + df = df.map(lambda x: fmt.format(x)) df.to_csv(self.fname, sep=sep) def time_thousands(self, sep, thousands, engine): @@ -321,7 +359,6 @@ def time_comment(self, engine): class ReadCSVFloatPrecision(StringIORewind): - params = ([",", ";"], [".", "_"], [None, "high", "round_trip"]) param_names = ["sep", "decimal", "float_precision"] @@ -330,7 +367,7 @@ def setup(self, sep, decimal, float_precision): "".join([random.choice(string.digits) for _ in range(28)]) for _ in range(15) ] - rows = sep.join([f"0{decimal}" + "{}"] * 3) + "\n" + rows = sep.join([f"0{decimal}{{}}"] * 3) + "\n" data = rows * 5 data = data.format(*floats) * 200 # 1000 x 3 strings csv self.StringIO_input = StringIO(data) @@ -371,9 +408,11 @@ def time_read_stringcsv(self, engine): def time_read_bytescsv(self, engine): read_csv(self.data(self.BytesIO_input), engine=engine) + def peakmem_read_csv(self, engine): + read_csv(self.data(self.BytesIO_input), engine=engine) -class ReadCSVCategorical(BaseIO): +class ReadCSVCategorical(BaseIO): fname = "__test__.csv" params = ["c", "python"] param_names = ["engine"] @@ -406,16 +445,6 @@ def setup(self, engine): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self, engine): - read_csv( - self.data(self.StringIO_input), - engine=engine, - sep=",", - header=None, - names=list(string.digits[:9]), - parse_dates=[[1, 2], [1, 3]], - ) - def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), @@ -450,7 +479,6 @@ def time_read_csv_cached(self, do_cache, engine): class ReadCSVMemoryGrowth(BaseIO): - chunksize = 20 num_rows = 1000 fname = "__test__.csv" @@ -458,7 +486,7 @@ class ReadCSVMemoryGrowth(BaseIO): param_names = ["engine"] def setup(self, engine): - with open(self.fname, "w") as f: + with open(self.fname, "w", encoding="utf-8") as f: for i in range(self.num_rows): f.write(f"{i}\n") @@ -496,7 +524,6 @@ def time_read_special_date(self, value, engine): class ReadCSVMemMapUTF8: - fname = "__test__.csv" number = 5 @@ -567,7 +594,33 @@ def setup(self): self.StringIO_input = StringIO(data) def time_read_csv_index_col(self): - read_csv(self.StringIO_input, index_col="a") + read_csv(self.data(self.StringIO_input), index_col="a") + + +class ReadCSVDatePyarrowEngine(StringIORewind): + def setup(self): + count_elem = 100_000 + data = "a\n" + "2019-12-31\n" * count_elem + self.StringIO_input = StringIO(data) + + def time_read_csv_index_col(self): + read_csv( + self.data(self.StringIO_input), + parse_dates=["a"], + engine="pyarrow", + dtype_backend="pyarrow", + ) + + +class ReadCSVCParserLowMemory: + # GH 16798 + def setup(self): + self.csv = StringIO( + "strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]) + ) + + def peakmem_over_2gb_input(self): + read_csv(self.csv, engine="c", low_memory=False) from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index a88c4374b7030..902a61be901bd 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -12,12 +12,11 @@ from pandas import ( DataFrame, ExcelWriter, + Index, date_range, read_excel, ) -from ..pandas_vb_common import tm - def _generate_dataframe(): N = 2000 @@ -25,15 +24,14 @@ def 
_generate_dataframe(): df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) - df["object"] = tm.makeStringIndex(N) + df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) return df class WriteExcel: - - params = ["openpyxl", "xlsxwriter", "xlwt"] + params = ["openpyxl", "xlsxwriter"] param_names = ["engine"] def setup(self, engine): @@ -42,9 +40,8 @@ def setup(self, engine): def time_write_excel(self, engine): bio = BytesIO() bio.seek(0) - writer = ExcelWriter(bio, engine=engine) - self.df.to_excel(writer, sheet_name="Sheet1") - writer.save() + with ExcelWriter(bio, engine=engine) as writer: + self.df.to_excel(writer, sheet_name="Sheet1") class WriteExcelStyled: @@ -57,21 +54,18 @@ def setup(self, engine): def time_write_excel_style(self, engine): bio = BytesIO() bio.seek(0) - writer = ExcelWriter(bio, engine=engine) - df_style = self.df.style - df_style.applymap(lambda x: "border: red 1px solid;") - df_style.applymap(lambda x: "color: blue") - df_style.applymap(lambda x: "border-color: green black", subset=["float1"]) - df_style.to_excel(writer, sheet_name="Sheet1") - writer.save() + with ExcelWriter(bio, engine=engine) as writer: + df_style = self.df.style + df_style.map(lambda x: "border: red 1px solid;") + df_style.map(lambda x: "color: blue") + df_style.map(lambda x: "border-color: green black", subset=["float1"]) + df_style.to_excel(writer, sheet_name="Sheet1") class ReadExcel: - - params = ["xlrd", "openpyxl", "odf"] + params = ["openpyxl", "odf"] param_names = ["engine"] fname_excel = "spreadsheet.xlsx" - fname_excel_xls = "spreadsheet.xls" fname_odf = "spreadsheet.ods" def _create_odf(self): @@ -92,13 +86,10 @@ def setup_cache(self): self.df = _generate_dataframe() self.df.to_excel(self.fname_excel, sheet_name="Sheet1") - self.df.to_excel(self.fname_excel_xls, sheet_name="Sheet1") self._create_odf() def time_read_excel(self, engine): - if engine == "xlrd": - fname = self.fname_excel_xls - elif engine == "odf": + if engine == "odf": fname = self.fname_odf else: fname = self.fname_excel @@ -107,9 +98,7 @@ def time_read_excel(self, engine): class ReadExcelNRows(ReadExcel): def time_read_excel(self, engine): - if engine == "xlrd": - fname = self.fname_excel_xls - elif engine == "odf": + if engine == "odf": fname = self.fname_odf else: fname = self.fname_excel diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 4a2c1c872e6eb..2eb4c8c7f674b 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -3,20 +3,18 @@ from pandas import ( DataFrame, HDFStore, + Index, date_range, read_hdf, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class HDFStoreDataFrame(BaseIO): def setup(self): N = 25000 - index = tm.makeStringIndex(N) + index = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame( {"float1": np.random.randn(N), "float2": np.random.randn(N)}, index=index ) @@ -43,7 +41,7 @@ def setup(self): np.random.randn(N, 100), index=date_range("1/1/2000", periods=N) ) self.df_dc = DataFrame( - np.random.randn(N, 10), columns=["C%03d" % i for i in range(10)] + np.random.randn(N, 10), columns=[f"C{i:03d}" for i in range(10)] ) self.fname = "__test__.h5" @@ -112,7 +110,6 @@ def time_store_info(self): class HDF(BaseIO): - params = ["table", "fixed"] param_names = ["format"] @@ -123,16 +120,24 @@ def setup(self, format): self.df = 
DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(N) - self.df.to_hdf(self.fname, "df", format=format) + self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) + self.df.to_hdf(self.fname, key="df", format=format) + + # Numeric df + self.df1 = self.df.copy() + self.df1 = self.df1.reset_index() + self.df1.to_hdf(self.fname, key="df1", format=format) def time_read_hdf(self, format): read_hdf(self.fname, "df") + def peakmem_read_hdf(self, format): + read_hdf(self.fname, "df") + def time_write_hdf(self, format): - self.df.to_hdf(self.fname, "df", format=format) + self.df.to_hdf(self.fname, key="df", format=format) from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index bb09fe0ff634d..bcbfcdea42dd9 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -4,6 +4,7 @@ from pandas import ( DataFrame, + Index, concat, date_range, json_normalize, @@ -11,14 +12,10 @@ timedelta_range, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class ReadJSON(BaseIO): - fname = "__test__.json" params = (["split", "index", "records"], ["int", "datetime"]) param_names = ["orient", "index"] @@ -27,7 +24,7 @@ def setup(self, orient, index): N = 100000 indexes = { "int": np.arange(N), - "datetime": date_range("20000101", periods=N, freq="H"), + "datetime": date_range("20000101", periods=N, freq="h"), } df = DataFrame( np.random.randn(N, 5), @@ -41,7 +38,6 @@ def time_read_json(self, orient, index): class ReadJSONLines(BaseIO): - fname = "__test_lines__.json" params = ["int", "datetime"] param_names = ["index"] @@ -50,7 +46,7 @@ def setup(self, index): N = 100000 indexes = { "int": np.arange(N), - "datetime": date_range("20000101", periods=N, freq="H"), + "datetime": date_range("20000101", periods=N, freq="h"), } df = DataFrame( np.random.randn(N, 5), @@ -100,7 +96,6 @@ def time_normalize_json(self, orient, frame): class ToJSON(BaseIO): - fname = "__test__.json" params = [ ["split", "columns", "index", "values", "records"], @@ -111,13 +106,13 @@ class ToJSON(BaseIO): def setup(self, orient, frame): N = 10**5 ncols = 5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) - strings = tm.makeStringIndex(N) + strings = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) self.df_td_int_ts = DataFrame( @@ -194,7 +189,7 @@ class ToJSONISO(BaseIO): def setup(self, orient): N = 10**5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") self.df = DataFrame( @@ -212,19 +207,18 @@ def time_iso_format(self, orient): class ToJSONLines(BaseIO): - fname = "__test__.json" def setup(self): N = 10**5 ncols = 5 - index = date_range("20000101", periods=N, freq="H") + index = date_range("20000101", 
periods=N, freq="h") timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) - strings = tm.makeStringIndex(N) + strings = Index([f"i-{i}" for i in range(N)], dtype=object) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index) self.df_td_int_ts = DataFrame( @@ -294,7 +288,8 @@ def time_float_longint_str_lines(self): class ToJSONMem: def setup_cache(self): df = DataFrame([[1]]) - frames = {"int": df, "float": df.astype(float)} + df2 = DataFrame(range(8), date_range("1/1/2000", periods=8, freq="min")) + frames = {"int": df, "float": df.astype(float), "datetime": df2} return frames @@ -308,5 +303,10 @@ def peakmem_float(self, frames): for _ in range(100_000): df.to_json() + def peakmem_time(self, frames): + df = frames["datetime"] + for _ in range(10_000): + df.to_json(orient="table") + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 5390056ba36f2..d3fd5075a4707 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -1,17 +1,11 @@ -import numpy as np - try: - from pandas._libs.tslibs.parsing import ( - _does_string_look_like_datetime, - concat_date_cols, - ) + from pandas._libs.tslibs.parsing import _does_string_look_like_datetime except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) pass class DoesStringLookLikeDatetime: - params = (["2Q2005", "0.0", "10000"],) param_names = ["value"] @@ -21,22 +15,3 @@ def setup(self, value): def time_check_datetimes(self, value): for obj in self.objects: _does_string_look_like_datetime(obj) - - -class ConcatDateCols: - - params = ([1234567890, "AAAA"], [1, 2]) - param_names = ["value", "dim"] - - def setup(self, value, dim): - count_elem = 10000 - if dim == 1: - self.object = (np.array([value] * count_elem),) - if dim == 2: - self.object = ( - np.array([value] * count_elem), - np.array([value] * count_elem), - ) - - def time_check_concat(self, value, dim): - concat_date_cols(self.object) diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index c71cdcdcc5c59..4787b57b54756 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -2,14 +2,12 @@ from pandas import ( DataFrame, + Index, date_range, read_pickle, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class Pickle(BaseIO): @@ -20,9 +18,9 @@ def setup(self): self.df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(N) + self.df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object) self.df.to_pickle(self.fname) def time_read_pickle(self): diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py index 369b79641dbc4..411e5b6099f76 100644 --- a/asv_bench/benchmarks/io/sas.py +++ b/asv_bench/benchmarks/io/sas.py @@ -1,30 +1,23 @@ -import os +from pathlib import Path from pandas import read_sas +ROOT = Path(__file__).parents[3] / "pandas" / "tests" / "io" / "sas" / "data" + class SAS: + def time_read_sas7bdat(self): + read_sas(ROOT / "test1.sas7bdat") - params = 
["sas7bdat", "xport"] - param_names = ["format"] + def time_read_xpt(self): + read_sas(ROOT / "paxraw_d_short.xpt") - def setup(self, format): - # Read files that are located in 'pandas/tests/io/sas/data' - files = {"sas7bdat": "test1.sas7bdat", "xport": "paxraw_d_short.xpt"} - file = files[format] - paths = [ - os.path.dirname(__file__), - "..", - "..", - "..", - "pandas", - "tests", - "io", - "sas", - "data", - file, - ] - self.f = os.path.join(*paths) + def time_read_sas7bdat_2(self): + next(read_sas(ROOT / "0x00controlbyte.sas7bdat.bz2", chunksize=11000)) - def time_read_sas(self, format): - read_sas(self.f, format=format) + def time_read_sas7bdat_2_chunked(self): + for i, _ in enumerate( + read_sas(ROOT / "0x00controlbyte.sas7bdat.bz2", chunksize=1000) + ): + if i == 10: + break diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index fb8b7dafa0ade..e87cc4aaa80c7 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -5,16 +5,14 @@ from pandas import ( DataFrame, + Index, date_range, read_sql_query, read_sql_table, ) -from ..pandas_vb_common import tm - class SQL: - params = ["sqlalchemy", "sqlite"] param_names = ["connection"] @@ -36,9 +34,9 @@ def setup(self, connection): "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) - self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) @@ -52,7 +50,6 @@ def time_read_sql_query(self, connection): class WriteSQLDtypes: - params = ( ["sqlalchemy", "sqlite"], [ @@ -86,9 +83,9 @@ def setup(self, connection, dtype): "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) - self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) @@ -115,9 +112,9 @@ def setup(self): "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) - self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) @@ -136,7 +133,6 @@ def time_read_sql_table_parse_dates(self): class ReadSQLTableDtypes: - params = [ "float", "float_with_nan", @@ -162,9 +158,9 @@ def setup(self, dtype): "int": np.random.randint(0, N, size=N), "datetime": date_range("2000-01-01", periods=N, freq="s"), }, - index=tm.makeStringIndex(N), + index=Index([f"i-{i}" for i in range(N)], dtype=object), ) - self.df.loc[1000:3000, "float_with_nan"] = np.nan + self.df.iloc[1000:3000, 1] = np.nan self.df["date"] = self.df["datetime"].dt.date self.df["time"] = self.df["datetime"].dt.time self.df["datetime_string"] = self.df["datetime"].astype(str) diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index 4ae2745af8bff..ff33ededdfed9 100644 --- a/asv_bench/benchmarks/io/stata.py +++ 
b/asv_bench/benchmarks/io/stata.py @@ -2,18 +2,15 @@ from pandas import ( DataFrame, + Index, date_range, read_stata, ) -from ..pandas_vb_common import ( - BaseIO, - tm, -) +from ..pandas_vb_common import BaseIO class Stata(BaseIO): - params = ["tc", "td", "tm", "tw", "th", "tq", "ty"] param_names = ["convert_dates"] @@ -24,9 +21,9 @@ def setup(self, convert_dates): self.df = DataFrame( np.random.randn(N, C), columns=[f"float{i}" for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), + index=date_range("20000101", periods=N, freq="h"), ) - self.df["object"] = tm.makeStringIndex(self.N) + self.df["object"] = Index([f"i-{i}" for i in range(self.N)], dtype=object) self.df["int8_"] = np.random.randint( np.iinfo(np.int8).min, np.iinfo(np.int8).max - 27, N ) @@ -38,13 +35,13 @@ def setup(self, convert_dates): ) self.df["float32_"] = np.array(np.random.randn(N), dtype=np.float32) self.convert_dates = {"index": convert_dates} - self.df.to_stata(self.fname, self.convert_dates) + self.df.to_stata(self.fname, convert_dates=self.convert_dates) def time_read_stata(self, convert_dates): read_stata(self.fname) def time_write_stata(self, convert_dates): - self.df.to_stata(self.fname, self.convert_dates) + self.df.to_stata(self.fname, convert_dates=self.convert_dates) class StataMissing(Stata): @@ -54,7 +51,7 @@ def setup(self, convert_dates): missing_data = np.random.randn(self.N) missing_data[missing_data < 0] = np.nan self.df[f"missing_{i}"] = missing_data - self.df.to_stata(self.fname, self.convert_dates) + self.df.to_stata(self.fname, convert_dates=self.convert_dates) from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py index f0902c9c2c328..0486cabb29845 100644 --- a/asv_bench/benchmarks/io/style.py +++ b/asv_bench/benchmarks/io/style.py @@ -7,15 +7,14 @@ class Render: - params = [[12, 24, 36], [12, 120]] param_names = ["cols", "rows"] def setup(self, cols, rows): self.df = DataFrame( np.random.randn(rows, cols), - columns=[f"float_{i+1}" for i in range(cols)], - index=[f"row_{i+1}" for i in range(rows)], + columns=[f"float_{i + 1}" for i in range(cols)], + index=[f"row_{i + 1}" for i in range(rows)], ) def time_apply_render(self, cols, rows): @@ -67,7 +66,7 @@ def _apply_func(s): self.st = self.df.style.apply(_apply_func, axis=1) def _style_classes(self): - classes = self.df.applymap(lambda v: ("cls-1" if v > 0 else "")) + classes = self.df.map(lambda v: ("cls-1" if v > 0 else "")) classes.index, classes.columns = self.df.index, self.df.columns self.st = self.df.style.set_td_classes(classes) @@ -77,17 +76,18 @@ def _style_format(self): # apply a formatting function # subset is flexible but hinders vectorised solutions self.st = self.df.style.format( - "{:,.3f}", subset=IndexSlice["row_1":f"row_{ir}", "float_1":f"float_{ic}"] + "{:,.3f}", + subset=IndexSlice["row_1" : f"row_{ir}", "float_1" : f"float_{ic}"], ) def _style_apply_format_hide(self): - self.st = self.df.style.applymap(lambda v: "color: red;") + self.st = self.df.style.map(lambda v: "color: red;") self.st.format("{:.3f}") - self.st.hide_index(self.st.index[1:]) - self.st.hide_columns(self.st.columns[1:]) + self.st.hide(self.st.index[1:], axis=0) + self.st.hide(self.st.columns[1:], axis=1) def _style_tooltips(self): ttips = DataFrame("abc", index=self.df.index[::2], columns=self.df.columns[::2]) self.st = self.df.style.set_tooltips(ttips) - self.st.hide_index(self.st.index[12:]) - self.st.hide_columns(self.st.columns[12:]) + 
self.st.hide(self.st.index[12:], axis=0) + self.st.hide(self.st.columns[12:], axis=1) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index e3c6bf9bd4e07..a6c6990892d38 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -4,50 +4,29 @@ from pandas import ( DataFrame, + Index, MultiIndex, Series, + array, concat, date_range, merge, merge_asof, ) -from .pandas_vb_common import tm - try: from pandas import merge_ordered except ImportError: from pandas import ordered_merge as merge_ordered -class Append: - def setup(self): - self.df1 = DataFrame(np.random.randn(10000, 4), columns=["A", "B", "C", "D"]) - self.df2 = self.df1.copy() - self.df2.index = np.arange(10000, 20000) - self.mdf1 = self.df1.copy() - self.mdf1["obj1"] = "bar" - self.mdf1["obj2"] = "bar" - self.mdf1["int1"] = 5 - self.mdf1 = self.mdf1._consolidate() - self.mdf2 = self.mdf1.copy() - self.mdf2.index = self.df2.index - - def time_append_homogenous(self): - self.df1.append(self.df2) - - def time_append_mixed(self): - self.mdf1.append(self.mdf2) - - class Concat: - params = [0, 1] param_names = ["axis"] def setup(self, axis): N = 1000 - s = Series(N, index=tm.makeStringIndex(N)) + s = Series(N, index=Index([f"i-{i}" for i in range(N)], dtype=object)) self.series = [s[i:-i] for i in range(1, 10)] * 50 self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000 df = DataFrame( @@ -74,7 +53,6 @@ def time_concat_mixed_ndims(self, axis): class ConcatDataFrames: - params = ([0, 1], [True, False]) param_names = ["axis", "ignore_index"] @@ -91,14 +69,59 @@ def time_f_ordered(self, axis, ignore_index): concat(self.frame_f, axis=axis, ignore_index=ignore_index) -class Join: +class ConcatIndexDtype: + params = ( + [ + "datetime64[ns]", + "int64", + "Int64", + "int64[pyarrow]", + "string[python]", + "string[pyarrow]", + ], + ["monotonic", "non_monotonic", "has_na"], + [0, 1], + [True, False], + ) + param_names = ["dtype", "structure", "axis", "sort"] + + def setup(self, dtype, structure, axis, sort): + N = 10_000 + if dtype == "datetime64[ns]": + vals = date_range("1970-01-01", periods=N) + elif dtype in ("int64", "Int64", "int64[pyarrow]"): + vals = np.arange(N, dtype=np.int64) + elif dtype in ("string[python]", "string[pyarrow]"): + vals = Index([f"i-{i}" for i in range(N)], dtype=object) + else: + raise NotImplementedError + + idx = Index(vals, dtype=dtype) + + if structure == "monotonic": + idx = idx.sort_values() + elif structure == "non_monotonic": + idx = idx[::-1] + elif structure == "has_na": + if not idx._can_hold_na: + raise NotImplementedError + idx = Index([None], dtype=dtype).append(idx) + else: + raise NotImplementedError + + self.series = [Series(i, idx[:-i]) for i in range(1, 6)] + + def time_concat_series(self, dtype, structure, axis, sort): + concat(self.series, axis=axis, sort=sort) + +class Join: params = [True, False] param_names = ["sort"] def setup(self, sort): - level1 = tm.makeStringIndex(10).values - level2 = tm.makeStringIndex(1000).values + level1 = Index([f"i-{i}" for i in range(10)], dtype=object).values + level2 = Index([f"i-{i}" for i in range(1000)], dtype=object).values codes1 = np.arange(10).repeat(1000) codes2 = np.tile(np.arange(1000), 10) index2 = MultiIndex(levels=[level1, level2], codes=[codes1, codes2]) @@ -146,18 +169,30 @@ def time_join_dataframes_cross(self, sort): class JoinIndex: def setup(self): - N = 50000 + N = 5000 self.left = DataFrame( - np.random.randint(1, N / 500, (N, 2)), columns=["jim", "joe"] + 
np.random.randint(1, N / 50, (N, 2)), columns=["jim", "joe"] ) self.right = DataFrame( - np.random.randint(1, N / 500, (N, 2)), columns=["jolie", "jolia"] + np.random.randint(1, N / 50, (N, 2)), columns=["jolie", "jolia"] ).set_index("jolie") def time_left_outer_join_index(self): self.left.join(self.right, on="jim") +class JoinMultiindexSubset: + def setup(self): + N = 100_000 + mi1 = MultiIndex.from_arrays([np.arange(N)] * 4, names=["a", "b", "c", "d"]) + mi2 = MultiIndex.from_arrays([np.arange(N)] * 2, names=["a", "b"]) + self.left = DataFrame({"col1": 1}, index=mi1) + self.right = DataFrame({"col2": 2}, index=mi2) + + def time_join_multiindex_subset(self): + self.left.join(self.right) + + class JoinEmpty: def setup(self): N = 100_000 @@ -175,8 +210,8 @@ class JoinNonUnique: # outer join of non-unique # GH 6329 def setup(self): - date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="T") - daily_dates = date_index.to_period("D").to_timestamp("S", "S") + date_index = date_range("01-Jan-2013", "23-Jan-2013", freq="min") + daily_dates = date_index.to_period("D").to_timestamp("s", "s") self.fracofday = date_index.values - daily_dates.values self.fracofday = self.fracofday.astype("timedelta64[ns]") self.fracofday = self.fracofday.astype(np.float64) / 86_400_000_000_000 @@ -189,14 +224,13 @@ def time_join_non_unique_equal(self): class Merge: - params = [True, False] param_names = ["sort"] def setup(self, sort): N = 10000 - indices = tm.makeStringIndex(N).values - indices2 = tm.makeStringIndex(N).values + indices = Index([f"i-{i}" for i in range(N)], dtype=object).values + indices2 = Index([f"i-{i}" for i in range(N)], dtype=object).values key = np.tile(indices[:8000], 10) key2 = np.tile(indices2[:8000], 10) self.left = DataFrame( @@ -239,8 +273,44 @@ def time_merge_dataframes_cross(self, sort): merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort) -class I8Merge: +class MergeEA: + params = [ + [ + "Int64", + "Int32", + "Int16", + "UInt64", + "UInt32", + "UInt16", + "Float64", + "Float32", + ], + [True, False], + ] + param_names = ["dtype", "monotonic"] + + def setup(self, dtype, monotonic): + N = 10_000 + indices = np.arange(1, N) + key = np.tile(indices[:8000], 10) + self.left = DataFrame( + {"key": Series(key, dtype=dtype), "value": np.random.randn(80000)} + ) + self.right = DataFrame( + { + "key": Series(indices[2000:], dtype=dtype), + "value2": np.random.randn(7999), + } + ) + if monotonic: + self.left = self.left.sort_values("key") + self.right = self.right.sort_values("key") + + def time_merge(self, dtype, monotonic): + merge(self.left, self.right) + +class I8Merge: params = ["inner", "outer", "left", "right"] param_names = ["how"] @@ -258,18 +328,71 @@ def time_i8merge(self, how): merge(self.left, self.right, how=how) +class UniqueMerge: + params = [4_000_000, 1_000_000] + param_names = ["unique_elements"] + + def setup(self, unique_elements): + N = 1_000_000 + self.left = DataFrame({"a": np.random.randint(1, unique_elements, (N,))}) + self.right = DataFrame({"a": np.random.randint(1, unique_elements, (N,))}) + uniques = self.right.a.drop_duplicates() + self.right["a"] = concat( + [uniques, Series(np.arange(0, -(N - len(uniques)), -1))], ignore_index=True + ) + + def time_unique_merge(self, unique_elements): + merge(self.left, self.right, how="inner") + + +class MergeDatetime: + params = [ + [ + ("ns", "ns"), + ("ms", "ms"), + ("ns", "ms"), + ], + [None, "Europe/Brussels"], + [True, False], + ] + param_names = ["units", "tz", "monotonic"] + + def setup(self, 
units, tz, monotonic): + unit_left, unit_right = units + N = 10_000 + keys = Series(date_range("2012-01-01", freq="min", periods=N, tz=tz)) + self.left = DataFrame( + { + "key": keys.sample(N * 10, replace=True).dt.as_unit(unit_left), + "value1": np.random.randn(N * 10), + } + ) + self.right = DataFrame( + { + "key": keys[:8000].dt.as_unit(unit_right), + "value2": np.random.randn(8000), + } + ) + if monotonic: + self.left = self.left.sort_values("key") + self.right = self.right.sort_values("key") + + def time_merge(self, units, tz, monotonic): + merge(self.left, self.right) + + class MergeCategoricals: def setup(self): self.left_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Y": np.random.choice(["one", "two", "three"], size=(10000,)), } ) self.right_object = DataFrame( { - "X": np.random.choice(range(0, 10), size=(10000,)), + "X": np.random.choice(range(10), size=(10000,)), "Z": np.random.choice(["jjj", "kkk", "sss"], size=(10000,)), } ) @@ -302,7 +425,7 @@ def time_merge_on_cat_idx(self): class MergeOrdered: def setup(self): - groups = tm.makeStringIndex(10).values + groups = Index([f"i-{i}" for i in range(10)], dtype=object).values self.left = DataFrame( { "group": groups.repeat(5000), @@ -411,6 +534,42 @@ def time_multiby(self, direction, tolerance): ) +class MergeMultiIndex: + params = [ + [ + ("int64", "int64"), + ("datetime64[ns]", "int64"), + ("Int64", "Int64"), + ], + ["left", "right", "inner", "outer"], + ] + param_names = ["dtypes", "how"] + + def setup(self, dtypes, how): + n = 100_000 + offset = 50_000 + mi1 = MultiIndex.from_arrays( + [ + array(np.arange(n), dtype=dtypes[0]), + array(np.arange(n), dtype=dtypes[1]), + ] + ) + mi2 = MultiIndex.from_arrays( + [ + array(np.arange(offset, n + offset), dtype=dtypes[0]), + array(np.arange(offset, n + offset), dtype=dtypes[1]), + ] + ) + self.df1 = DataFrame({"col1": 1}, index=mi1) + self.df2 = DataFrame({"col2": 2}, index=mi2) + + def time_merge_sorted_multiindex(self, dtypes, how): + # copy to avoid MultiIndex._values caching + df1 = self.df1.copy() + df2 = self.df2.copy() + merge(df1, df2, how=how, left_index=True, right_index=True) + + class Align: def setup(self): size = 5 * 10**5 diff --git a/asv_bench/benchmarks/libs.py b/asv_bench/benchmarks/libs.py index f041499c9c622..7da2d27d98dbb 100644 --- a/asv_bench/benchmarks/libs.py +++ b/asv_bench/benchmarks/libs.py @@ -5,6 +5,7 @@ If a PR does not edit anything in _libs/, then it is unlikely that the benchmarks will be affected. 
""" + import numpy as np from pandas._libs.lib import ( @@ -15,13 +16,11 @@ from pandas import ( NA, + Index, NaT, ) -from .pandas_vb_common import ( - lib, - tm, -) +from .pandas_vb_common import lib try: from pandas.util import cache_readonly @@ -61,8 +60,8 @@ class FastZip: def setup(self): N = 10000 K = 10 - key1 = tm.makeStringIndex(N).values.repeat(K) - key2 = tm.makeStringIndex(N).values.repeat(K) + key1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + key2 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) col_array = np.vstack([key1, key2, np.random.randn(N * K)]) col_array2 = col_array.copy() col_array2[:, :10000] = np.nan diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index a498c6b2e4944..0a588c9a2e22e 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -3,14 +3,16 @@ import numpy as np from pandas import ( + NA, DataFrame, + Index, MultiIndex, RangeIndex, + Series, + array, date_range, ) -from .pandas_vb_common import tm - class GetLoc: def setup(self): @@ -141,7 +143,11 @@ def time_is_monotonic(self): class Duplicated: def setup(self): n, k = 200, 5000 - levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)] + levels = [ + np.arange(n), + Index([f"i-{i}" for i in range(n)], dtype=object).values, + 1000 + np.arange(n), + ] codes = [np.random.choice(n, (k * n)) for lev in levels] self.mi = MultiIndex(levels=levels, codes=codes) @@ -174,9 +180,21 @@ def time_sortlevel_one(self): self.mi.sortlevel(1) +class SortValues: + params = ["int64", "Int64"] + param_names = ["dtype"] + + def setup(self, dtype): + a = array(np.tile(np.arange(100), 1000), dtype=dtype) + b = array(np.tile(np.arange(1000), 100), dtype=dtype) + self.mi = MultiIndex.from_arrays([a, b]) + + def time_sort_values(self, dtype): + self.mi.sort_values() + + class Values: def setup_cache(self): - level1 = range(1000) level2 = date_range(start="1/1/2012", periods=100) mi = MultiIndex.from_product([level1, level2]) @@ -191,7 +209,6 @@ def time_datetime_level_values_sliced(self, mi): class CategoricalLevel: def setup(self): - self.df = DataFrame( { "a": np.arange(1_000_000, dtype=np.int32), @@ -206,26 +223,32 @@ def time_categorical_level(self): class Equals: def setup(self): - idx_large_fast = RangeIndex(100000) - idx_small_slow = date_range(start="1/1/2012", periods=1) - self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) - + self.mi = MultiIndex.from_product( + [ + date_range("2000-01-01", periods=1000), + RangeIndex(1000), + ] + ) + self.mi_deepcopy = self.mi.copy(deep=True) self.idx_non_object = RangeIndex(1) + def time_equals_deepcopy(self): + self.mi.equals(self.mi_deepcopy) + def time_equals_non_object_index(self): - self.mi_large_slow.equals(self.idx_non_object) + self.mi.equals(self.idx_non_object) class SetOperations: - params = [ ("monotonic", "non_monotonic"), - ("datetime", "int", "string"), + ("datetime", "int", "string", "ea_int"), ("intersection", "union", "symmetric_difference"), + (False, None), ] - param_names = ["index_structure", "dtype", "method"] + param_names = ["index_structure", "dtype", "method", "sort"] - def setup(self, index_structure, dtype, method): + def setup(self, index_structure, dtype, method, sort): N = 10**5 level1 = range(1000) @@ -235,13 +258,17 @@ def setup(self, index_structure, dtype, method): level2 = range(N // 1000) int_left = MultiIndex.from_product([level1, level2]) - level2 = 
tm.makeStringIndex(N // 1000).values + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values str_left = MultiIndex.from_product([level1, level2]) + level2 = range(N // 1000) + ea_int_left = MultiIndex.from_product([level1, Series(level2, dtype="Int64")]) + data = { "datetime": dates_left, "int": int_left, "string": str_left, + "ea_int": ea_int_left, } if index_structure == "non_monotonic": @@ -251,8 +278,157 @@ def setup(self, index_structure, dtype, method): self.left = data[dtype]["left"] self.right = data[dtype]["right"] - def time_operation(self, index_structure, dtype, method): - getattr(self.left, method)(self.right) + def time_operation(self, index_structure, dtype, method, sort): + getattr(self.left, method)(self.right, sort=sort) + + +class Difference: + params = [ + ("datetime", "int", "string", "ea_int"), + ] + param_names = ["dtype"] + + def setup(self, dtype): + N = 10**4 * 2 + level1 = range(1000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + dates_left = MultiIndex.from_product([level1, level2]) + + level2 = range(N // 1000) + int_left = MultiIndex.from_product([level1, level2]) + + level2 = Series(range(N // 1000), dtype="Int64") + level2[0] = NA + ea_int_left = MultiIndex.from_product([level1, level2]) + + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values + str_left = MultiIndex.from_product([level1, level2]) + + data = { + "datetime": dates_left, + "int": int_left, + "ea_int": ea_int_left, + "string": str_left, + } + + data = {k: {"left": mi, "right": mi[:5]} for k, mi in data.items()} + self.left = data[dtype]["left"] + self.right = data[dtype]["right"] + + def time_difference(self, dtype): + self.left.difference(self.right) + + +class Unique: + params = [ + (("Int64", NA), ("int64", 0)), + ] + param_names = ["dtype_val"] + + def setup(self, dtype_val): + level = Series( + [1, 2, dtype_val[1], dtype_val[1]] + list(range(1_000_000)), + dtype=dtype_val[0], + ) + self.midx = MultiIndex.from_arrays([level, level]) + + level_dups = Series( + [1, 2, dtype_val[1], dtype_val[1]] + list(range(500_000)) * 2, + dtype=dtype_val[0], + ) + + self.midx_dups = MultiIndex.from_arrays([level_dups, level_dups]) + + def time_unique(self, dtype_val): + self.midx.unique() + + def time_unique_dups(self, dtype_val): + self.midx_dups.unique() + + +class Isin: + params = [ + ("string", "int", "datetime"), + ] + param_names = ["dtype"] + + def setup(self, dtype): + N = 10**5 + level1 = range(1000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + dates_midx = MultiIndex.from_product([level1, level2]) + + level2 = range(N // 1000) + int_midx = MultiIndex.from_product([level1, level2]) + + level2 = Index([f"i-{i}" for i in range(N // 1000)], dtype=object).values + str_midx = MultiIndex.from_product([level1, level2]) + + data = { + "datetime": dates_midx, + "int": int_midx, + "string": str_midx, + } + + self.midx = data[dtype] + self.values_small = self.midx[:100] + self.values_large = self.midx[100:] + + def time_isin_small(self, dtype): + self.midx.isin(self.values_small) + + def time_isin_large(self, dtype): + self.midx.isin(self.values_large) + + +class Putmask: + def setup(self): + N = 10**5 + level1 = range(1_000) + + level2 = date_range(start="1/1/2000", periods=N // 1000) + self.midx = MultiIndex.from_product([level1, level2]) + + level1 = range(1_000, 2_000) + self.midx_values = MultiIndex.from_product([level1, level2]) + + level2 = date_range(start="1/1/2010", periods=N // 1000) + self.midx_values_different = 
MultiIndex.from_product([level1, level2]) + self.mask = np.array([True, False] * (N // 2)) + + def time_putmask(self): + self.midx.putmask(self.mask, self.midx_values) + + def time_putmask_all_different(self): + self.midx.putmask(self.mask, self.midx_values_different) + + +class Append: + params = ["datetime64[ns]", "int64", "string"] + param_names = ["dtype"] + + def setup(self, dtype): + N1 = 1000 + N2 = 500 + left_level1 = range(N1) + right_level1 = range(N1, N1 + N1) + + if dtype == "datetime64[ns]": + level2 = date_range(start="2000-01-01", periods=N2) + elif dtype == "int64": + level2 = range(N2) + elif dtype == "string": + level2 = Index([f"i-{i}" for i in range(N2)], dtype=object) + else: + raise NotImplementedError + + self.left = MultiIndex.from_product([left_level1, level2]) + self.right = MultiIndex.from_product([right_level1, level2]) + + def time_append(self, dtype): + self.left.append(self.right) from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py index 34fe4929a752b..f8b51a523dab8 100644 --- a/asv_bench/benchmarks/package.py +++ b/asv_bench/benchmarks/package.py @@ -1,6 +1,7 @@ """ Benchmarks for pandas at the package-level. """ + import subprocess import sys @@ -11,7 +12,7 @@ def time_import(self): # measurement of the import time we actually care about, # without the subprocess or interpreter overhead cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] - p = subprocess.run(cmd, stderr=subprocess.PIPE) + p = subprocess.run(cmd, stderr=subprocess.PIPE, check=True) line = p.stderr.splitlines()[-1] field = line.split(b"|")[-2].strip() diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index d3168bde0a783..4bd56ccb1b5ce 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -17,7 +17,7 @@ try: import pandas._testing as tm except ImportError: - import pandas.util.testing as tm # noqa:F401 + import pandas.util.testing as tm # noqa: F401 numeric_dtypes = [ @@ -70,7 +70,7 @@ class BaseIO: def remove(self, f): """Remove created files""" try: - os.remove(f) # noqa: PDF008 + os.remove(f) except OSError: # On Windows, attempting to remove a file that is in use # causes an exception to be raised diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 4f81aee62c202..3b8b60e790380 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -2,6 +2,7 @@ Period benchmarks with non-tslibs dependencies. See benchmarks.tslibs.period for benchmarks that rely only on tslibs. 
""" + from pandas import ( DataFrame, Period, @@ -15,7 +16,6 @@ class PeriodIndexConstructor: - params = [["D"], [True, False]] param_names = ["freq", "is_offset"] @@ -46,7 +46,7 @@ def time_from_ints_daily(self, freq, is_offset): class DataFramePeriodColumn: def setup(self): - self.rng = period_range(start="1/1/1990", freq="S", periods=20000) + self.rng = period_range(start="1/1/1990", freq="s", periods=20000) self.df = DataFrame(index=range(len(self.rng))) def time_setitem_period_column(self): @@ -59,7 +59,6 @@ def time_set_index(self): class Algorithms: - params = ["index", "series"] param_names = ["typ"] diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 29d2831be1522..3d22bfce7e2b2 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -9,8 +9,6 @@ period_range, ) -from .pandas_vb_common import tm - class Reindex: def setup(self): @@ -23,8 +21,8 @@ def setup(self): ) N = 5000 K = 200 - level1 = tm.makeStringIndex(N).values.repeat(K) - level2 = np.tile(tm.makeStringIndex(K).values, N) + level1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + level2 = np.tile(Index([f"i-{i}" for i in range(K)], dtype=object).values, N) index = MultiIndex.from_arrays([level1, level2]) self.s = Series(np.random.randn(N * K), index=index) self.s_subset = self.s[::2] @@ -54,7 +52,6 @@ def time_reindex_multiindex_no_cache_dates(self): class ReindexMethod: - params = [["pad", "backfill"], [date_range, period_range]] param_names = ["method", "constructor"] @@ -67,25 +64,6 @@ def time_reindex_method(self, method, constructor): self.ts.reindex(self.idx, method=method) -class Fillna: - - params = ["pad", "backfill"] - param_names = ["method"] - - def setup(self, method): - N = 100000 - self.idx = date_range("1/1/2000", periods=N, freq="1min") - ts = Series(np.random.randn(N), index=self.idx)[::2] - self.ts_reindexed = ts.reindex(self.idx) - self.ts_float32 = self.ts_reindexed.astype("float32") - - def time_reindexed(self, method): - self.ts_reindexed.fillna(method=method) - - def time_float_32(self, method): - self.ts_float32.fillna(method=method) - - class LevelAlign: def setup(self): self.index = MultiIndex( @@ -107,15 +85,14 @@ def time_reindex_level(self): class DropDuplicates: - params = [True, False] param_names = ["inplace"] def setup(self, inplace): N = 10000 K = 10 - key1 = tm.makeStringIndex(N).values.repeat(K) - key2 = tm.makeStringIndex(N).values.repeat(K) + key1 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) + key2 = Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(K) self.df = DataFrame( {"key1": key1, "key2": key2, "value": np.random.randn(N * K)} ) @@ -123,7 +100,9 @@ def setup(self, inplace): self.df_nan.iloc[:10000, :] = np.nan self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10)) + self.s_str = Series( + np.tile(Index([f"i-{i}" for i in range(1000)], dtype=object).values, 10) + ) N = 1000000 K = 10000 @@ -154,7 +133,7 @@ class Align: # blog "pandas escaped the zoo" def setup(self): n = 50000 - indices = tm.makeStringIndex(n) + indices = Index([f"i-{i}" for i in range(n)], dtype=object) subsample_size = 40000 self.x = Series(np.random.randn(n), indices) self.y = Series( diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 8d4fc0240f2cc..a9276b7dc32ce 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -4,7 +4,6 @@ class FillNa: - params = 
[True, False] param_names = ["inplace"] @@ -23,7 +22,6 @@ def time_replace(self, inplace): class ReplaceDict: - params = [True, False] param_names = ["inplace"] @@ -44,7 +42,7 @@ class ReplaceList: param_names = ["inplace"] def setup(self, inplace): - self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10**7)) + self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(10**7)) def time_replace_list(self, inplace): self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace) @@ -55,7 +53,6 @@ def time_replace_list_one_match(self, inplace): class Convert: - params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"]) param_names = ["constructor", "replace_data"] diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 89c627865049e..54326a4433756 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -15,12 +15,17 @@ class Melt: - def setup(self): - self.df = DataFrame(np.random.randn(10000, 3), columns=["A", "B", "C"]) - self.df["id1"] = np.random.randint(0, 10, 10000) - self.df["id2"] = np.random.randint(100, 1000, 10000) + params = ["float64", "Float64"] + param_names = ["dtype"] - def time_melt_dataframe(self): + def setup(self, dtype): + self.df = DataFrame( + np.random.randn(100_000, 3), columns=["A", "B", "C"], dtype=dtype + ) + self.df["id1"] = pd.Series(np.random.randint(0, 10, 10000)) + self.df["id2"] = pd.Series(np.random.randint(100, 1000, 10000)) + + def time_melt_dataframe(self, dtype): melt(self.df, id_vars=["id1", "id2"]) @@ -36,7 +41,7 @@ def setup(self): self.df = DataFrame(data) def time_reshape_pivot_time_series(self): - self.df.pivot("date", "variable", "value") + self.df.pivot(index="date", columns="variable", values="value") class SimpleReshape: @@ -54,7 +59,6 @@ def time_unstack(self): class ReshapeExtensionDtype: - params = ["datetime64[ns, US/Pacific]", "Period[s]"] param_names = ["dtype"] @@ -89,8 +93,26 @@ def time_transpose(self, dtype): self.df.T -class Unstack: +class ReshapeMaskedArrayDtype(ReshapeExtensionDtype): + params = ["Int64", "Float64"] + param_names = ["dtype"] + def setup(self, dtype): + lev = pd.Index(list("ABCDEFGHIJ")) + ri = pd.Index(range(1000)) + mi = MultiIndex.from_product([lev, ri], names=["foo", "bar"]) + + values = np.random.randn(10_000).astype(int) + + ser = pd.Series(values, dtype=dtype, index=mi) + df = ser.unstack("bar") + # roundtrips -> df.stack().equals(ser) + + self.ser = ser + self.df = df + + +class Unstack: params = ["int", "category"] def setup(self, dtype): @@ -112,9 +134,7 @@ def setup(self, dtype): values = np.take(list(string.ascii_letters), indices) values = [pd.Categorical(v) for v in values.T] - self.df = DataFrame( - {i: cat for i, cat in enumerate(values)}, index, columns - ) + self.df = DataFrame(dict(enumerate(values)), index, columns) self.df2 = self.df.iloc[:-1] @@ -198,7 +218,7 @@ def time_pivot_table_margins(self): def time_pivot_table_categorical(self): self.df2.pivot_table( - index="col1", values="col3", columns="col2", aggfunc=np.sum, fill_value=0 + index="col1", values="col3", columns="col2", aggfunc="sum", fill_value=0 ) def time_pivot_table_categorical_observed(self): @@ -206,13 +226,13 @@ def time_pivot_table_categorical_observed(self): index="col1", values="col3", columns="col2", - aggfunc=np.sum, + aggfunc="sum", fill_value=0, observed=True, ) def time_pivot_table_margins_only_column(self): - self.df.pivot_table(columns=["key2", "key3"], margins=True) + self.df.pivot_table(columns=["key1", "key2", "key3"], margins=True) class Crosstab: @@ 
-268,9 +288,7 @@ def setup(self, bins): self.datetime_series = pd.Series( np.random.randint(N, size=N), dtype="datetime64[ns]" ) - self.interval_bins = pd.IntervalIndex.from_breaks( - np.linspace(0, N, bins), "right" - ) + self.interval_bins = pd.IntervalIndex.from_breaks(np.linspace(0, N, bins)) def time_cut_int(self, bins): pd.cut(self.int_series, bins) @@ -310,7 +328,6 @@ class Explode: params = [[100, 1000, 10000], [3, 5, 10]] def setup(self, n_rows, max_list_length): - data = [np.arange(np.random.randint(max_list_length)) for _ in range(n_rows)] self.series = pd.Series(data) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index d65a1a39e8bc7..f9a5f38c2e349 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -6,12 +6,23 @@ class Methods: - params = ( ["DataFrame", "Series"], [("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})], ["int", "float"], - ["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"], + [ + "median", + "mean", + "max", + "min", + "std", + "count", + "skew", + "kurt", + "sum", + "sem", + "nunique", + ], ) param_names = ["constructor", "window_kwargs", "dtype", "method"] @@ -129,7 +140,6 @@ def test_method(self, constructor, dtype, window_kwargs, function, parallel, col class EWMMethods: - params = ( ["DataFrame", "Series"], [ @@ -177,7 +187,6 @@ def setup(self, constructor, window, dtype, method): class Pairwise: - params = ( [({"window": 10}, "rolling"), ({"window": 1000}, "rolling"), ({}, "expanding")], ["corr", "cov"], @@ -251,7 +260,6 @@ def time_rank(self, constructor, window, dtype, percentile, ascending, method): class PeakMemFixedWindowMinMax: - params = ["min", "max"] def setup(self, operation): @@ -287,12 +295,11 @@ def peakmem_rolling(self, constructor, window_size, dtype, method): class Groupby: - params = ( ["sum", "median", "mean", "max", "min", "kurt", "sum"], [ ("rolling", {"window": 2}), - ("rolling", {"window": "30s", "on": "C"}), + ("rolling", {"window": "30s"}), ("expanding", {}), ], ) @@ -304,9 +311,10 @@ def setup(self, method, window_kwargs): { "A": [str(i) for i in range(N)] * 10, "B": list(range(N)) * 10, - "C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10), } ) + if isinstance(kwargs.get("window", None), str): + df.index = pd.date_range(start="1900-01-01", freq="1min", periods=N * 10) self.groupby_window = getattr(df.groupby("A"), window)(**kwargs) def time_method(self, method, window_kwargs): @@ -328,7 +336,6 @@ def time_rolling_multiindex_creation(self): class GroupbyEWM: - params = ["var", "std", "cov", "corr"] param_names = ["method"] @@ -341,7 +348,6 @@ def time_groupby_method(self, method): class GroupbyEWMEngine: - params = ["cython", "numba"] param_names = ["engine"] @@ -358,7 +364,6 @@ def table_method_func(x): class TableMethod: - params = ["single", "table"] param_names = ["method"] diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 1d3bd4a357d24..85d34cac5a7bf 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -3,14 +3,13 @@ import numpy as np from pandas import ( + NA, Index, NaT, Series, date_range, ) -from .pandas_vb_common import tm - class SeriesConstructor: def setup(self): @@ -27,9 +26,6 @@ def time_constructor_dict(self): def time_constructor_no_data(self): Series(data=None, index=self.idx) - def time_constructor_fastpath(self): - Series(self.array, index=self.idx2, name="name", fastpath=True) - 
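For orientation, every class touched in this patch follows the same asv harness convention: `setup` runs once per parameter combination, asv times each `time_*` method, tracks peak resident memory for `peakmem_*` methods, and a `NotImplementedError` raised in `setup` skips that combination (several classes above rely on this to skip unsupported dtypes). A minimal sketch of the pattern — the class name and parameters here are invented for illustration and are not part of the patch:

import numpy as np
from pandas import Series

class ExampleBenchmark:
    # asv benchmarks the cartesian product of these parameter lists,
    # passing one value from each list to setup() and to every method
    params = [["int64", "Int64"], [10**3, 10**5]]
    param_names = ["dtype", "n"]

    def setup(self, dtype, n):
        # called before each timed method for a given (dtype, n) combination
        self.ser = Series(np.arange(n), dtype=dtype)

    def time_sum(self, dtype, n):
        # wall-clock timing target
        self.ser.sum()

    def peakmem_sum(self, dtype, n):
        # peak-memory target for the same operation
        self.ser.sum()

A class like this can then be compared across commits from the asv_bench directory with, e.g., `asv continuous -f 1.1 upstream/main HEAD -b ^series_methods`, where `-b` takes a regex over benchmark names.
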
class ToFrame: params = [["int64", "datetime64[ns]", "category", "Int64"], [None, "foo"]] @@ -45,7 +41,6 @@ def time_to_frame(self, dtype, name): class NSort: - params = ["first", "last", "all"] param_names = ["keep"] @@ -60,7 +55,6 @@ def time_nsmallest(self, keep): class Dropna: - params = ["int", "datetime"] param_names = ["dtype"] @@ -68,7 +62,7 @@ def setup(self, dtype): N = 10**6 data = { "int": np.random.randint(1, 10, N), - "datetime": date_range("2000-01-01", freq="S", periods=N), + "datetime": date_range("2000-01-01", freq="s", periods=N), } self.s = Series(data[dtype]) if dtype == "datetime": @@ -78,8 +72,54 @@ def time_dropna(self, dtype): self.s.dropna() -class SearchSorted: +class Fillna: + params = [ + [ + "datetime64[ns]", + "float32", + "float64", + "Float64", + "Int64", + "int64[pyarrow]", + "string", + "string[pyarrow]", + ], + ] + param_names = ["dtype"] + + def setup(self, dtype): + N = 10**6 + if dtype == "datetime64[ns]": + data = date_range("2000-01-01", freq="s", periods=N) + na_value = NaT + elif dtype in ("float64", "Float64"): + data = np.random.randn(N) + na_value = np.nan + elif dtype in ("Int64", "int64[pyarrow]"): + data = np.arange(N) + na_value = NA + elif dtype in ("string", "string[pyarrow]"): + data = np.array([str(i) * 5 for i in range(N)], dtype=object) + na_value = NA + else: + raise NotImplementedError + fill_value = data[0] + ser = Series(data, dtype=dtype) + ser[::2] = na_value + self.ser = ser + self.fill_value = fill_value + + def time_fillna(self, dtype): + self.ser.fillna(value=self.fill_value) + + def time_ffill(self, dtype): + self.ser.ffill() + def time_bfill(self, dtype): + self.ser.bfill() + + +class SearchSorted: goal_time = 0.2 params = [ "int8", @@ -108,11 +148,14 @@ def time_searchsorted(self, dtype): class Map: - - params = (["dict", "Series", "lambda"], ["object", "category", "int"]) - param_names = "mapper" - - def setup(self, mapper, dtype): + params = ( + ["dict", "Series", "lambda"], + ["object", "category", "int"], + [None, "ignore"], + ) + param_names = ["mapper", "dtype", "na_action"] + + def setup(self, mapper, dtype, na_action): map_size = 1000 map_data = Series(map_size - np.arange(map_size), dtype=dtype) @@ -129,8 +172,8 @@ def setup(self, mapper, dtype): self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype) - def time_map(self, mapper, *args, **kwargs): - self.s.map(self.map_data) + def time_map(self, mapper, dtype, na_action): + self.s.map(self.map_data, na_action=na_action) class Clip: @@ -144,8 +187,17 @@ def time_clip(self, n): self.s.clip(0, 1) -class ValueCounts: +class ClipDt: + def setup(self): + dr = date_range("20220101", periods=100_000, freq="s", tz="UTC") + self.clipper_dt = dr[0:1_000].repeat(100) + self.s = Series(dr) + + def time_clip(self): + self.s.clip(upper=self.clipper_dt) + +class ValueCounts: params = [[10**3, 10**4, 10**5], ["int", "uint", "float", "object"]] param_names = ["N", "dtype"] @@ -156,8 +208,19 @@ def time_value_counts(self, N, dtype): self.s.value_counts() -class ValueCountsObjectDropNAFalse: +class ValueCountsEA: + params = [[10**3, 10**4, 10**5], [True, False]] + param_names = ["N", "dropna"] + + def setup(self, N, dropna): + self.s = Series(np.random.randint(0, N, size=10 * N), dtype="Int64") + self.s.loc[1] = NA + + def time_value_counts(self, N, dropna): + self.s.value_counts(dropna=dropna) + +class ValueCountsObjectDropNAFalse: params = [10**3, 10**4, 10**5] param_names = ["N"] @@ -169,7 +232,6 @@ def time_value_counts(self, N): class Mode: - params = [[10**3, 
10**4, 10**5], ["int", "uint", "float", "object"]] param_names = ["N", "dtype"] @@ -181,7 +243,6 @@ def time_mode(self, N, dtype): class ModeObjectDropNAFalse: - params = [10**3, 10**4, 10**5] param_names = ["N"] @@ -194,7 +255,7 @@ def time_mode(self, N): class Dir: def setup(self): - self.s = Series(index=tm.makeStringIndex(10000)) + self.s = Series(index=Index([f"i-{i}" for i in range(10000)], dtype=object)) def time_dir_strings(self): dir(self.s) @@ -210,7 +271,6 @@ def time_series_datetimeindex_repr(self): class All: - params = [[10**3, 10**6], ["fast", "slow"], ["bool", "boolean"]] param_names = ["N", "case", "dtype"] @@ -223,7 +283,6 @@ def time_all(self, N, case, dtype): class Any: - params = [[10**3, 10**6], ["fast", "slow"], ["bool", "boolean"]] param_names = ["N", "case", "dtype"] @@ -236,7 +295,6 @@ def time_any(self, N, case, dtype): class NanOps: - params = [ [ "var", @@ -261,7 +319,7 @@ def setup(self, func, N, dtype): if func == "argmax" and dtype in {"Int64", "boolean"}: # Skip argmax for nullable int since this doesn't work yet (GH-24382) raise NotImplementedError - self.s = Series([1] * N, dtype=dtype) + self.s = Series(np.ones(N), dtype=dtype) self.func = getattr(self.s, func) def time_func(self, func, N, dtype): @@ -269,7 +327,6 @@ def time_func(self, func, N, dtype): class Rank: - param_names = ["dtype"] params = [ ["int", "uint", "float", "object"], @@ -282,4 +339,87 @@ def time_rank(self, dtype): self.s.rank() +class Iter: + param_names = ["dtype"] + params = [ + "bool", + "boolean", + "int64", + "Int64", + "float64", + "Float64", + "datetime64[ns]", + ] + + def setup(self, dtype): + N = 10**5 + if dtype in ["bool", "boolean"]: + data = np.repeat([True, False], N // 2) + elif dtype in ["int64", "Int64"]: + data = np.arange(N) + elif dtype in ["float64", "Float64"]: + data = np.random.randn(N) + elif dtype == "datetime64[ns]": + data = date_range("2000-01-01", freq="s", periods=N) + else: + raise NotImplementedError + + self.s = Series(data, dtype=dtype) + + def time_iter(self, dtype): + for v in self.s: + pass + + +class ToNumpy: + def setup(self): + N = 1_000_000 + self.ser = Series( + np.random.randn( + N, + ) + ) + + def time_to_numpy(self): + self.ser.to_numpy() + + def time_to_numpy_double_copy(self): + self.ser.to_numpy(dtype="float64", copy=True) + + def time_to_numpy_copy(self): + self.ser.to_numpy(copy=True) + + def time_to_numpy_float_with_nan(self): + self.ser.to_numpy(dtype="float64", na_value=np.nan) + + +class Replace: + param_names = ["num_to_replace"] + params = [100, 1000] + + def setup(self, num_to_replace): + N = 1_000_000 + self.arr = np.random.randn(N) + self.arr1 = self.arr.copy() + np.random.shuffle(self.arr1) + self.ser = Series(self.arr) + + self.to_replace_list = np.random.choice(self.arr, num_to_replace) + self.values_list = np.random.choice(self.arr1, num_to_replace) + + self.replace_dict = dict(zip(self.to_replace_list, self.values_list)) + + def time_replace_dict(self, num_to_replace): + self.ser.replace(self.replace_dict) + + def peakmem_replace_dict(self, num_to_replace): + self.ser.replace(self.replace_dict) + + def time_replace_list(self, num_to_replace): + self.ser.replace(self.to_replace_list, self.values_list) + + def peakmem_replace_list(self, num_to_replace): + self.ser.replace(self.to_replace_list, self.values_list) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index d871f907232f5..22a5511e4c678 100644 --- 
a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -22,7 +22,7 @@ class SparseSeriesToFrame: def setup(self): K = 50 N = 50001 - rng = date_range("1/1/2000", periods=N, freq="T") + rng = date_range("1/1/2000", periods=N, freq="min") self.series = {} for i in range(1, K): data = np.random.randn(N)[:-i] @@ -35,7 +35,6 @@ def time_series_to_frame(self): class SparseArrayConstructor: - params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, object]) param_names = ["dense_proportion", "fill_value", "dtype"] @@ -106,7 +105,6 @@ def time_to_coo(self): class Arithmetic: - params = ([0.1, 0.01], [0, np.nan]) param_names = ["dense_proportion", "fill_value"] @@ -131,7 +129,6 @@ def time_divide(self, dense_proportion, fill_value): class ArithmeticBlock: - params = [np.nan, 0] param_names = ["fill_value"] @@ -167,7 +164,6 @@ def time_division(self, fill_value): class MinMax: - params = (["min", "max"], [0.0, np.nan]) param_names = ["func", "fill_value"] @@ -181,7 +177,6 @@ def time_min_max(self, func, fill_value): class Take: - params = ([np.array([0]), np.arange(100_000), np.full(100_000, -1)], [True, False]) param_names = ["indices", "allow_fill"] @@ -210,7 +205,6 @@ def time_slice(self): class GetItemMask: - params = [True, False, np.nan] param_names = ["fill_value"] @@ -219,12 +213,12 @@ def setup(self, fill_value): d = 1e-5 arr = make_array(N, d, np.nan, np.float64) self.sp_arr = SparseArray(arr) - b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool8) + b_arr = np.full(shape=N, fill_value=fill_value, dtype=np.bool_) fv_inds = np.unique( np.random.randint(low=0, high=N - 1, size=int(N * d), dtype=np.int32) ) b_arr[fv_inds] = True if pd.isna(fill_value) else not fill_value - self.sp_b_arr = SparseArray(b_arr, dtype=np.bool8, fill_value=fill_value) + self.sp_b_arr = SparseArray(b_arr, dtype=np.bool_, fill_value=fill_value) def time_mask(self, fill_value): self.sp_arr[self.sp_b_arr] diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 92a78b7c2f63d..8913293dfa20e 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -2,18 +2,14 @@ import pandas as pd -ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"] +ops = ["mean", "sum", "median", "std", "skew", "kurt", "prod", "sem", "var"] class FrameOps: - - params = [ops, ["float", "int", "Int64"], [0, 1]] + params = [ops, ["float", "int", "Int64"], [0, 1, None]] param_names = ["op", "dtype", "axis"] def setup(self, op, dtype, axis): - if op == "mad" and dtype == "Int64": - # GH-33036, GH#33600 - raise NotImplementedError values = np.random.randn(100000, 4) if dtype == "Int64": values = values.astype(int) @@ -24,12 +20,45 @@ def time_op(self, op, dtype, axis): self.df_func(axis=axis) -class FrameMultiIndexOps: +class FrameMixedDtypesOps: + params = [ops, [0, 1, None]] + param_names = ["op", "axis"] + + def setup(self, op, axis): + if op in ("sum", "skew", "kurt", "prod", "sem", "var") or ( + (op, axis) + in ( + ("mean", 1), + ("mean", None), + ("median", 1), + ("median", None), + ("std", 1), + ("std", None), + ) + ): + # Skipping cases where datetime aggregations are not implemented + raise NotImplementedError + + N = 1_000_000 + df = pd.DataFrame( + { + "f": np.random.normal(0.0, 1.0, N), + "i": np.random.randint(0, N, N), + "ts": pd.date_range(start="1/1/2000", periods=N, freq="h"), + } + ) + + self.df_func = getattr(df, op) + + def time_op(self, op, axis): + self.df_func(axis=axis) - params = ([0, 1, [0, 1]], ops) - 
param_names = ["level", "op"] - def setup(self, level, op): +class FrameMultiIndexOps: + params = [ops] + param_names = ["op"] + + def setup(self, op): levels = [np.arange(10), np.arange(100), np.arange(100)] codes = [ np.arange(10).repeat(10000), @@ -40,12 +69,11 @@ def setup(self, level, op): df = pd.DataFrame(np.random.randn(len(index), 4), index=index) self.df_func = getattr(df, op) - def time_op(self, level, op): - self.df_func(level=level) + def time_op(self, op): + self.df_func() class SeriesOps: - params = [ops, ["float", "int"]] param_names = ["op", "dtype"] @@ -58,11 +86,10 @@ def time_op(self, op, dtype): class SeriesMultiIndexOps: + params = [ops] + param_names = ["op"] - params = ([0, 1, [0, 1]], ops) - param_names = ["level", "op"] - - def setup(self, level, op): + def setup(self, op): levels = [np.arange(10), np.arange(100), np.arange(100)] codes = [ np.arange(10).repeat(10000), @@ -73,12 +100,11 @@ def setup(self, level, op): s = pd.Series(np.random.randn(len(index)), index=index) self.s_func = getattr(s, op) - def time_op(self, level, op): - self.s_func(level=level) + def time_op(self, op): + self.s_func() class Rank: - params = [["DataFrame", "Series"], [True, False]] param_names = ["constructor", "pct"] @@ -94,7 +120,6 @@ def time_average_old(self, constructor, pct): class Correlation: - params = [["spearman", "kendall", "pearson"]] param_names = ["method"] @@ -129,7 +154,6 @@ def time_corrwith_rows(self, method): class Covariance: - params = [] param_names = [] diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index ac1b7f65d2d90..5f2832e361eb5 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -7,58 +7,103 @@ class DatetimeStrftime: timeout = 1500 params = [1000, 10000] - param_names = ["obs"] + param_names = ["nobs"] - def setup(self, obs): + def setup(self, nobs): d = "2018-11-29" dt = "2018-11-26 11:18:27.0" self.data = pd.DataFrame( { - "dt": [np.datetime64(dt)] * obs, - "d": [np.datetime64(d)] * obs, - "r": [np.random.uniform()] * obs, + "dt": [np.datetime64(dt)] * nobs, + "d": [np.datetime64(d)] * nobs, + "r": [np.random.uniform()] * nobs, } ) - def time_frame_date_to_str(self, obs): + def time_frame_date_to_str(self, nobs): self.data["d"].astype(str) - def time_frame_date_formatting_default(self, obs): + def time_frame_date_formatting_default(self, nobs): + self.data["d"].dt.strftime(date_format=None) + + def time_frame_date_formatting_default_explicit(self, nobs): self.data["d"].dt.strftime(date_format="%Y-%m-%d") - def time_frame_date_formatting_custom(self, obs): + def time_frame_date_formatting_custom(self, nobs): self.data["d"].dt.strftime(date_format="%Y---%m---%d") - def time_frame_datetime_to_str(self, obs): + def time_frame_datetime_to_str(self, nobs): self.data["dt"].astype(str) - def time_frame_datetime_formatting_default_date_only(self, obs): + def time_frame_datetime_formatting_default(self, nobs): + self.data["dt"].dt.strftime(date_format=None) + + def time_frame_datetime_formatting_default_explicit_date_only(self, nobs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d") - def time_frame_datetime_formatting_default(self, obs): + def time_frame_datetime_formatting_default_explicit(self, nobs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") - def time_frame_datetime_formatting_default_with_float(self, obs): + def time_frame_datetime_formatting_default_with_float(self, nobs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") - def 
time_frame_datetime_formatting_custom(self, obs): + def time_frame_datetime_formatting_custom(self, nobs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") +class PeriodStrftime: + timeout = 1500 + params = ([1000, 10000], ["D", "h"]) + param_names = ["nobs", "freq"] + + def setup(self, nobs, freq): + self.data = pd.DataFrame( + { + "p": pd.period_range(start="2000-01-01", periods=nobs, freq=freq), + "r": [np.random.uniform()] * nobs, + } + ) + self.data["i"] = self.data["p"] + self.data.set_index("i", inplace=True) + if freq == "D": + self.default_fmt = "%Y-%m-%d" + elif freq == "h": + self.default_fmt = "%Y-%m-%d %H:00" + + def time_frame_period_to_str(self, nobs, freq): + self.data["p"].astype(str) + + def time_frame_period_formatting_default(self, nobs, freq): + self.data["p"].dt.strftime(date_format=None) + + def time_frame_period_formatting_default_explicit(self, nobs, freq): + self.data["p"].dt.strftime(date_format=self.default_fmt) + + def time_frame_period_formatting_custom(self, nobs, freq): + self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + + def time_frame_period_formatting_iso8601_strftime_Z(self, nobs, freq): + self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") + + def time_frame_period_formatting_iso8601_strftime_offset(self, nobs, freq): + """Not optimized yet as %z is not supported by `convert_strftime_format`""" + self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z") + + class BusinessHourStrftime: timeout = 1500 params = [1000, 10000] - param_names = ["obs"] + param_names = ["nobs"] - def setup(self, obs): + def setup(self, nobs): self.data = pd.DataFrame( { - "off": [offsets.BusinessHour()] * obs, + "off": [offsets.BusinessHour()] * nobs, } ) - def time_frame_offset_str(self, obs): + def time_frame_offset_str(self, nobs): self.data["off"].apply(str) - def time_frame_offset_repr(self, obs): + def time_frame_offset_repr(self, nobs): self.data["off"].apply(repr) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index eec722c9f167b..467fab857d306 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -6,12 +6,11 @@ NA, Categorical, DataFrame, + Index, Series, ) from pandas.arrays import StringArray -from .pandas_vb_common import tm - class Dtypes: params = ["str", "string[python]", "string[pyarrow]"] @@ -19,47 +18,40 @@ class Dtypes: def setup(self, dtype): try: - self.s = Series(tm.makeStringIndex(10**5), dtype=dtype) - except ImportError: - raise NotImplementedError + self.s = Series( + Index([f"i-{i}" for i in range(10000)], dtype=object)._values, + dtype=dtype, + ) + except ImportError as err: + raise NotImplementedError from err class Construction: - - params = ["str", "string"] - param_names = ["dtype"] - - def setup(self, dtype): - self.series_arr = tm.rands_array(nchars=10, size=10**5) - self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() - - # GH37371. 
Testing construction of string series/frames from ExtensionArrays - self.series_cat_arr = Categorical(self.series_arr) - self.frame_cat_arr = Categorical(self.frame_arr) - - def time_series_construction(self, dtype): - Series(self.series_arr, dtype=dtype) - - def peakmem_series_construction(self, dtype): - Series(self.series_arr, dtype=dtype) - - def time_frame_construction(self, dtype): - DataFrame(self.frame_arr, dtype=dtype) - - def peakmem_frame_construction(self, dtype): - DataFrame(self.frame_arr, dtype=dtype) - - def time_cat_series_construction(self, dtype): - Series(self.series_cat_arr, dtype=dtype) - - def peakmem_cat_series_construction(self, dtype): - Series(self.series_cat_arr, dtype=dtype) - - def time_cat_frame_construction(self, dtype): - DataFrame(self.frame_cat_arr, dtype=dtype) - - def peakmem_cat_frame_construction(self, dtype): - DataFrame(self.frame_cat_arr, dtype=dtype) + params = ( + ["series", "frame", "categorical_series"], + ["str", "string[python]", "string[pyarrow]"], + ) + param_names = ["pd_type", "dtype"] + pd_mapping = {"series": Series, "frame": DataFrame, "categorical_series": Series} + dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object} + + def setup(self, pd_type, dtype): + series_arr = np.array( + [str(i) * 10 for i in range(100_000)], dtype=self.dtype_mapping[dtype] + ) + if pd_type == "series": + self.arr = series_arr + elif pd_type == "frame": + self.arr = series_arr.reshape((50_000, 2)).copy() + elif pd_type == "categorical_series": + # GH37371. Testing construction of string series/frames from ExtensionArrays + self.arr = Categorical(series_arr) + + def time_construction(self, pd_type, dtype): + self.pd_mapping[pd_type](self.arr, dtype=dtype) + + def peakmem_construction(self, pd_type, dtype): + self.pd_mapping[pd_type](self.arr, dtype=dtype) class Methods(Dtypes): @@ -177,13 +169,12 @@ def time_isupper(self, dtype): class Repeat: - params = ["int", "array"] param_names = ["repeats"] def setup(self, repeats): N = 10**5 - self.s = Series(tm.makeStringIndex(N)) + self.s = Series(Index([f"i-{i}" for i in range(N)], dtype=object)) repeat = {"int": 1, "array": np.random.randint(1, 3, N)} self.values = repeat[repeats] @@ -192,20 +183,26 @@ def time_repeat(self, repeats): class Cat: - params = ([0, 3], [None, ","], [None, "-"], [0.0, 0.001, 0.15]) param_names = ["other_cols", "sep", "na_rep", "na_frac"] def setup(self, other_cols, sep, na_rep, na_frac): N = 10**5 mask_gen = lambda: np.random.choice([True, False], N, p=[1 - na_frac, na_frac]) - self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) + self.s = Series(Index([f"i-{i}" for i in range(N)], dtype=object)).where( + mask_gen() + ) if other_cols == 0: # str.cat self-concatenates only for others=None self.others = None else: self.others = DataFrame( - {i: tm.makeStringIndex(N).where(mask_gen()) for i in range(other_cols)} + { + i: Index([f"i-{i}" for i in range(N)], dtype=object).where( + mask_gen() + ) + for i in range(other_cols) + } ) def time_cat(self, other_cols, sep, na_rep, na_frac): @@ -217,7 +214,6 @@ def time_cat(self, other_cols, sep, na_rep, na_frac): class Contains(Dtypes): - params = (Dtypes.params, [True, False]) param_names = ["dtype", "regex"] @@ -229,7 +225,6 @@ def time_contains(self, dtype, regex): class Split(Dtypes): - params = (Dtypes.params, [True, False]) param_names = ["dtype", "expand"] @@ -245,7 +240,6 @@ def time_rsplit(self, dtype, expand): class Extract(Dtypes): - params = (Dtypes.params, [True, False]) param_names = ["dtype", 
"expand"] @@ -260,7 +254,8 @@ def time_extract_single_group(self, dtype, expand): class Dummies(Dtypes): def setup(self, dtype): super().setup(dtype) - self.s = self.s.str.join("|") + N = len(self.s) // 5 + self.s = self.s[:N].str.join("|") def time_get_dummies(self, dtype): self.s.str.get_dummies("|") @@ -268,7 +263,7 @@ def time_get_dummies(self, dtype): class Encode: def setup(self): - self.ser = Series(tm.makeStringIndex()) + self.ser = Series(Index([f"i-{i}" for i in range(10_000)], dtype=object)) def time_encode_decode(self): self.ser.str.encode("utf-8").str.decode("utf-8") @@ -291,7 +286,7 @@ def time_iter(self, dtype): class StringArrayConstruction: def setup(self): - self.series_arr = tm.rands_array(nchars=10, size=10**5) + self.series_arr = np.array([str(i) * 10 for i in range(10**5)], dtype=object) self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)]) def time_string_array_construction(self): diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 9373edadb8e90..8deec502898d9 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -20,7 +20,6 @@ class DatetimeIndex: - params = ["dst", "repeated", "tz_aware", "tz_local", "tz_naive"] param_names = ["index_type"] @@ -28,9 +27,9 @@ def setup(self, index_type): N = 100000 dtidxes = { "dst": date_range( - start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s" ), - "repeated": date_range(start="2000", periods=N / 10, freq="s").repeat(10), + "repeated": date_range(start="2000", periods=N // 10, freq="s").repeat(10), "tz_aware": date_range(start="2000", periods=N, freq="s", tz="US/Eastern"), "tz_local": date_range( start="2000", periods=N, freq="s", tz=dateutil.tz.tzlocal() @@ -68,19 +67,18 @@ def time_is_dates_only(self, index_type): class TzLocalize: - params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] param_names = "tz" def setup(self, tz): dst_rng = date_range( - start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="S" + start="10/29/2000 1:00:00", end="10/29/2000 1:59:59", freq="s" ) - self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="S") + self.index = date_range(start="10/29/2000", end="10/29/2000 00:59:59", freq="s") self.index = self.index.append(dst_rng) self.index = self.index.append(dst_rng) self.index = self.index.append( - date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="S") + date_range(start="10/29/2000 2:00:00", end="10/29/2000 3:00:00", freq="s") ) def time_infer_dst(self, tz): @@ -88,12 +86,11 @@ def time_infer_dst(self, tz): class ResetIndex: - params = [None, "US/Eastern"] param_names = "tz" def setup(self, tz): - idx = date_range(start="1/1/2000", periods=1000, freq="H", tz=tz) + idx = date_range(start="1/1/2000", periods=1000, freq="h", tz=tz) self.df = DataFrame(np.random.randn(1000, 2), index=idx) def time_reset_datetimeindex(self, tz): @@ -119,23 +116,22 @@ def time_infer_freq(self, freq): class TimeDatetimeConverter: def setup(self): N = 100000 - self.rng = date_range(start="1/1/2000", periods=N, freq="T") + self.rng = date_range(start="1/1/2000", periods=N, freq="min") def time_convert(self): DatetimeConverter.convert(self.rng, None, None) class Iteration: - params = [date_range, period_range, timedelta_range] param_names = ["time_index"] def setup(self, time_index): N = 10**6 if time_index is timedelta_range: - self.idx = time_index(start=0, freq="T", periods=N) + self.idx = 
time_index(start=0, freq="min", periods=N) else: - self.idx = time_index(start="20140101", freq="T", periods=N) + self.idx = time_index(start="20140101", freq="min", periods=N) self.exit = 10000 def time_iter(self, time_index): @@ -149,12 +145,11 @@ def time_iter_preexit(self, time_index): class ResampleDataFrame: - params = ["max", "mean", "min"] param_names = ["method"] def setup(self, method): - rng = date_range(start="20130101", periods=100000, freq="50L") + rng = date_range(start="20130101", periods=100000, freq="50ms") df = DataFrame(np.random.randn(100000, 2), index=rng) self.resample = getattr(df.resample("1s"), method) @@ -163,14 +158,13 @@ def time_method(self, method): class ResampleSeries: - params = (["period", "datetime"], ["5min", "1D"], ["mean", "ohlc"]) param_names = ["index", "freq", "method"] def setup(self, index, freq, method): indexes = { - "period": period_range(start="1/1/2000", end="1/1/2001", freq="T"), - "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="T"), + "period": period_range(start="1/1/2000", end="1/1/2001", freq="min"), + "datetime": date_range(start="1/1/2000", end="1/1/2001", freq="min"), } idx = indexes[index] ts = Series(np.random.randn(len(idx)), index=idx) @@ -184,16 +178,15 @@ class ResampleDatetetime64: # GH 7754 def setup(self): rng3 = date_range( - start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000U" + start="2000-01-01 00:00:00", end="2000-01-01 10:00:00", freq="555000us" ) self.dt_ts = Series(5, rng3, dtype="datetime64[ns]") def time_resample(self): - self.dt_ts.resample("1S").last() + self.dt_ts.resample("1s").last() class AsOf: - params = ["DataFrame", "Series"] param_names = ["constructor"] @@ -242,7 +235,6 @@ def time_asof_nan_single(self, constructor): class SortIndex: - params = [True, False] param_names = ["monotonic"] @@ -263,7 +255,7 @@ def time_get_slice(self, monotonic): class Lookup: def setup(self): N = 1500000 - rng = date_range(start="1/1/2000", periods=N, freq="S") + rng = date_range(start="1/1/2000", periods=N, freq="s") self.ts = Series(1, index=rng) self.lookup_val = rng[N // 2] @@ -273,13 +265,12 @@ def time_lookup_and_cleanup(self): class DatetimeAccessor: - params = [None, "US/Eastern", "UTC", dateutil.tz.tzutc()] param_names = "tz" def setup(self, tz): N = 100000 - self.series = Series(date_range(start="1/1/2000", periods=N, freq="T", tz=tz)) + self.series = Series(date_range(start="1/1/2000", periods=N, freq="min", tz=tz)) def time_dt_accessor(self, tz): self.series.dt diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py index 23ae73811204c..fe31879e67a67 100644 --- a/asv_bench/benchmarks/tslibs/fields.py +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -12,17 +12,22 @@ class TimeGetTimedeltaField: params = [ _sizes, - ["days", "seconds", "microseconds", "nanoseconds"], + ["seconds", "microseconds", "nanoseconds"], ] param_names = ["size", "field"] def setup(self, size, field): arr = np.random.randint(0, 10, size=size, dtype="i8") self.i8data = arr + arr = np.random.randint(-86400 * 1_000_000_000, 0, size=size, dtype="i8") + self.i8data_negative = arr def time_get_timedelta_field(self, size, field): get_timedelta_field(self.i8data, field) + def time_get_timedelta_field_negative_td(self, size, field): + get_timedelta_field(self.i8data_negative, field) + class TimeGetDateField: params = [ @@ -72,3 +77,6 @@ def setup(self, size, side, period, freqstr, month_kw): def time_get_start_end_field(self, size, side, period, freqstr, month_kw): 
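        # get_start_end_field is the tslibs helper behind DatetimeArray
        # properties such as is_month_start / is_quarter_end: given raw i8
        # nanosecond values, a field name (self.attrname here), and the
        # freqstr/month_kw anchor used for fiscal frequencies, it returns a
        # boolean mask. A rough pure-NumPy sketch of the unanchored
        # "is_month_start" case (illustration only, not the real kernel):
        #
        #   days = self.i8data.view("M8[ns]").astype("M8[D]")
        #   mask = days == days.astype("M8[M]").astype("M8[D]")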
get_start_end_field(self.i8data, self.attrname, freqstr, month_kw=month_kw) + + +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/tslibs/offsets.py b/asv_bench/benchmarks/tslibs/offsets.py index 978a36e470cbb..55bd3c31c055c 100644 --- a/asv_bench/benchmarks/tslibs/offsets.py +++ b/asv_bench/benchmarks/tslibs/offsets.py @@ -2,6 +2,7 @@ offsets benchmarks that rely only on tslibs. See benchmarks.offset for offsets benchmarks that rely on other parts of pandas. """ + from datetime import datetime import numpy as np @@ -45,7 +46,6 @@ class OnOffset: - params = offset_objs param_names = ["offset"] @@ -63,7 +63,6 @@ def time_on_offset(self, offset): class OffestDatetimeArithmetic: - params = offset_objs param_names = ["offset"] @@ -71,11 +70,8 @@ def setup(self, offset): self.date = datetime(2011, 1, 1) self.dt64 = np.datetime64("2011-01-01 09:00Z") - def time_apply(self, offset): - offset.apply(self.date) - - def time_apply_np_dt64(self, offset): - offset.apply(self.dt64) + def time_add_np_dt64(self, offset): + offset + self.dt64 def time_add(self, offset): self.date + offset diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index af10102749627..af3bfac6d3d01 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -25,7 +25,6 @@ class PeriodProperties: - params = ( ["M", "min"], [ @@ -56,12 +55,15 @@ def time_property(self, freq, attr): class PeriodUnaryMethods: - params = ["M", "min"] param_names = ["freq"] def setup(self, freq): self.per = Period("2012-06-01", freq=freq) + if freq == "M": + self.default_fmt = "%Y-%m" + elif freq == "min": + self.default_fmt = "%Y-%m-%d %H:%M" def time_to_timestamp(self, freq): self.per.to_timestamp() @@ -70,7 +72,22 @@ def time_now(self, freq): self.per.now(freq) def time_asfreq(self, freq): - self.per.asfreq("A") + self.per.asfreq("Y") + + def time_str(self, freq): + str(self.per) + + def time_repr(self, freq): + repr(self.per) + + def time_strftime_default(self, freq): + self.per.strftime(None) + + def time_strftime_default_explicit(self, freq): + self.per.strftime(self.default_fmt) + + def time_strftime_custom(self, freq): + self.per.strftime("%b. %d, %Y was a %A") class PeriodConstructor: @@ -134,7 +151,11 @@ def setup(self, size, freq, tz): # tzlocal is cumbersomely slow, so skip to keep runtime in check raise NotImplementedError - arr = np.arange(10, dtype="i8").repeat(size // 10) + # we pick 2**55 because smaller values end up returning + # -1 from npy_datetimestruct_to_datetime with NPY_FR_Y frequency + # this artificially slows down functions since -1 is also the + # error sentinel + arr = np.arange(2**55, 2**55 + 10, dtype="i8").repeat(size // 10) self.i8values = arr def time_dt64arr_to_periodarr(self, size, freq, tz): diff --git a/asv_bench/benchmarks/tslibs/resolution.py b/asv_bench/benchmarks/tslibs/resolution.py index 44f288c7de216..6317d299379d3 100644 --- a/asv_bench/benchmarks/tslibs/resolution.py +++ b/asv_bench/benchmarks/tslibs/resolution.py @@ -17,6 +17,7 @@ df.loc[key] = (val.average, val.stdev) """ + import numpy as np try: diff --git a/asv_bench/benchmarks/tslibs/timedelta.py b/asv_bench/benchmarks/tslibs/timedelta.py index 2daf1861eb80a..9d9689fcfa94b 100644 --- a/asv_bench/benchmarks/tslibs/timedelta.py +++ b/asv_bench/benchmarks/tslibs/timedelta.py @@ -2,6 +2,7 @@ Timedelta benchmarks that rely only on tslibs. See benchmarks.timedeltas for Timedelta benchmarks that rely on other parts of pandas. 
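The scalar construction paths exercised here take forms like::

    Timedelta(123456789)        # integer interpreted as nanoseconds
    Timedelta(1, unit="D")      # numeric value plus a unit string
    Timedelta(days=1, hours=2)  # datetime.timedelta-style keyword components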
""" + import datetime import numpy as np @@ -19,7 +20,7 @@ def time_from_int(self): Timedelta(123456789) def time_from_unit(self): - Timedelta(1, unit="d") + Timedelta(1, unit="D") def time_from_components(self): Timedelta( diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index eda9bce89188c..6145966fb6a0e 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -1,7 +1,10 @@ -from datetime import datetime +from datetime import ( + datetime, + timezone, +) +import zoneinfo import numpy as np -import pytz from pandas import Timestamp @@ -12,7 +15,7 @@ class TimestampConstruction: def setup(self): self.npdatetime64 = np.datetime64("2020-01-01 00:00:00") self.dttime_unaware = datetime(2020, 1, 1, 0, 0, 0) - self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC) + self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, timezone.utc) self.ts = Timestamp("2020-01-01 00:00:00") def time_parse_iso8601_no_tz(self): @@ -50,62 +53,58 @@ def time_from_pd_timestamp(self): class TimestampProperties: - _freqs = [None, "B"] - params = [_tzs, _freqs] - param_names = ["tz", "freq"] + params = [_tzs] + param_names = ["tz"] - def setup(self, tz, freq): - self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz, freq=freq) + def setup(self, tz): + self.ts = Timestamp("2017-08-25 08:16:14", tzinfo=tz) - def time_tz(self, tz, freq): + def time_tz(self, tz): self.ts.tz - def time_dayofweek(self, tz, freq): + def time_dayofweek(self, tz): self.ts.dayofweek - def time_dayofyear(self, tz, freq): + def time_dayofyear(self, tz): self.ts.dayofyear - def time_week(self, tz, freq): + def time_week(self, tz): self.ts.week - def time_quarter(self, tz, freq): + def time_quarter(self, tz): self.ts.quarter - def time_days_in_month(self, tz, freq): + def time_days_in_month(self, tz): self.ts.days_in_month - def time_freqstr(self, tz, freq): - self.ts.freqstr - - def time_is_month_start(self, tz, freq): + def time_is_month_start(self, tz): self.ts.is_month_start - def time_is_month_end(self, tz, freq): + def time_is_month_end(self, tz): self.ts.is_month_end - def time_is_quarter_start(self, tz, freq): + def time_is_quarter_start(self, tz): self.ts.is_quarter_start - def time_is_quarter_end(self, tz, freq): + def time_is_quarter_end(self, tz): self.ts.is_quarter_end - def time_is_year_start(self, tz, freq): + def time_is_year_start(self, tz): self.ts.is_year_start - def time_is_year_end(self, tz, freq): + def time_is_year_end(self, tz): self.ts.is_year_end - def time_is_leap_year(self, tz, freq): + def time_is_leap_year(self, tz): self.ts.is_leap_year - def time_microsecond(self, tz, freq): + def time_microsecond(self, tz): self.ts.microsecond - def time_month_name(self, tz, freq): + def time_month_name(self, tz): self.ts.month_name() - def time_weekday_name(self, tz, freq): + def time_weekday_name(self, tz): self.ts.day_name() @@ -117,7 +116,7 @@ def setup(self, tz): self.ts = Timestamp("2017-08-25 08:16:14", tz=tz) def time_replace_tz(self, tz): - self.ts.replace(tzinfo=pytz.timezone("US/Eastern")) + self.ts.replace(tzinfo=zoneinfo.ZoneInfo("US/Eastern")) def time_replace_None(self, tz): self.ts.replace(tzinfo=None) @@ -140,16 +139,16 @@ def time_to_julian_date(self, tz): self.ts.to_julian_date() def time_floor(self, tz): - self.ts.floor("5T") + self.ts.floor("5min") def time_ceil(self, tz): - self.ts.ceil("5T") + self.ts.ceil("5min") class TimestampAcrossDst: def setup(self): - dt = datetime(2016, 3, 27, 1) - self.tzinfo = 
pytz.timezone("CET").localize(dt, is_dst=False).tzinfo + dt = datetime(2016, 3, 27, 1, fold=0) + self.tzinfo = dt.astimezone(zoneinfo.ZoneInfo("Europe/Berlin")).tzinfo self.ts2 = Timestamp(dt) def time_replace_across_dst(self): diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index f93ef1cef841f..885cf48d01743 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -15,17 +15,18 @@ val = %timeit -o tr.time_ints_to_pydatetime(box, size, tz) df.loc[key] = (val.average, val.stdev) """ + from datetime import ( timedelta, timezone, ) +import zoneinfo from dateutil.tz import ( gettz, tzlocal, ) import numpy as np -import pytz try: from pandas._libs.tslibs import ints_to_pydatetime @@ -37,7 +38,7 @@ None, timezone.utc, timezone(timedelta(minutes=60)), - pytz.timezone("US/Pacific"), + zoneinfo.ZoneInfo("US/Pacific"), gettz("Asia/Tokyo"), tzlocal_obj, ] @@ -51,7 +52,7 @@ class TimeIntsToPydatetime: _tzs, ) param_names = ["box", "size", "tz"] - # TODO: fold? freq? + # TODO: fold? def setup(self, box, size, tz): if box == "date" and tz is not None: diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py index c6b510efdca69..c87adb5e5d0e9 100644 --- a/asv_bench/benchmarks/tslibs/tz_convert.py +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -1,5 +1,6 @@ +from datetime import timezone + import numpy as np -from pytz import UTC from pandas._libs.tslibs.tzconversion import tz_localize_to_utc @@ -41,7 +42,7 @@ def time_tz_convert_from_utc(self, size, tz): # dti = DatetimeIndex(self.i8data, tz=tz) # dti.tz_localize(None) if old_sig: - tz_convert_from_utc(self.i8data, UTC, tz) + tz_convert_from_utc(self.i8data, tz) else: tz_convert_from_utc(self.i8data, tz) diff --git a/ci/condarc.yml b/ci/.condarc similarity index 96% rename from ci/condarc.yml rename to ci/.condarc index 9d750b7102c39..f5fb60b208a9c 100644 --- a/ci/condarc.yml +++ b/ci/.condarc @@ -11,7 +11,7 @@ always_yes: true # The number of seconds conda will wait for your client to establish a # connection to a remote url resource. # -remote_connect_timeout_secs: 30.0 +remote_connect_timeout_secs: 30 # remote_max_retries (int) # The maximum number of retries each HTTP connection should attempt. diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 113186c746157..2c32eb4f0c584 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -12,42 +12,34 @@ # $ ./ci/code_checks.sh doctests # run doctests # $ ./ci/code_checks.sh docstrings # validate docstring errors # $ ./ci/code_checks.sh single-docs # check single-page docs build warning-free +# $ ./ci/code_checks.sh notebooks # check execution of documentation notebooks -[[ -z "$1" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "single-docs" ]] || \ - { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings]"; exit 9999; } +set -uo pipefail -BASE_DIR="$(dirname $0)/.." -RET=0 -CHECK=$1 - -function invgrep { - # grep with inverse exit status and formatting for azure-pipelines - # - # This function works exactly as grep, but with opposite exit status: - # - 0 (success) when no patterns are found - # - 1 (fail) when the patterns are found - # - # This is useful for the CI, as we want to fail if one of the patterns - # that we want to avoid is found by grep. - grep -n "$@" | sed "s/^/$INVGREP_PREPEND/" | sed "s/$/$INVGREP_APPEND/" ; EXIT_STATUS=${PIPESTATUS[0]}
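-   # For example, invgrep -r "pdb.set_trace" pandas/ (an illustrative call)
-   # exits 0 when nothing matches and 1 when grep does find the pattern;
-   # ${PIPESTATUS[0]} preserves grep's own status through the sed formatting
-   # pipeline so it can be inverted below.
-   return $((! 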
$EXIT_STATUS)) -} - -if [[ "$GITHUB_ACTIONS" == "true" ]]; then - INVGREP_PREPEND="##[error]" +if [[ -v 1 ]]; then + CHECK=$1 +else + # script will fail if it uses an unset variable (i.e. $1 is not provided) + CHECK="" fi +[[ -z "$CHECK" || "$CHECK" == "code" || "$CHECK" == "doctests" || "$CHECK" == "docstrings" || "$CHECK" == "single-docs" || "$CHECK" == "notebooks" ]] || \ + { echo "Unknown command $1. Usage: $0 [code|doctests|docstrings|single-docs|notebooks]"; exit 1; } + +BASE_DIR="$(dirname "$0")/.." +RET=0 + ### CODE ### if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then - MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo $MSG + MSG='Check import. No warnings, and blocklist some optional dependencies' ; echo "$MSG" python -W error -c " import sys import pandas blocklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis', 'lxml', 'matplotlib', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', - 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'} + 'tables', 'urllib.request', 'xlrd', 'xlsxwriter'} # GH#28227 for some of these check for top-level modules, while others are # more specific (e.g. urllib.request) @@ -57,38 +49,247 @@ if mods: sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) " - RET=$(($RET + $?)) ; echo $MSG "DONE" + RET=$(($RET + $?)) ; echo "$MSG" "DONE" fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - MSG='Doctests' ; echo $MSG - # Ignore test_*.py files or else the unit tests will run - python -m pytest --doctest-modules --ignore-glob="**/test_*.py" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Cython Doctests' ; echo $MSG - python -m pytest --doctest-cython pandas/_libs - RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Python and Cython Doctests' ; echo "$MSG" + python -c 'import pandas as pd; pd.test(run_doctests=True)' + RET=$(($RET + $?)) ; echo "$MSG" "DONE" fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06 + MSG='Validate Docstrings' ; echo "$MSG" + "$BASE_DIR"/scripts/validate_docstrings.py \ + --format=actions \ + -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ + -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ + -i "pandas.Period.freq GL08" \ + -i "pandas.Period.ordinal GL08" \ + -i "pandas.Timestamp.max PR02" \ + -i "pandas.Timestamp.min PR02" \ + -i "pandas.Timestamp.resolution PR02" \ + -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ + -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ + -i "pandas.core.resample.Resampler.quantile PR01,PR07" \ + -i "pandas.tseries.offsets.BDay PR02,SA01" \ + -i "pandas.tseries.offsets.BHalfYearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.n GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.BHalfYearBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.is_on_offset GL08" \ + -i 
"pandas.tseries.offsets.BHalfYearEnd.n GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.normalize GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.BHalfYearEnd.startingMonth GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.BQuarterBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.n GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.normalize GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.BQuarterEnd.startingMonth GL08" \ + -i "pandas.tseries.offsets.BYearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BYearBegin.month GL08" \ + -i "pandas.tseries.offsets.BYearBegin.n GL08" \ + -i "pandas.tseries.offsets.BYearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BYearEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BYearEnd.month GL08" \ + -i "pandas.tseries.offsets.BYearEnd.n GL08" \ + -i "pandas.tseries.offsets.BYearEnd.normalize GL08" \ + -i "pandas.tseries.offsets.BusinessDay PR02,SA01" \ + -i "pandas.tseries.offsets.BusinessDay.calendar GL08" \ + -i "pandas.tseries.offsets.BusinessDay.holidays GL08" \ + -i "pandas.tseries.offsets.BusinessDay.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BusinessDay.n GL08" \ + -i "pandas.tseries.offsets.BusinessDay.normalize GL08" \ + -i "pandas.tseries.offsets.BusinessDay.weekmask GL08" \ + -i "pandas.tseries.offsets.BusinessHour PR02,SA01" \ + -i "pandas.tseries.offsets.BusinessHour.calendar GL08" \ + -i "pandas.tseries.offsets.BusinessHour.end GL08" \ + -i "pandas.tseries.offsets.BusinessHour.holidays GL08" \ + -i "pandas.tseries.offsets.BusinessHour.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BusinessHour.n GL08" \ + -i "pandas.tseries.offsets.BusinessHour.normalize GL08" \ + -i "pandas.tseries.offsets.BusinessHour.start GL08" \ + -i "pandas.tseries.offsets.BusinessHour.weekmask GL08" \ + -i "pandas.tseries.offsets.BusinessMonthBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BusinessMonthBegin.n GL08" \ + -i "pandas.tseries.offsets.BusinessMonthBegin.normalize GL08" \ + -i "pandas.tseries.offsets.BusinessMonthEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.BusinessMonthEnd.n GL08" \ + -i "pandas.tseries.offsets.BusinessMonthEnd.normalize GL08" \ + -i "pandas.tseries.offsets.CBMonthBegin PR02" \ + -i "pandas.tseries.offsets.CBMonthEnd PR02" \ + -i "pandas.tseries.offsets.CDay PR02,SA01" \ + -i "pandas.tseries.offsets.CustomBusinessDay PR02,SA01" \ + -i "pandas.tseries.offsets.CustomBusinessDay.calendar GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.holidays GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.is_on_offset GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.n GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.normalize GL08" \ + -i "pandas.tseries.offsets.CustomBusinessDay.weekmask GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour PR02,SA01" \ + -i "pandas.tseries.offsets.CustomBusinessHour.calendar GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.end GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.holidays GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.is_on_offset GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.n GL08" \ + -i 
"pandas.tseries.offsets.CustomBusinessHour.normalize GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.start GL08" \ + -i "pandas.tseries.offsets.CustomBusinessHour.weekmask GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthBegin.weekmask GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \ + -i "pandas.tseries.offsets.CustomBusinessMonthEnd.weekmask GL08" \ + -i "pandas.tseries.offsets.DateOffset.is_on_offset GL08" \ + -i "pandas.tseries.offsets.DateOffset.n GL08" \ + -i "pandas.tseries.offsets.DateOffset.normalize GL08" \ + -i "pandas.tseries.offsets.Day.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Day.n GL08" \ + -i "pandas.tseries.offsets.Day.normalize GL08" \ + -i "pandas.tseries.offsets.Easter.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Easter.n GL08" \ + -i "pandas.tseries.offsets.Easter.normalize GL08" \ + -i "pandas.tseries.offsets.FY5253.get_rule_code_suffix GL08" \ + -i "pandas.tseries.offsets.FY5253.get_year_end GL08" \ + -i "pandas.tseries.offsets.FY5253.is_on_offset GL08" \ + -i "pandas.tseries.offsets.FY5253.n GL08" \ + -i "pandas.tseries.offsets.FY5253.normalize GL08" \ + -i "pandas.tseries.offsets.FY5253.rule_code GL08" \ + -i "pandas.tseries.offsets.FY5253.startingMonth GL08" \ + -i "pandas.tseries.offsets.FY5253.variation GL08" \ + -i "pandas.tseries.offsets.FY5253.weekday GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.get_rule_code_suffix GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.get_weeks GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.is_on_offset GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.n GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.normalize GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.qtr_with_extra_week GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.rule_code GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.startingMonth GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.variation GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.weekday GL08" \ + -i "pandas.tseries.offsets.FY5253Quarter.year_has_extra_week GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.n GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.HalfYearBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.n GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.normalize GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.HalfYearEnd.startingMonth GL08" \ + -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Hour.n GL08" \ + -i "pandas.tseries.offsets.Hour.normalize GL08" 
\ + -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.week GL08" \ + -i "pandas.tseries.offsets.LastWeekOfMonth.weekday GL08" \ + -i "pandas.tseries.offsets.Micro.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Micro.n GL08" \ + -i "pandas.tseries.offsets.Micro.normalize GL08" \ + -i "pandas.tseries.offsets.Milli.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Milli.n GL08" \ + -i "pandas.tseries.offsets.Milli.normalize GL08" \ + -i "pandas.tseries.offsets.Minute.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Minute.n GL08" \ + -i "pandas.tseries.offsets.Minute.normalize GL08" \ + -i "pandas.tseries.offsets.MonthBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.MonthBegin.n GL08" \ + -i "pandas.tseries.offsets.MonthBegin.normalize GL08" \ + -i "pandas.tseries.offsets.MonthEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.MonthEnd.n GL08" \ + -i "pandas.tseries.offsets.MonthEnd.normalize GL08" \ + -i "pandas.tseries.offsets.Nano.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Nano.normalize GL08" \ + -i "pandas.tseries.offsets.Nano.n GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.n GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.normalize GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.QuarterBegin.startingMonth GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.n GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.normalize GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.QuarterEnd.startingMonth GL08" \ + -i "pandas.tseries.offsets.Second.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Second.n GL08" \ + -i "pandas.tseries.offsets.Second.normalize GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.day_of_month GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \ + -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.normalize GL08" \ + -i "pandas.tseries.offsets.SemiMonthEnd.rule_code GL08" \ + -i "pandas.tseries.offsets.Tick GL08" \ + -i "pandas.tseries.offsets.Tick.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Tick.n GL08" \ + -i "pandas.tseries.offsets.Tick.normalize GL08" \ + -i "pandas.tseries.offsets.Week.is_on_offset GL08" \ + -i "pandas.tseries.offsets.Week.n GL08" \ + -i "pandas.tseries.offsets.Week.normalize GL08" \ + -i "pandas.tseries.offsets.Week.weekday GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.week GL08" \ + -i "pandas.tseries.offsets.WeekOfMonth.weekday GL08" \ + -i "pandas.tseries.offsets.YearBegin.is_on_offset GL08" \ + -i "pandas.tseries.offsets.YearBegin.month GL08" \ + -i "pandas.tseries.offsets.YearBegin.n GL08" \ + -i "pandas.tseries.offsets.YearBegin.normalize GL08" \ + -i "pandas.tseries.offsets.YearEnd.is_on_offset 
GL08" \ + -i "pandas.tseries.offsets.YearEnd.month GL08" \ + -i "pandas.tseries.offsets.YearEnd.n GL08" \ + -i "pandas.tseries.offsets.YearEnd.normalize GL08" \ + -i "pandas.util.hash_pandas_object PR07,SA01" # There should be no backslash in the final line, please keep this comment in the last ignored function + + RET=$(($RET + $?)) ; echo $MSG "DONE" + +fi + +### DOCUMENTATION NOTEBOOKS ### +if [[ -z "$CHECK" || "$CHECK" == "notebooks" ]]; then + + MSG='Notebooks' ; echo $MSG + jupyter nbconvert --execute "$(find doc/source -name '*.ipynb')" --to notebook RET=$(($RET + $?)) ; echo $MSG "DONE" fi ### SINGLE-PAGE DOCS ### if [[ -z "$CHECK" || "$CHECK" == "single-docs" ]]; then - python doc/make.py --warnings-are-errors --single pandas.Series.value_counts - python doc/make.py --warnings-are-errors --single pandas.Series.str.split - python doc/make.py clean + python doc/make.py --warnings-are-errors --no-browser --single pandas.Series.value_counts + python doc/make.py --warnings-are-errors --no-browser --single pandas.Series.str.split fi exit $RET diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml new file mode 100644 index 0000000000000..c7c72828db481 --- /dev/null +++ b/ci/deps/actions-310-minimum_versions.yaml @@ -0,0 +1,65 @@ +# Minimum version of required + optional dependencies +# Aligned with getting_started/install.rst and compat/_optional.py +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.10 + + # build dependencies + - versioneer + - cython>=0.29.33 + - meson=1.2.1 + - meson-python=0.13.1 + + # test dependencies + - pytest>=7.3.2 + - pytest-cov + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 + - boto3 + + # required dependencies + - python-dateutil=2.8.2 + - numpy=1.23.5 + + # optional dependencies + - beautifulsoup4=4.11.2 + - blosc=1.21.3 + - bottleneck=1.3.6 + - fastparquet=2023.10.0 + - fsspec=2022.11.0 + - html5lib=1.1 + - hypothesis=6.84.0 + - gcsfs=2022.11.0 + - jinja2=3.1.2 + - lxml=4.9.2 + - matplotlib=3.6.3 + - numba=0.56.4 + - numexpr=2.8.4 + - odfpy=1.4.1 + - qtpy=2.3.0 + - openpyxl=3.1.0 + - psycopg2=2.9.6 + - pyarrow=10.0.1 + - pymysql=1.0.2 + - pyqt=5.15.9 + - pyreadstat=1.2.0 + - pytables=3.8.0 + - python-calamine=0.1.7 + - pytz=2023.4 + - pyxlsb=1.0.10 + - s3fs=2022.11.0 + - scipy=1.10.0 + - sqlalchemy=2.0.0 + - tabulate=0.9.0 + - xarray=2022.12.0 + - xlrd=2.0.1 + - xlsxwriter=3.0.5 + - zstandard=0.19.0 + + - pip: + - adbc-driver-postgresql==0.10.0 + - adbc-driver-sqlite==0.8.0 + - tzdata==2022.7 diff --git a/ci/deps/actions-310-numpydev.yaml b/ci/deps/actions-310-numpydev.yaml deleted file mode 100644 index ef20c2aa889b9..0000000000000 --- a/ci/deps/actions-310-numpydev.yaml +++ /dev/null @@ -1,23 +0,0 @@ -name: pandas-dev -channels: - - defaults -dependencies: - - python=3.10 - - # tools - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 - - pytest-asyncio>=0.17 - - # pandas dependencies - - python-dateutil - - pytz - - pip - - pip: - - "cython" - - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - - "--pre" - - "numpy" - - "scipy" diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 65918005ad6f1..74cab4e0970dc 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -4,51 +4,60 @@ channels: dependencies: - python=3.10 + # build dependencies + - versioneer + - cython>=0.29.33 + - meson=1.2.1 + - meson-python=0.13.1 + # test dependencies - - cython>=0.29.30 - - pytest>=6.0 + - 
pytest>=7.3.2 - pytest-cov - - pytest-xdist>=1.31 - - psutil - - pytest-asyncio>=0.17 + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 - boto3 # required dependencies - python-dateutil - numpy - - pytz # optional dependencies - - beautifulsoup4 - - blosc - - bottleneck - - brotlipy - - fastparquet - - fsspec - - html5lib - - hypothesis - - gcsfs - - jinja2 - - lxml - - matplotlib - - numba - - numexpr - - openpyxl - - odfpy - - pandas-gbq - - psycopg2 - - pymysql - - pytables - - pyarrow - - pyreadstat - - python-snappy - - pyxlsb - - s3fs - - scipy - - sqlalchemy - - tabulate - - xarray - - xlrd - - xlsxwriter - - xlwt - - zstandard + - beautifulsoup4>=4.11.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2023.10.0 + - fsspec>=2022.11.0 + - html5lib>=1.1 + - hypothesis>=6.84.0 + - gcsfs>=2022.11.0 + - jinja2>=3.1.2 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 + - odfpy>=1.4.1 + - qtpy>=2.3.0 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 + - pymysql>=1.0.2 + - pyqt>=5.15.9 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pytz>=2023.4 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0, <=2024.9.0 + - xlrd>=2.0.1 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 + + - pip: + - adbc-driver-postgresql>=0.10.0 + - adbc-driver-sqlite>=0.8.0 + - tzdata>=2022.7 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml new file mode 100644 index 0000000000000..092ca18d61259 --- /dev/null +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -0,0 +1,77 @@ +# Non-dependencies that pandas utilizes or has compatibility with pandas objects +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer + - cython>=0.29.33 + - meson=1.2.1 + - meson-python=0.13.1 + + # test dependencies + - pytest>=7.3.2 + - pytest-cov + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 + - boto3 + + # required dependencies + - python-dateutil + - numpy + + # optional dependencies + - beautifulsoup4>=4.11.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2023.10.0 + - fsspec>=2022.11.0 + - html5lib>=1.1 + - hypothesis>=6.84.0 + - gcsfs>=2022.11.0 + - jinja2>=3.1.2 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 + - odfpy>=1.4.1 + - qtpy>=2.3.0 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 + - pymysql>=1.0.2 + - pyqt>=5.15.9 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pytz>=2023.4 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0, <=2024.9.0 + - xlrd>=2.0.1 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 + + # downstream packages + - botocore + - cftime + - dask + - ipython + - geopandas-base + - seaborn + - scikit-learn + - statsmodels + - coverage + - pandas-datareader + - pyyaml + - py + - pip: + - adbc-driver-postgresql>=0.10.0 + - adbc-driver-sqlite>=0.8.0 + - tzdata>=2022.7 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml new file mode 100644 index 0000000000000..325a6d45d74fd --- /dev/null +++ b/ci/deps/actions-311-numpydev.yaml @@ -0,0 +1,27 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer + - meson=1.2.1 + - meson-python=0.13.1 + - cython>=0.29.33 + + # test dependencies + - pytest>=7.3.2 + 
- pytest-cov + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 + + # pandas dependencies + - python-dateutil + - pip + + - pip: + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + - "--pre" + - "numpy" + - "tzdata>=2022.7" diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml new file mode 100644 index 0000000000000..2d3d11c294e12 --- /dev/null +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -0,0 +1,29 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer + - meson=1.2.1 + - cython>=0.29.33 + - meson-python=0.13.1 + + # test dependencies + - pytest>=7.3.2 + - pytest-cov + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 + + # required dependencies + - python-dateutil + - numpy + - pip + + - pip: + - "tzdata>=2022.7" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + - "--prefer-binary" + - "--pre" + - "pyarrow" diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml new file mode 100644 index 0000000000000..b6f515dceaea9 --- /dev/null +++ b/ci/deps/actions-311.yaml @@ -0,0 +1,62 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer + - cython>=0.29.33 + - meson=1.2.1 + - meson-python=0.13.1 + + # test dependencies + - pytest>=7.3.2 + - pytest-cov + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 + - boto3 + + # required dependencies + - python-dateutil + - numpy + + # optional dependencies + - beautifulsoup4>=4.11.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2023.10.0 + - fsspec>=2022.11.0 + - html5lib>=1.1 + - hypothesis>=6.84.0 + - gcsfs>=2022.11.0 + - jinja2>=3.1.2 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 + - odfpy>=1.4.1 + - qtpy>=2.3.0 + - pyqt>=5.15.9 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 + - pymysql>=1.0.2 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pytz>=2023.4 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0, <=2024.9.0 + - xlrd>=2.0.1 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 + + - pip: + - adbc-driver-postgresql>=0.10.0 + - adbc-driver-sqlite>=0.8.0 diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml new file mode 100644 index 0000000000000..bc66f8a5382c9 --- /dev/null +++ b/ci/deps/actions-312.yaml @@ -0,0 +1,63 @@ +name: pandas-dev-312 +channels: + - conda-forge +dependencies: + - python=3.12 + + # build dependencies + - versioneer + - cython>=0.29.33 + - meson=1.2.1 + - meson-python=0.13.1 + + # test dependencies + - pytest>=7.3.2 + - pytest-cov + - pytest-xdist>=3.4.0 + - pytest-localserver>=0.8.1 + - pytest-qt>=4.4.0 + - boto3 + + # required dependencies + - python-dateutil + - numpy + + # optional dependencies + - beautifulsoup4>=4.11.2 + - blosc>=1.21.3 + - bottleneck>=1.3.6 + - fastparquet>=2023.10.0 + - fsspec>=2022.11.0 + - html5lib>=1.1 + - hypothesis>=6.84.0 + - gcsfs>=2022.11.0 + - jinja2>=3.1.2 + - lxml>=4.9.2 + - matplotlib>=3.6.3 + - numba>=0.56.4 + - numexpr>=2.8.4 + - odfpy>=1.4.1 + - qtpy>=2.3.0 + - pyqt>=5.15.9 + - openpyxl>=3.1.0 + - psycopg2>=2.9.6 + - pyarrow>=10.0.1 + - pymysql>=1.0.2 + - pyreadstat>=1.2.0 + - pytables>=3.8.0 + - python-calamine>=0.1.7 + - pytz>=2023.4 + - pyxlsb>=1.0.10 + - s3fs>=2022.11.0 + - scipy>=1.10.0 + - sqlalchemy>=2.0.0 + - tabulate>=0.9.0 + - xarray>=2022.12.0, 
<=2024.9.0 + - xlrd>=2.0.1 + - xlsxwriter>=3.0.5 + - zstandard>=0.19.0 + + - pip: + - adbc-driver-postgresql>=0.10.0 + - adbc-driver-sqlite>=0.8.0 + - tzdata>=2022.7 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml deleted file mode 100644 index f89d4a743a6f1..0000000000000 --- a/ci/deps/actions-38-downstream_compat.yaml +++ /dev/null @@ -1,71 +0,0 @@ -# Non-dependencies that pandas utilizes or has compatibility with pandas objects -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8 - - # test dependencies - - cython>=0.29.30 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - psutil - - pytest-asyncio>=0.17 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # optional dependencies - - beautifulsoup4 - - blosc - - brotlipy - - bottleneck - - fastparquet - - fsspec - - html5lib - - hypothesis - - gcsfs - - jinja2 - - lxml - - matplotlib - - numba - - numexpr - - openpyxl - - odfpy - - pandas-gbq - - psycopg2 - - pyarrow - - pymysql - - pyreadstat - - pytables - - python-snappy - - pyxlsb - - s3fs - - scipy - - sqlalchemy - - tabulate - - xarray - - xlrd - - xlsxwriter - - xlwt - - zstandard - - # downstream packages - - aiobotocore - - botocore - - cftime - - dask - - ipython - - geopandas-base - - seaborn - - scikit-learn - - statsmodels - - coverage - - pandas-datareader - - pyyaml - - py - - pytorch diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml deleted file mode 100644 index a57c7279e2e9b..0000000000000 --- a/ci/deps/actions-38-minimum_versions.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Minimum version of required + optional dependencies -# Aligned with getting_started/install.rst and compat/_optional.py -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8.0 - - # test dependencies - - cython>=0.29.30 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - psutil - - pytest-asyncio>=0.17 - - boto3 - - # required dependencies - - python-dateutil=2.8.1 - - numpy=1.20.3 - - pytz=2020.1 - - # optional dependencies - - beautifulsoup4=4.9.3 - - blosc=1.21.0 - - bottleneck=1.3.2 - - brotlipy=0.7.0 - - fastparquet=0.4.0 - - fsspec=2021.05.0 - - html5lib=1.1 - - hypothesis=6.13.0 - - gcsfs=2021.05.0 - - jinja2=3.0.0 - - lxml=4.6.3 - - matplotlib=3.3.2 - - numba=0.53.1 - - numexpr=2.7.3 - - odfpy=1.4.1 - - openpyxl=3.0.7 - - pandas-gbq=0.15.0 - - psycopg2=2.8.6 - - pyarrow=1.0.1 - - pymysql=1.0.2 - - pyreadstat=1.1.2 - - pytables=3.6.1 - - python-snappy=0.6.0 - - pyxlsb=1.0.8 - - s3fs=2021.05.0 - - scipy=1.7.1 - - sqlalchemy=1.4.16 - - tabulate=0.8.9 - - xarray=0.19.0 - - xlrd=2.0.1 - - xlsxwriter=1.4.3 - - xlwt=1.3.0 - - zstandard=0.15.2 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml deleted file mode 100644 index a4473f5911903..0000000000000 --- a/ci/deps/actions-38.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8 - - # test dependencies - - cython>=0.29.30 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - psutil - - pytest-asyncio>=0.17 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # optional dependencies - - beautifulsoup4 - - blosc - - bottleneck - - brotlipy - - fastparquet - - fsspec - - html5lib - - hypothesis - - gcsfs - - jinja2 - - lxml - - matplotlib - - numba - - numexpr - - openpyxl - - odfpy - - pandas-gbq - - psycopg2 - - pyarrow - - pymysql - - pyreadstat - - pytables - - python-snappy 
- - pyxlsb - - s3fs - - scipy - - sqlalchemy - - tabulate - - xarray - - xlrd - - xlsxwriter - - xlwt - - zstandard diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml deleted file mode 100644 index 8605a9f4520d7..0000000000000 --- a/ci/deps/actions-39.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.9 - - # test dependencies - - cython>=0.29.30 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - psutil - - pytest-asyncio>=0.17 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # optional dependencies - - beautifulsoup4 - - blosc - - bottleneck - - brotlipy - - fastparquet - - fsspec - - html5lib - - hypothesis - - gcsfs - - jinja2 - - lxml - - matplotlib - - numba - - numexpr - - openpyxl - - odfpy - - pandas-gbq - - psycopg2 - - pymysql - - pyarrow - - pyreadstat - - pytables - - python-snappy - - pyxlsb - - s3fs - - scipy - - sqlalchemy - - tabulate - - xarray - - xlrd - - xlsxwriter - - xlwt - - zstandard diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-39.yaml similarity index 51% rename from ci/deps/actions-pypy-38.yaml rename to ci/deps/actions-pypy-39.yaml index 1a3c73cb4ae2f..90933b24b88db 100644 --- a/ci/deps/actions-pypy-38.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -5,17 +5,22 @@ dependencies: # TODO: Add the rest of the dependencies in here # once the other plentiful failures/segfaults # with base pandas has been dealt with - - python=3.8[build=*_pypy] # TODO: use this once pypy3.8 is available + - python=3.9[build=*_pypy] - # tools - - cython>=0.29.30 - - pytest>=6.0 + # build dependencies + - versioneer + - cython>=0.29.33 + - meson=1.2.1 + - meson-python=0.13.1 + + # test dependencies + - pytest>=7.3.2 - pytest-cov - - pytest-asyncio - - pytest-xdist>=1.31 - - hypothesis>=5.5.3 + - pytest-xdist>=3.4.0 + - hypothesis>=6.84.0 # required - numpy - python-dateutil - - pytz + - pip: + - tzdata>=2022.7 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml deleted file mode 100644 index e76b3071bd8bb..0000000000000 --- a/ci/deps/circle-38-arm64.yaml +++ /dev/null @@ -1,55 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8 - - # test dependencies - - cython>=0.29.30 - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - psutil - - pytest-asyncio>=0.17 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # optional dependencies - - beautifulsoup4 - - blosc - - bottleneck - - brotlipy - - fastparquet - - fsspec - - html5lib - - hypothesis - - gcsfs - - jinja2 - - lxml - - matplotlib - - numba - - numexpr - - openpyxl - - odfpy - - pandas-gbq - - psycopg2 - - pyarrow - - pymysql - # Not provided on ARM - #- pyreadstat - - pytables - - python-snappy - - pyxlsb - - s3fs - - scipy - - sqlalchemy - - tabulate - - xarray - - xlrd - - xlsxwriter - - xlwt - - zstandard diff --git a/ci/meta.yaml b/ci/meta.yaml new file mode 100644 index 0000000000000..9d434991b12c1 --- /dev/null +++ b/ci/meta.yaml @@ -0,0 +1,92 @@ +{% set version = "2.0.1" %} + +package: + name: pandas + version: {{ version }} + +source: + git_url: ../.. + +build: + number: 1 + script: + - export PYTHONUNBUFFERED=1 # [ppc64le] + - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . # [not unix] + - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . 
--global-option="build_ext" --global-option="-j4" --no-use-pep517 # [unix] + skip: true # [py<39] + +requirements: + build: + - python # [build_platform != target_platform] + - cross-python_{{ target_platform }} # [build_platform != target_platform] + - cython # [build_platform != target_platform] + - numpy # [build_platform != target_platform] + - {{ compiler('c') }} + - {{ compiler('cxx') }} + host: + - python + - pip + - setuptools >=61.0.0 + - cython >=0.29.33,<3 + - numpy >=1.21.6 # [py<311] + - numpy >=1.23.2 # [py>=311] + - versioneer + - tomli # [py<311] + run: + - python + - numpy >=1.21.6 # [py<311] + - numpy >=1.23.2 # [py>=311] + - python-dateutil >=2.8.2 + - python-tzdata >=2022.7 + +test: + imports: + - pandas + commands: + - pip check + # Skip test suite on PyPy as it segfaults there + # xref: https://github.com/conda-forge/pandas-feedstock/issues/148 + # + # Also skip `test_rolling_var_numerical_issues` on `ppc64le` as it is a known test failure. + # xref: https://github.com/conda-forge/pandas-feedstock/issues/149 + {% set markers = ["not clipboard", "not single_cpu", "not db", "not network", "not slow"] %} + {% set markers = markers + ["not arm_slow"] %} # [aarch64 or ppc64le] + {% set extra_args = ["-n=2 -m " + " and ".join(markers)] %} + {% set tests_to_skip = "_not_a_real_test" %} + {% set tests_to_skip = tests_to_skip + " or test_rolling_var_numerical_issues" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_std_timedelta64_skipna_false" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_value_counts_normalized[M8[ns]]" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_to_datetime_format_YYYYMMDD_with_nat" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or (TestReductions and test_median_2d)" %} # [ppc64le] + {% set extra_args = extra_args + ["-k", "not (" + tests_to_skip + ")"] %} + - python -c "import pandas; pandas.test(extra_args={{ extra_args }})" # [python_impl == "cpython"] + requires: + - pip + - pytest >=7.3.2 + - pytest-xdist >=3.4.0 + - pytest-cov + - hypothesis >=6.84.0 + - tomli # [py<311] + +about: + home: http://pandas.pydata.org + license: BSD-3-Clause + license_file: LICENSE + summary: Powerful data structures for data analysis, time series, and statistics + doc_url: https://pandas.pydata.org/docs/ + dev_url: https://github.com/pandas-dev/pandas + +extra: + recipe-maintainers: + - jreback + - jorisvandenbossche + - msarahan + - ocefpaf + - TomAugspurger + - WillAyd + - simonjayhawkins + - mroeschke + - datapythonista + - phofl + - lithomas1 + - marcogorelli diff --git a/ci/run_tests.sh b/ci/run_tests.sh index e6de5caf955fc..16292beec612b 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -3,47 +3,16 @@ # Workaround for pytest-xdist (it collects different tests in the workers if PYTHONHASHSEED is not set) # https://github.com/pytest-dev/pytest/issues/920 # https://github.com/pytest-dev/pytest/issues/1075 -export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') +export PYTHONHASHSEED -# May help reproduce flaky CI builds if set in subsequent runs -echo PYTHONHASHSEED=$PYTHONHASHSEED +COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -if [[ "not network" == *"$PATTERN"* ]]; then - export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; -fi - -if [[ "$COVERAGE" == "true" ]]; then - COVERAGE="-s --cov=pandas --cov-report=xml --cov-append" -else 
- COVERAGE="" # We need to reset this for COVERAGE="false" case -fi - -# If no X server is found, we use xvfb to emulate it -if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then - export DISPLAY=":0" - XVFB="xvfb-run " -fi - -PYTEST_CMD="${XVFB}pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fE -n $PYTEST_WORKERS --dist=worksteal $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" fi -echo $PYTEST_CMD +echo "$PYTEST_CMD" sh -c "$PYTEST_CMD" - -if [[ "$PANDAS_DATA_MANAGER" != "array" && "$PYTEST_TARGET" == "pandas" ]]; then - # The ArrayManager tests should have already been run by PYTEST_CMD if PANDAS_DATA_MANAGER was already set to array - # If we're targeting specific files, e.g. test_downstream.py, don't run. - PYTEST_AM_CMD="PANDAS_DATA_MANAGER=array pytest -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas" - - if [[ "$PATTERN" ]]; then - PYTEST_AM_CMD="$PYTEST_AM_CMD -m \"$PATTERN and arraymanager\"" - else - PYTEST_AM_CMD="$PYTEST_AM_CMD -m \"arraymanager\"" - fi - - echo $PYTEST_AM_CMD - sh -c "$PYTEST_AM_CMD" -fi diff --git a/ci/upload_wheels.sh b/ci/upload_wheels.sh new file mode 100644 index 0000000000000..c7c7ca00ee466 --- /dev/null +++ b/ci/upload_wheels.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Modified from numpy's https://github.com/numpy/numpy/blob/main/tools/wheels/upload_wheels.sh + +set_upload_vars() { + echo "IS_PUSH is $IS_PUSH" + echo "IS_SCHEDULE_DISPATCH is $IS_SCHEDULE_DISPATCH" + if [[ "$IS_PUSH" == "true" ]]; then + echo push and tag event + export ANACONDA_ORG="multibuild-wheels-staging" + export TOKEN="$PANDAS_STAGING_UPLOAD_TOKEN" + export ANACONDA_UPLOAD="true" + elif [[ "$IS_SCHEDULE_DISPATCH" == "true" ]]; then + echo scheduled or dispatched event + export ANACONDA_ORG="scientific-python-nightly-wheels" + export TOKEN="$PANDAS_NIGHTLY_UPLOAD_TOKEN" + export ANACONDA_UPLOAD="true" + else + echo non-dispatch event + export ANACONDA_UPLOAD="false" + fi +} +upload_wheels() { + echo "${PWD}" + if [[ ${ANACONDA_UPLOAD} == true ]]; then + if [ -z "${TOKEN}" ]; then + echo no token set, not uploading + else + # sdists are located under dist folder when built through setup.py + if compgen -G "./dist/*.gz"; then + echo "Found sdist" + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./dist/*.gz + echo "Uploaded sdist" + fi + if compgen -G "./wheelhouse/*.whl"; then + echo "Found wheel" + anaconda -q -t "${TOKEN}" upload --skip -u "${ANACONDA_ORG}" ./wheelhouse/*.whl + echo "Uploaded wheel" + fi + echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" + fi + fi +} diff --git a/doc/_templates/autosummary/class.rst b/doc/_templates/autosummary/class.rst index a9c9bd2b6507f..79c2e37b0192f 100644 --- a/doc/_templates/autosummary/class.rst +++ b/doc/_templates/autosummary/class.rst @@ -1,33 +1,32 @@ -{% extends "!autosummary/class.rst" %} +{{ fullname | escape | underline}} -{% block methods %} -{% if methods %} +.. currentmodule:: {{ module }} -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. - .. autosummary:: - :toctree: - {% for item in all_methods %} - {%- if not item.startswith('_') or item in ['__call__'] %} - {{ name }}.{{ item }} - {%- endif -%} - {%- endfor %} +.. 
autoclass:: {{ objname }} -{% endif %} -{% endblock %} + {% block methods %} -{% block attributes %} -{% if attributes %} + {% block attributes %} + {% if attributes %} + .. rubric:: {{ _('Attributes') }} -.. - HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. .. autosummary:: - :toctree: - {% for item in all_attributes %} - {%- if not item.startswith('_') %} - {{ name }}.{{ item }} - {%- endif -%} - {%- endfor %} + {% for item in attributes %} + {% if item in members and not item.startswith('_') %} + ~{{ name }}.{{ item }} + {% endif %} + {%- endfor %} + {% endif %} + {% endblock %} + + {% if methods %} + .. rubric:: {{ _('Methods') }} -{% endif %} -{% endblock %} + .. autosummary:: + {% for item in methods %} + {% if item in members and (not item.startswith('_') or item in ['__call__']) %} + ~{{ name }}.{{ item }} + {% endif %} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/doc/_templates/pandas_footer.html b/doc/_templates/pandas_footer.html new file mode 100644 index 0000000000000..6d8caa4d6c741 --- /dev/null +++ b/doc/_templates/pandas_footer.html @@ -0,0 +1,3 @@ + diff --git a/doc/_templates/sidebar-nav-bs.html b/doc/_templates/sidebar-nav-bs.html index 7e0043e771e72..8298b66568e20 100644 --- a/doc/_templates/sidebar-nav-bs.html +++ b/doc/_templates/sidebar-nav-bs.html @@ -1,9 +1,9 @@ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf new file mode 100644 index 0000000000000..ea356385e9fb1 Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx new file mode 100644 index 0000000000000..1b812c1a2595a Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_FA.pptx differ diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md new file mode 100644 index 0000000000000..b8599acff2f6e --- /dev/null +++ b/doc/cheatsheet/README.md @@ -0,0 +1,24 @@ +# Pandas Cheat Sheet + +The Pandas Cheat Sheet was created using Microsoft PowerPoint 2013. +To create the PDF version, within PowerPoint, simply do a "Save As" +and pick "PDF" as the format. + +This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](https://www.princetonoptimization.com/), was inspired by the [RStudio Data Wrangling Cheatsheet](https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf). + +| Topic | Language | PDF | PPT | |------------------------|-------------|-----|-----| | Pandas_Cheat_Sheet | English | | | | Pandas_Cheat_Sheet_JA | Japanese | | | | Pandas_Cheat_Sheet_FA | Persian | | | + + + +**Alternative** + +Alternatively, if you want to complement your learning, you can use the Pandas cheat sheets +developed by [DataCamp](https://www.datacamp.com/), available in PDF, Google Colab, and Streamlit formats.
+ +| Topic | PDF | Streamlit | Google Colab | +|-------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Pandas | | | Open In Colab | diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt deleted file mode 100644 index c57da38b31777..0000000000000 --- a/doc/cheatsheet/README.txt +++ /dev/null @@ -1,8 +0,0 @@ -The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. -To create the PDF version, within Powerpoint, simply do a "Save As" -and pick "PDF" as the format. - -This cheat sheet was inspired by the RStudio Data Wrangling Cheatsheet[1], written by Irv Lustig, Princeton Consultants[2]. - -[1]: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf -[2]: https://www.princetonoptimization.com/ diff --git a/doc/data/tips.csv b/doc/data/tips.csv deleted file mode 100644 index 856a65a69e647..0000000000000 --- a/doc/data/tips.csv +++ /dev/null @@ -1,245 +0,0 @@ -total_bill,tip,sex,smoker,day,time,size -16.99,1.01,Female,No,Sun,Dinner,2 -10.34,1.66,Male,No,Sun,Dinner,3 -21.01,3.5,Male,No,Sun,Dinner,3 -23.68,3.31,Male,No,Sun,Dinner,2 -24.59,3.61,Female,No,Sun,Dinner,4 -25.29,4.71,Male,No,Sun,Dinner,4 -8.77,2.0,Male,No,Sun,Dinner,2 -26.88,3.12,Male,No,Sun,Dinner,4 -15.04,1.96,Male,No,Sun,Dinner,2 -14.78,3.23,Male,No,Sun,Dinner,2 -10.27,1.71,Male,No,Sun,Dinner,2 -35.26,5.0,Female,No,Sun,Dinner,4 -15.42,1.57,Male,No,Sun,Dinner,2 -18.43,3.0,Male,No,Sun,Dinner,4 -14.83,3.02,Female,No,Sun,Dinner,2 -21.58,3.92,Male,No,Sun,Dinner,2 -10.33,1.67,Female,No,Sun,Dinner,3 -16.29,3.71,Male,No,Sun,Dinner,3 -16.97,3.5,Female,No,Sun,Dinner,3 -20.65,3.35,Male,No,Sat,Dinner,3 -17.92,4.08,Male,No,Sat,Dinner,2 -20.29,2.75,Female,No,Sat,Dinner,2 -15.77,2.23,Female,No,Sat,Dinner,2 -39.42,7.58,Male,No,Sat,Dinner,4 -19.82,3.18,Male,No,Sat,Dinner,2 -17.81,2.34,Male,No,Sat,Dinner,4 -13.37,2.0,Male,No,Sat,Dinner,2 -12.69,2.0,Male,No,Sat,Dinner,2 -21.7,4.3,Male,No,Sat,Dinner,2 -19.65,3.0,Female,No,Sat,Dinner,2 -9.55,1.45,Male,No,Sat,Dinner,2 -18.35,2.5,Male,No,Sat,Dinner,4 -15.06,3.0,Female,No,Sat,Dinner,2 -20.69,2.45,Female,No,Sat,Dinner,4 -17.78,3.27,Male,No,Sat,Dinner,2 -24.06,3.6,Male,No,Sat,Dinner,3 -16.31,2.0,Male,No,Sat,Dinner,3 -16.93,3.07,Female,No,Sat,Dinner,3 -18.69,2.31,Male,No,Sat,Dinner,3 -31.27,5.0,Male,No,Sat,Dinner,3 -16.04,2.24,Male,No,Sat,Dinner,3 -17.46,2.54,Male,No,Sun,Dinner,2 -13.94,3.06,Male,No,Sun,Dinner,2 -9.68,1.32,Male,No,Sun,Dinner,2 -30.4,5.6,Male,No,Sun,Dinner,4 -18.29,3.0,Male,No,Sun,Dinner,2 -22.23,5.0,Male,No,Sun,Dinner,2 -32.4,6.0,Male,No,Sun,Dinner,4 -28.55,2.05,Male,No,Sun,Dinner,3 -18.04,3.0,Male,No,Sun,Dinner,2 -12.54,2.5,Male,No,Sun,Dinner,2 -10.29,2.6,Female,No,Sun,Dinner,2 -34.81,5.2,Female,No,Sun,Dinner,4 -9.94,1.56,Male,No,Sun,Dinner,2 -25.56,4.34,Male,No,Sun,Dinner,4 -19.49,3.51,Male,No,Sun,Dinner,2 -38.01,3.0,Male,Yes,Sat,Dinner,4 -26.41,1.5,Female,No,Sat,Dinner,2 -11.24,1.76,Male,Yes,Sat,Dinner,2 -48.27,6.73,Male,No,Sat,Dinner,4 
-20.29,3.21,Male,Yes,Sat,Dinner,2 -13.81,2.0,Male,Yes,Sat,Dinner,2 -11.02,1.98,Male,Yes,Sat,Dinner,2 -18.29,3.76,Male,Yes,Sat,Dinner,4 -17.59,2.64,Male,No,Sat,Dinner,3 -20.08,3.15,Male,No,Sat,Dinner,3 -16.45,2.47,Female,No,Sat,Dinner,2 -3.07,1.0,Female,Yes,Sat,Dinner,1 -20.23,2.01,Male,No,Sat,Dinner,2 -15.01,2.09,Male,Yes,Sat,Dinner,2 -12.02,1.97,Male,No,Sat,Dinner,2 -17.07,3.0,Female,No,Sat,Dinner,3 -26.86,3.14,Female,Yes,Sat,Dinner,2 -25.28,5.0,Female,Yes,Sat,Dinner,2 -14.73,2.2,Female,No,Sat,Dinner,2 -10.51,1.25,Male,No,Sat,Dinner,2 -17.92,3.08,Male,Yes,Sat,Dinner,2 -27.2,4.0,Male,No,Thur,Lunch,4 -22.76,3.0,Male,No,Thur,Lunch,2 -17.29,2.71,Male,No,Thur,Lunch,2 -19.44,3.0,Male,Yes,Thur,Lunch,2 -16.66,3.4,Male,No,Thur,Lunch,2 -10.07,1.83,Female,No,Thur,Lunch,1 -32.68,5.0,Male,Yes,Thur,Lunch,2 -15.98,2.03,Male,No,Thur,Lunch,2 -34.83,5.17,Female,No,Thur,Lunch,4 -13.03,2.0,Male,No,Thur,Lunch,2 -18.28,4.0,Male,No,Thur,Lunch,2 -24.71,5.85,Male,No,Thur,Lunch,2 -21.16,3.0,Male,No,Thur,Lunch,2 -28.97,3.0,Male,Yes,Fri,Dinner,2 -22.49,3.5,Male,No,Fri,Dinner,2 -5.75,1.0,Female,Yes,Fri,Dinner,2 -16.32,4.3,Female,Yes,Fri,Dinner,2 -22.75,3.25,Female,No,Fri,Dinner,2 -40.17,4.73,Male,Yes,Fri,Dinner,4 -27.28,4.0,Male,Yes,Fri,Dinner,2 -12.03,1.5,Male,Yes,Fri,Dinner,2 -21.01,3.0,Male,Yes,Fri,Dinner,2 -12.46,1.5,Male,No,Fri,Dinner,2 -11.35,2.5,Female,Yes,Fri,Dinner,2 -15.38,3.0,Female,Yes,Fri,Dinner,2 -44.3,2.5,Female,Yes,Sat,Dinner,3 -22.42,3.48,Female,Yes,Sat,Dinner,2 -20.92,4.08,Female,No,Sat,Dinner,2 -15.36,1.64,Male,Yes,Sat,Dinner,2 -20.49,4.06,Male,Yes,Sat,Dinner,2 -25.21,4.29,Male,Yes,Sat,Dinner,2 -18.24,3.76,Male,No,Sat,Dinner,2 -14.31,4.0,Female,Yes,Sat,Dinner,2 -14.0,3.0,Male,No,Sat,Dinner,2 -7.25,1.0,Female,No,Sat,Dinner,1 -38.07,4.0,Male,No,Sun,Dinner,3 -23.95,2.55,Male,No,Sun,Dinner,2 -25.71,4.0,Female,No,Sun,Dinner,3 -17.31,3.5,Female,No,Sun,Dinner,2 -29.93,5.07,Male,No,Sun,Dinner,4 -10.65,1.5,Female,No,Thur,Lunch,2 -12.43,1.8,Female,No,Thur,Lunch,2 -24.08,2.92,Female,No,Thur,Lunch,4 -11.69,2.31,Male,No,Thur,Lunch,2 -13.42,1.68,Female,No,Thur,Lunch,2 -14.26,2.5,Male,No,Thur,Lunch,2 -15.95,2.0,Male,No,Thur,Lunch,2 -12.48,2.52,Female,No,Thur,Lunch,2 -29.8,4.2,Female,No,Thur,Lunch,6 -8.52,1.48,Male,No,Thur,Lunch,2 -14.52,2.0,Female,No,Thur,Lunch,2 -11.38,2.0,Female,No,Thur,Lunch,2 -22.82,2.18,Male,No,Thur,Lunch,3 -19.08,1.5,Male,No,Thur,Lunch,2 -20.27,2.83,Female,No,Thur,Lunch,2 -11.17,1.5,Female,No,Thur,Lunch,2 -12.26,2.0,Female,No,Thur,Lunch,2 -18.26,3.25,Female,No,Thur,Lunch,2 -8.51,1.25,Female,No,Thur,Lunch,2 -10.33,2.0,Female,No,Thur,Lunch,2 -14.15,2.0,Female,No,Thur,Lunch,2 -16.0,2.0,Male,Yes,Thur,Lunch,2 -13.16,2.75,Female,No,Thur,Lunch,2 -17.47,3.5,Female,No,Thur,Lunch,2 -34.3,6.7,Male,No,Thur,Lunch,6 -41.19,5.0,Male,No,Thur,Lunch,5 -27.05,5.0,Female,No,Thur,Lunch,6 -16.43,2.3,Female,No,Thur,Lunch,2 -8.35,1.5,Female,No,Thur,Lunch,2 -18.64,1.36,Female,No,Thur,Lunch,3 -11.87,1.63,Female,No,Thur,Lunch,2 -9.78,1.73,Male,No,Thur,Lunch,2 -7.51,2.0,Male,No,Thur,Lunch,2 -14.07,2.5,Male,No,Sun,Dinner,2 -13.13,2.0,Male,No,Sun,Dinner,2 -17.26,2.74,Male,No,Sun,Dinner,3 -24.55,2.0,Male,No,Sun,Dinner,4 -19.77,2.0,Male,No,Sun,Dinner,4 -29.85,5.14,Female,No,Sun,Dinner,5 -48.17,5.0,Male,No,Sun,Dinner,6 -25.0,3.75,Female,No,Sun,Dinner,4 -13.39,2.61,Female,No,Sun,Dinner,2 -16.49,2.0,Male,No,Sun,Dinner,4 -21.5,3.5,Male,No,Sun,Dinner,4 -12.66,2.5,Male,No,Sun,Dinner,2 -16.21,2.0,Female,No,Sun,Dinner,3 -13.81,2.0,Male,No,Sun,Dinner,2 -17.51,3.0,Female,Yes,Sun,Dinner,2 -24.52,3.48,Male,No,Sun,Dinner,3 
-20.76,2.24,Male,No,Sun,Dinner,2 -31.71,4.5,Male,No,Sun,Dinner,4 -10.59,1.61,Female,Yes,Sat,Dinner,2 -10.63,2.0,Female,Yes,Sat,Dinner,2 -50.81,10.0,Male,Yes,Sat,Dinner,3 -15.81,3.16,Male,Yes,Sat,Dinner,2 -7.25,5.15,Male,Yes,Sun,Dinner,2 -31.85,3.18,Male,Yes,Sun,Dinner,2 -16.82,4.0,Male,Yes,Sun,Dinner,2 -32.9,3.11,Male,Yes,Sun,Dinner,2 -17.89,2.0,Male,Yes,Sun,Dinner,2 -14.48,2.0,Male,Yes,Sun,Dinner,2 -9.6,4.0,Female,Yes,Sun,Dinner,2 -34.63,3.55,Male,Yes,Sun,Dinner,2 -34.65,3.68,Male,Yes,Sun,Dinner,4 -23.33,5.65,Male,Yes,Sun,Dinner,2 -45.35,3.5,Male,Yes,Sun,Dinner,3 -23.17,6.5,Male,Yes,Sun,Dinner,4 -40.55,3.0,Male,Yes,Sun,Dinner,2 -20.69,5.0,Male,No,Sun,Dinner,5 -20.9,3.5,Female,Yes,Sun,Dinner,3 -30.46,2.0,Male,Yes,Sun,Dinner,5 -18.15,3.5,Female,Yes,Sun,Dinner,3 -23.1,4.0,Male,Yes,Sun,Dinner,3 -15.69,1.5,Male,Yes,Sun,Dinner,2 -19.81,4.19,Female,Yes,Thur,Lunch,2 -28.44,2.56,Male,Yes,Thur,Lunch,2 -15.48,2.02,Male,Yes,Thur,Lunch,2 -16.58,4.0,Male,Yes,Thur,Lunch,2 -7.56,1.44,Male,No,Thur,Lunch,2 -10.34,2.0,Male,Yes,Thur,Lunch,2 -43.11,5.0,Female,Yes,Thur,Lunch,4 -13.0,2.0,Female,Yes,Thur,Lunch,2 -13.51,2.0,Male,Yes,Thur,Lunch,2 -18.71,4.0,Male,Yes,Thur,Lunch,3 -12.74,2.01,Female,Yes,Thur,Lunch,2 -13.0,2.0,Female,Yes,Thur,Lunch,2 -16.4,2.5,Female,Yes,Thur,Lunch,2 -20.53,4.0,Male,Yes,Thur,Lunch,4 -16.47,3.23,Female,Yes,Thur,Lunch,3 -26.59,3.41,Male,Yes,Sat,Dinner,3 -38.73,3.0,Male,Yes,Sat,Dinner,4 -24.27,2.03,Male,Yes,Sat,Dinner,2 -12.76,2.23,Female,Yes,Sat,Dinner,2 -30.06,2.0,Male,Yes,Sat,Dinner,3 -25.89,5.16,Male,Yes,Sat,Dinner,4 -48.33,9.0,Male,No,Sat,Dinner,4 -13.27,2.5,Female,Yes,Sat,Dinner,2 -28.17,6.5,Female,Yes,Sat,Dinner,3 -12.9,1.1,Female,Yes,Sat,Dinner,2 -28.15,3.0,Male,Yes,Sat,Dinner,5 -11.59,1.5,Male,Yes,Sat,Dinner,2 -7.74,1.44,Male,Yes,Sat,Dinner,2 -30.14,3.09,Female,Yes,Sat,Dinner,4 -12.16,2.2,Male,Yes,Fri,Lunch,2 -13.42,3.48,Female,Yes,Fri,Lunch,2 -8.58,1.92,Male,Yes,Fri,Lunch,1 -15.98,3.0,Female,No,Fri,Lunch,3 -13.42,1.58,Male,Yes,Fri,Lunch,2 -16.27,2.5,Female,Yes,Fri,Lunch,2 -10.09,2.0,Female,Yes,Fri,Lunch,2 -20.45,3.0,Male,No,Sat,Dinner,4 -13.28,2.72,Male,No,Sat,Dinner,2 -22.12,2.88,Female,Yes,Sat,Dinner,2 -24.01,2.0,Male,Yes,Sat,Dinner,4 -15.69,3.0,Male,Yes,Sat,Dinner,3 -11.61,3.39,Male,No,Sat,Dinner,2 -10.77,1.47,Male,No,Sat,Dinner,2 -15.53,3.0,Male,Yes,Sat,Dinner,2 -10.07,1.25,Male,No,Sat,Dinner,2 -12.6,1.0,Male,Yes,Sat,Dinner,2 -32.83,1.17,Male,Yes,Sat,Dinner,2 -35.83,4.67,Female,No,Sat,Dinner,3 -29.03,5.92,Male,No,Sat,Dinner,3 -27.18,2.0,Female,Yes,Sat,Dinner,2 -22.67,2.0,Male,Yes,Sat,Dinner,2 -17.82,1.75,Male,No,Sat,Dinner,2 -18.78,3.0,Female,No,Thur,Dinner,2 diff --git a/doc/data/titanic.csv b/doc/data/titanic.csv index 5cc466e97cf12..0f7d184728a17 100644 --- a/doc/data/titanic.csv +++ b/doc/data/titanic.csv @@ -1,93 +1,93 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S 2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C -3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +3,1,3,"Heikkinen, Miss Laina",female,26,0,0,STON/O2. 3101282,7.925,,S 4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S 5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S 6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q 7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S -8,0,3,"Palsson, Master. 
Gosta Leonard",male,2,3,1,349909,21.075,,S +8,0,3,"Palsson, Master Gosta Leonard",male,2,3,1,349909,21.075,,S 9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S 10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C -11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S -12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S +11,1,3,"Sandstrom, Miss Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S +12,1,1,"Bonnell, Miss Elizabeth",female,58,0,0,113783,26.55,C103,S 13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S 14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S -15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S +15,0,3,"Vestrom, Miss Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S 16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S -17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q +17,0,3,"Rice, Master Eugene",male,2,4,1,382652,29.125,,Q 18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S 19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S 20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C 21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S 22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S -23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q +23,1,3,"McGowan, Miss Anna ""Annie""",female,15,0,0,330923,8.0292,,Q 24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S -25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S +25,0,3,"Palsson, Miss Torborg Danira",female,8,3,1,349909,21.075,,S 26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S 27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C 28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S -29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q +29,1,3,"O'Dwyer, Miss Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q 30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S 31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C 32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C -33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q +33,1,3,"Glynn, Miss Mary Agatha",female,,0,0,335677,7.75,,Q 34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S 35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C 36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S 37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C 38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S -39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S -40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C +39,0,3,"Vander Planke, Miss Augusta Maria",female,18,2,0,345764,18,,S +40,1,3,"Nicola-Yarred, Miss Jamila",female,14,1,0,2651,11.2417,,C 41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S 42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S 43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C -44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C -45,1,3,"Devaney, Miss. 
Margaret Delia",female,19,0,0,330958,7.8792,,Q +44,1,2,"Laroche, Miss Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C +45,1,3,"Devaney, Miss Margaret Delia",female,19,0,0,330958,7.8792,,Q 46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S 47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q -48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q +48,1,3,"O'Driscoll, Miss Bridget",female,,0,0,14311,7.75,,Q 49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C 50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S -51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S +51,0,3,"Panula, Master Juha Niilo",male,7,4,1,3101295,39.6875,,S 52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S 53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C 54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S 55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C 56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S -57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S +57,1,2,"Rugg, Miss Emily",female,21,0,0,C.A. 31026,10.5,,S 58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C -59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S -60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S +59,1,2,"West, Miss Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S +60,0,3,"Goodwin, Master William Frederick",male,11,5,2,CA 2144,46.9,,S 61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C -62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, +62,1,1,"Icard, Miss Amelie",female,38,0,0,113572,80,B28, 63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S -64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S +64,0,3,"Skoog, Master Harald",male,4,3,2,347088,27.9,,S 65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C -66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C +66,1,3,"Moubarek, Master Gerios",male,,1,1,2661,15.2458,,C 67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S 68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S -69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S +69,1,3,"Andersson, Miss Erna Alexandra",female,17,4,2,3101281,7.925,,S 70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S 71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S -72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S +72,0,3,"Goodwin, Miss Lillian Amy",female,16,5,2,CA 2144,46.9,,S 73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S 74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C 75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S 76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S 77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S 78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S -79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S -80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S +79,1,2,"Caldwell, Master Alden Gates",male,0.83,0,2,248738,29,,S +80,1,3,"Dowdell, Miss Elizabeth",female,30,0,0,364516,12.475,,S 81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S 82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S -83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q +83,1,3,"McDermott, Miss Brigdet Delia",female,,0,0,330932,7.7875,,Q 84,0,1,"Carrau, Mr. 
Francisco M",male,28,0,0,113059,47.1,,S -85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S +85,1,2,"Ilett, Miss Bertha",female,17,0,0,SO/C 14885,10.5,,S 86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S 87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S 88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S -89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S +89,1,1,"Fortune, Miss Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S 90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S 91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S 92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S @@ -99,35 +99,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C 99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S 100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S -101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S +101,0,3,"Petranec, Miss Matilda",female,28,0,0,349245,7.8958,,S 102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S 103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S 104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S 105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S 106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S -107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S +107,1,3,"Salkjelsvik, Miss Anna Kristine",female,21,0,0,343120,7.65,,S 108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S 109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S -110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q +110,1,3,"Moran, Miss Bertha",female,,1,0,371110,24.15,,Q 111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S -112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C +112,0,3,"Zabour, Miss Hileni",female,14.5,1,0,2665,14.4542,,C 113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S -114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S -115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C +114,0,3,"Jussila, Miss Katriina",female,20,1,0,4136,9.825,,S +115,0,3,"Attalah, Miss Malake",female,17,0,0,2627,14.4583,,C 116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S 117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q 118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S 119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C -120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S +120,0,3,"Andersson, Miss Ellis Anna Maria",female,2,4,2,347082,31.275,,S 121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S 122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S 123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C -124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S +124,1,2,"Webber, Miss Susan",female,32.5,0,0,27267,13,E101,S 125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S -126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C +126,1,3,"Nicola-Yarred, Master Elias",male,12,1,0,2651,11.2417,,C 127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q 128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S -129,1,3,"Peter, Miss. 
Anna",female,,1,1,2668,22.3583,F E69,C +129,1,3,"Peter, Miss Anna",female,,1,1,2668,22.3583,F E69,C 130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S 131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C 132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S @@ -135,18 +135,18 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S 135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S 136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C -137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S +137,1,1,"Newsom, Miss Helen Monypeny",female,19,0,2,11752,26.2833,D47,S 138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S 139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S 140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C 141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C -142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S +142,1,3,"Nysten, Miss Anna Sofia",female,22,0,0,347081,7.75,,S 143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S 144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q 145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S 146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S 147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S -148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S +148,0,3,"Ford, Miss Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S 149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S 150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S 151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S @@ -155,35 +155,35 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S 155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S 156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C -157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q +157,1,3,"Gilnagh, Miss Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q 158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S 159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S -160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S +160,0,3,"Sage, Master Thomas Henry",male,,8,2,CA. 2343,69.55,,S 161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S 162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S 163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S 164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S -165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S -166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S +165,0,3,"Panula, Master Eino Viljami",male,1,4,1,3101295,39.6875,,S +166,1,3,"Goldsmith, Master Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S 167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S 168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S 169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S 170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S 171,0,1,"Van der hoef, Mr. 
Wyckoff",male,61,0,0,111240,33.5,B19,S -172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q -173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S +172,0,3,"Rice, Master Arthur",male,4,4,1,382652,29.125,,Q +173,1,3,"Johnson, Miss Eleanor Ileen",female,1,1,1,347742,11.1333,,S 174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S 175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C 176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S -177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S -178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C +177,0,3,"Lefebre, Master Henry Forbes",male,,3,1,4133,25.4667,,S +178,0,1,"Isham, Miss Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C 179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S 180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S -181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +181,0,3,"Sage, Miss Constance Gladys",female,,8,2,CA. 2343,69.55,,S 182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C -183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S -184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S -185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S +183,0,3,"Asplund, Master Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S +184,1,2,"Becker, Master Richard F",male,1,2,1,230136,39,F4,S +185,1,3,"Kink-Heilmann, Miss Luise Gretchen",female,4,0,2,315153,22.025,,S 186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S 187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q 188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S @@ -191,33 +191,33 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S 191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S 192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S -193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S -194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S +193,1,3,"Andersen-Jensen, Miss Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S +194,1,2,"Navratil, Master Michel M",male,3,1,1,230080,26,F2,S 195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C -196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C +196,1,1,"Lurette, Miss Elise",female,58,0,0,PC 17569,146.5208,B80,C 197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q 198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S -199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q -200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S +199,1,3,"Madigan, Miss Margaret ""Maggie""",female,,0,0,370370,7.75,,Q +200,0,2,"Yrois, Miss Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S 201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S 202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S 203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S 204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C 205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S -206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S +206,0,3,"Strom, Miss Telma Matilda",female,2,0,1,347054,10.4625,G6,S 207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S 208,1,3,"Albimona, Mr. 
Nassef Cassem",male,26,0,0,2699,18.7875,,C -209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q +209,1,3,"Carr, Miss Helen ""Ellen""",female,16,0,0,367231,7.75,,Q 210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C 211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S -212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S +212,1,2,"Cameron, Miss Clear Annie",female,35,0,0,F.C.C. 13528,21,,S 213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S 214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S 215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q -216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C -217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S +216,1,1,"Newell, Miss Madeleine",female,31,1,0,35273,113.275,D36,C +217,1,3,"Honkanen, Miss Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S 218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S -219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C +219,1,1,"Bazzani, Miss Albina",female,32,0,0,11813,76.2917,D15,C 220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S 221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S 222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S @@ -228,24 +228,24 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S 228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S 229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S -230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S +230,0,3,"Lefebre, Miss Mathilde",female,,3,1,4133,25.4667,,S 231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S 232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S 233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S -234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S +234,1,3,"Asplund, Miss Lillian Gertrud",female,5,4,2,347077,31.3875,,S 235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S -236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +236,0,3,"Harknett, Miss Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S 237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S -238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S +238,1,2,"Collyer, Miss Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S 239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S 240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S -241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C -242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q +241,0,3,"Zabour, Miss Thamine",female,,1,0,2665,14.4542,,C +242,1,3,"Murphy, Miss Katherine ""Kate""",female,,1,0,367230,15.5,,Q 243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S 244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S 245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C 246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q -247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S +247,0,3,"Lindahl, Miss Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S 248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S 249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S 250,0,2,"Carter, Rev. 
Ernest Courtenay",male,54,1,0,244252,26,,S @@ -256,28 +256,28 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S 256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C 257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C -258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S -259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C +258,1,1,"Cherry, Miss Gladys",female,30,0,0,110152,86.5,B77,S +259,1,1,"Ward, Miss Anna",female,35,0,0,PC 17755,512.3292,,C 260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S 261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q -262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S +262,1,3,"Asplund, Master Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S 263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S 264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S -265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q +265,0,3,"Henry, Miss Delia",female,,0,0,382649,7.75,,Q 266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S 267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S 268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S 269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S -270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S +270,1,1,"Bissette, Miss Amelia",female,35,0,0,PC 17760,135.6333,C99,S 271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S 272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S 273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S 274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C -275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q -276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S -277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S +275,1,3,"Healy, Miss Hanora ""Nora""",female,,0,0,370375,7.75,,Q +276,1,1,"Andrews, Miss Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S +277,0,3,"Lindblom, Miss Augusta Charlotta",female,45,0,0,347073,7.75,,S 278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S -279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q +279,0,3,"Rice, Master Eric",male,7,4,1,382652,29.125,,Q 280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S 281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q 282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S @@ -288,66 +288,66 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S 288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S 289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S -290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q -291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S +290,1,3,"Connolly, Miss Kate",female,22,0,0,370373,7.75,,Q +291,1,1,"Barber, Miss Ellen ""Nellie""",female,26,0,0,19877,78.85,,S 292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C 293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C -294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S +294,0,3,"Haas, Miss Aloisia",female,24,0,0,349236,8.85,,S 295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S 296,0,1,"Lewy, Mr. 
Ervin G",male,,0,0,PC 17612,27.7208,,C 297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C -298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S +298,0,1,"Allison, Miss Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S 299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S 300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C -301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +301,1,3,"Kelly, Miss Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q 302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q 303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S -304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +304,1,2,"Keane, Miss Nora A",female,,0,0,226593,12.35,E101,Q 305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S -306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S -307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C +306,1,1,"Allison, Master Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +307,1,1,"Fleming, Miss Margaret",female,,0,0,17421,110.8833,,C 308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C 309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C -310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C -311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C -312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C +310,1,1,"Francatelli, Miss Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C +311,1,1,"Hays, Miss Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C +312,1,1,"Ryerson, Miss Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C 313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S 314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S 315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S -316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S +316,1,3,"Nilsson, Miss Helmina Josefina",female,26,0,0,347470,7.8542,,S 317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S 318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S -319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S +319,1,1,"Wick, Miss Mary Natalie",female,31,0,2,36928,164.8667,C7,S 320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C 321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S 322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S -323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q +323,1,2,"Slayter, Miss Hilda Mary",female,30,0,0,234818,12.35,,Q 324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S 325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S -326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C +326,1,1,"Young, Miss Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C 327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S 328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S 329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S -330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C -331,1,3,"McCoy, Miss. 
Agnes",female,,2,0,367226,23.25,,Q +330,1,1,"Hippach, Miss Jean Gertrude",female,16,0,1,111361,57.9792,B18,C +331,1,3,"McCoy, Miss Agnes",female,,2,0,367226,23.25,,Q 332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S 333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S 334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S 335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S 336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S 337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S -338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C +338,1,1,"Burns, Miss Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C 339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S 340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S -341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S -342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S +341,1,2,"Navratil, Master Edmond Roger",male,2,1,1,230080,26,F2,S +342,1,1,"Fortune, Miss Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S 343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S 344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S 345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S -346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S -347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S +346,1,2,"Brown, Miss Amelia ""Mildred""",female,24,0,0,248733,13,F33,S +347,1,2,"Smith, Miss Marion Elsie",female,40,0,0,31418,13,,S 348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S -349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S +349,1,3,"Coutts, Master William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S 350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S 351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S 352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S @@ -355,10 +355,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S 355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C 356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S -357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S -358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S -359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q -360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +357,1,1,"Bowerman, Miss Elsie Edith",female,22,0,1,113505,55,E33,S +358,0,2,"Funk, Miss Annie Clemmer",female,38,0,0,237671,13,,S +359,1,3,"McGovern, Miss Mary",female,,0,0,330931,7.8792,,Q +360,1,3,"Mockler, Miss Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q 361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S 362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C 363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C @@ -367,58 +367,58 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S 367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C 368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C -369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q +369,1,3,"Jermyn, Miss Annie",female,,0,0,14313,7.75,,Q 370,1,1,"Aubart, Mme. 
Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C 371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C 372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S 373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S 374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C -375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S +375,0,3,"Palsson, Miss Stina Viola",female,3,3,1,349909,21.075,,S 376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C -377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S +377,1,3,"Landergren, Miss Aurora Adelia",female,22,0,0,C 7077,7.25,,S 378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C 379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C 380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S -381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C -382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C +381,1,1,"Bidois, Miss Rosalie",female,42,0,0,PC 17757,227.525,,C +382,1,3,"Nakid, Miss Maria (""Mary"")",female,1,0,2,2653,15.7417,,C 383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S 384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S 385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S 386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S -387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S -388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S +387,0,3,"Goodwin, Master Sidney Leonard",male,1,5,2,CA 2144,46.9,,S +388,1,2,"Buss, Miss Kate",female,36,0,0,27849,13,,S 389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q -390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C +390,1,2,"Lehmann, Miss Bertha",female,17,0,0,SC 1748,12,,C 391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S 392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S 393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S -394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C +394,1,1,"Newell, Miss Marjorie",female,23,1,0,35273,113.275,D36,C 395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S 396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S -397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S +397,0,3,"Olsson, Miss Elina",female,31,0,0,350407,7.8542,,S 398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S 399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S 400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S 401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S 402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S -403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S +403,0,3,"Jussila, Miss Mari Aina",female,21,1,0,4137,9.825,,S 404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S -405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S +405,0,3,"Oreskovic, Miss Marija",female,20,0,0,315096,8.6625,,S 406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S 407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S -408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S +408,1,2,"Richards, Master William Rowe",male,3,1,1,29106,18.75,,S 409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S -410,0,3,"Lefebre, Miss. 
Ida",female,,3,1,4133,25.4667,,S +410,0,3,"Lefebre, Miss Ida",female,,3,1,4133,25.4667,,S 411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S 412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q -413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q +413,1,1,"Minahan, Miss Daisy E",female,33,1,0,19928,90,C78,Q 414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S 415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S 416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S 417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S -418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S +418,1,2,"Silven, Miss Lyyli Karoliina",female,18,0,2,250652,13,,S 419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S -420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S +420,0,3,"Van Impe, Miss Catharina",female,10,0,2,345773,24.15,,S 421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C 422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q 423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S @@ -426,7 +426,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S 426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S 427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S -428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S +428,1,2,"Phillips, Miss Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S 429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q 430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S 431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S @@ -434,8 +434,8 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S 434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S 435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S -436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S -437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S +436,1,1,"Carter, Miss Lucile Polk",female,14,1,2,113760,120,B96 B98,S +437,0,3,"Ford, Miss Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S 438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S 439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S 440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S @@ -444,10 +444,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S 444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S 445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S -446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S -447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S +446,1,1,"Dodge, Master Washington",male,4,0,2,33638,81.8583,A34,S +447,1,2,"Mellinger, Miss Madeleine Violet",female,13,0,1,250644,19.5,,S 448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S -449,1,3,"Baclini, Miss. 
Marie Catherine",female,5,2,1,2666,19.2583,,C +449,1,3,"Baclini, Miss Marie Catherine",female,5,2,1,2666,19.2583,,C 450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S 451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S 452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S @@ -457,7 +457,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C 457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S 458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S -459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S +459,1,2,"Toomey, Miss Ellen",female,50,0,0,F.C.C. 13531,10.5,,S 460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q 461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S 462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S @@ -468,42 +468,42 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S 468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S 469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q -470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C +470,1,3,"Baclini, Miss Helene Barbara",female,0.75,2,1,2666,19.2583,,C 471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S 472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S 473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S 474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C -475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S +475,0,3,"Strandberg, Miss Ida Sofia",female,22,0,0,7553,9.8375,,S 476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S 477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S 478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S 479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S -480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S -481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S +480,1,3,"Hirvonen, Miss Hildur E",female,2,0,1,3101298,12.2875,,S +481,0,3,"Goodwin, Master Harold Victor",male,9,5,2,CA 2144,46.9,,S 482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S 483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S 484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S 485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C -486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S +486,0,3,"Lefebre, Miss Jeannie",female,,3,1,4133,25.4667,,S 487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S 488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C 489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S -490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S +490,1,3,"Coutts, Master Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S 491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S 492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S 493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S 494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C 495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S 496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C -497,1,1,"Eustis, Miss. 
Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C +497,1,1,"Eustis, Miss Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C 498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S 499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S 500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S 501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S -502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q -503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q -504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S -505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S +502,0,3,"Canavan, Miss Mary",female,21,0,0,364846,7.75,,Q +503,0,3,"O'Sullivan, Miss Bridget Mary",female,,0,0,330909,7.6292,,Q +504,0,3,"Laitinen, Miss Kristina Sofia",female,37,0,0,4135,9.5875,,S +505,1,1,"Maioni, Miss Roberta",female,16,0,0,110152,86.5,B79,S 506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C 507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S 508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S @@ -519,41 +519,41 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q 519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S 520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S -521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S +521,1,1,"Perreault, Miss Anne",female,30,0,0,12749,93.5,B73,S 522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S 523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C 524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C 525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C 526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q -527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S +527,1,2,"Ridsdale, Miss Lucy",female,50,0,0,W./C. 14258,10.5,,S 528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S 529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S 530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S -531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S +531,1,2,"Quick, Miss Phyllis May",female,2,1,1,26360,26,,S 532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C 533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C 534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C -535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S -536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S +535,0,3,"Cacic, Miss Marija",female,30,0,0,315084,8.6625,,S +536,1,2,"Hart, Miss Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S 537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S -538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C +538,1,1,"LeRoy, Miss Bertha",female,30,0,0,PC 17761,106.425,,C 539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S -540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C -541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S -542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S -543,0,3,"Andersson, Miss. 
Sigrid Elisabeth",female,11,4,2,347082,31.275,,S +540,1,1,"Frolicher, Miss Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C +541,1,1,"Crosby, Miss Harriet R",female,36,0,2,WE/P 5735,71,B22,S +542,0,3,"Andersson, Miss Ingeborg Constanzia",female,9,4,2,347082,31.275,,S +543,0,3,"Andersson, Miss Sigrid Elisabeth",female,11,4,2,347082,31.275,,S 544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S 545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C 546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S 547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S 548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C 549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S -550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S +550,1,2,"Davies, Master John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S 551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C 552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S 553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q 554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C -555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S +555,1,3,"Ohman, Miss Velin",female,22,0,0,347085,7.775,,S 556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S 557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C 558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C @@ -563,7 +563,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S 563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S 564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S -565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +565,0,3,"Meanwell, Miss (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S 566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S 567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S 568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S @@ -572,19 +572,19 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S 572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S 573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S -574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +574,1,3,"Kelly, Miss Mary",female,,0,0,14312,7.75,,Q 575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S 576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S -577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +577,1,2,"Garside, Miss Ethel",female,34,0,0,243880,13,,S 578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S 579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C 580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S -581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +581,1,2,"Christy, Miss Julie Rachel",female,25,1,1,237789,30,,S 582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C 583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S 584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C 585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C -586,1,1,"Taussig, Miss. 
Ruth",female,18,0,2,110413,79.65,E68,S +586,1,1,"Taussig, Miss Ruth",female,18,0,2,110413,79.65,E68,S 587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S 588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C 589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S @@ -592,10 +592,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S 592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C 593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S -594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +594,0,3,"Bourke, Miss Mary",female,,0,2,364848,7.75,,Q 595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S 596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S -597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S +597,1,2,"Leitch, Miss Jessie Wills",female,,0,0,248727,33,,S 598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S 599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C 600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C @@ -608,16 +608,16 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S 608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S 609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C -610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +610,1,1,"Shutes, Miss Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S 611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S 612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S -613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +613,1,3,"Murphy, Miss Margaret Jane",female,,1,0,367230,15.5,,Q 614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q 615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S -616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +616,1,2,"Herman, Miss Alice",female,24,1,2,220845,65,,S 617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S 618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S -619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +619,1,2,"Becker, Miss Marion Louise",female,4,2,1,230136,39,F4,S 620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S 621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C 622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S @@ -626,34 +626,34 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S 626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S 627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q -628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +628,1,1,"Longley, Miss Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S 629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S 630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q 631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S 632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S 633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C 634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S -635,0,3,"Skoog, Miss. 
Mabel",female,9,3,2,347088,27.9,,S -636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +635,0,3,"Skoog, Miss Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss Mary",female,28,0,0,237668,13,,S 637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S 638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S 639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S 640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S 641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S 642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C -643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +643,0,3,"Skoog, Miss Margit Elizabeth",female,2,3,2,347088,27.9,,S 644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S -645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +645,1,3,"Baclini, Miss Eugenie",female,0.75,2,1,2666,19.2583,,C 646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C 647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S 648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C 649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S -650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S +650,1,3,"Stanley, Miss Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S 651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S -652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +652,1,2,"Doling, Miss Elsie",female,18,0,1,231919,23,,S 653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S -654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q -655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +654,1,3,"O'Leary, Miss Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss Hanora ""Nora""",female,18,0,0,365226,6.75,,Q 656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S 657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S 658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q @@ -676,10 +676,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S 676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S 677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S -678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +678,1,3,"Turja, Miss Anna Sofia",female,18,0,0,4138,9.8417,,S 679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S 680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C -681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +681,0,3,"Peters, Miss Katie",female,,0,0,330935,8.1375,,Q 682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C 683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S 684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S @@ -688,48 +688,48 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S 688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S 689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S -690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +690,1,1,"Madill, Miss Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S 691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S -692,1,3,"Karun, Miss. 
Manca",female,4,0,1,349256,13.4167,,C +692,1,3,"Karun, Miss Manca",female,4,0,1,349256,13.4167,,C 693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S 694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C 695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S 696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S 697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S -698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +698,1,3,"Mullens, Miss Katherine ""Katie""",female,,0,0,35852,7.7333,,Q 699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C 700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S 701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C 702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S -703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +703,0,3,"Barbara, Miss Saiide",female,18,0,1,2691,14.4542,,C 704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q 705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S 706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S 707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S 708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S -709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S -710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +709,1,1,"Cleaver, Miss Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C 711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C 712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S 713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S 714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S 715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S 716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S -717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C -718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +717,1,1,"Endres, Miss Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S 719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q 720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S -721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +721,1,2,"Harper, Miss Annie Jessie ""Nina""",female,6,0,1,248727,33,,S 722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S 723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S 724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S 725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S 726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S 727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S -728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +728,1,3,"Mannion, Miss Margareth",female,,0,0,36866,7.7375,,Q 729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S -730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S -731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +730,0,3,"Ilmakangas, Miss Pieta Sofia",female,25,1,0,STON/O2. 
3101271,7.925,,S +731,1,1,"Allen, Miss Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S 732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C 733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S 734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S @@ -741,20 +741,20 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S 741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S 742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S -743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +743,1,1,"Ryerson, Miss Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C 744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S 745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S 746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S 747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S -748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S +748,1,2,"Sinkkonen, Miss Anna",female,30,0,0,250648,13,,S 749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S 750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q -751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S -752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S +751,1,2,"Wells, Miss Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master Meier",male,6,0,1,392096,12.475,E121,S 753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S 754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S 755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S -756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S +756,1,2,"Hamalainen, Master Viljo",male,0.67,1,1,250649,14.5,,S 757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S 758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S 759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S @@ -766,7 +766,7 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S 766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S 767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C -768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +768,0,3,"Mangan, Miss Mary",female,30.5,0,0,364850,7.75,,Q 769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q 770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S 771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S @@ -776,22 +776,22 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S 776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S 777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q -778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S +778,1,3,"Emanuel, Miss Virginia Ethel",female,5,0,0,364516,12.475,,S 779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q 780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S -781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C +781,1,3,"Ayoub, Miss Banoura",female,13,0,0,2687,7.2292,,C 782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S 783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S 784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 
6607,23.45,,S 785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S 786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S -787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S -788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q -789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +787,1,3,"Sjoblom, Miss Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S 790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C 791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q 792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S -793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +793,0,3,"Sage, Miss Stella Anna",female,,8,2,CA. 2343,69.55,,S 794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C 795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S 796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S @@ -801,47 +801,47 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S 801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S 802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S -803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S -804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +803,1,1,"Carter, Master William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master Assad Alexander",male,0.42,0,1,2625,8.5167,,C 805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S 806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S 807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S -808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +808,0,3,"Pettersson, Miss Ellen Natalia",female,18,0,0,347087,7.775,,S 809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S 810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S 811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S 812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S 813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S -814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +814,0,3,"Andersson, Miss Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S 815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S 816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S -817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +817,0,3,"Heininen, Miss Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S 818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C 819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S -820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +820,0,3,"Skoog, Master Karl Thorsten",male,10,3,2,347088,27.9,,S 821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S 822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S 823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S 824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S -825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +825,0,3,"Panula, Master Urho Abraham",male,2,4,1,3101295,39.6875,,S 826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q 827,0,3,"Lam, Mr. 
Len",male,,0,0,1601,56.4958,,S -828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +828,1,2,"Mallet, Master Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C 829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q 830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, 831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C -832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +832,1,2,"Richards, Master George Sibley",male,0.83,1,1,29106,18.75,,S 833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C 834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S 835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S -836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +836,1,1,"Compton, Miss Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C 837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S 838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S 839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S 840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C 841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S 842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S -843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C +843,1,1,"Serepeca, Miss Augusta",female,30,0,0,113798,31,,C 844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C 845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S 846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S @@ -849,10 +849,10 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C 849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S 850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C -851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +851,0,3,"Andersson, Master Sigvard Harald Elias",male,4,4,2,347082,31.275,,S 852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S -853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C -854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +853,0,3,"Boulos, Miss Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss Mary Conover",female,16,0,1,PC 17592,39.4,D28,S 855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S 856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S 857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S @@ -862,31 +862,31 @@ PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S 862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S 863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S -864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +864,0,3,"Sage, Miss Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S 865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S 866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S -867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +867,1,2,"Duran y More, Miss Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C 868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S 869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S -870,1,3,"Johnson, Master. 
Harold Theodor",male,4,1,1,347742,11.1333,,S +870,1,3,"Johnson, Master Harold Theodor",male,4,1,1,347742,11.1333,,S 871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S 872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S 873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S 874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S 875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C -876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +876,1,3,"Najib, Miss Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C 877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S 878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S 879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S 880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C 881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S 882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S -883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +883,0,3,"Dahlberg, Miss Gerda Ulrika",female,22,0,0,7552,10.5167,,S 884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S 885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S 886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q 887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S -888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S -889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +888,1,1,"Graham, Miss Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S 890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C 891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q diff --git a/doc/make.py b/doc/make.py index c758c7fc84bbb..9542563dc037b 100755 --- a/doc/make.py +++ b/doc/make.py @@ -11,6 +11,7 @@ $ python make.py html $ python make.py latex """ + import argparse import csv import importlib @@ -45,12 +46,14 @@ def __init__( single_doc=None, verbosity=0, warnings_are_errors=False, + no_browser=False, ) -> None: self.num_jobs = num_jobs self.include_api = include_api self.whatsnew = whatsnew self.verbosity = verbosity self.warnings_are_errors = warnings_are_errors + self.no_browser = no_browser if single_doc: single_doc = self._process_single_doc(single_doc) @@ -100,7 +103,7 @@ def _process_single_doc(self, single_doc): ) @staticmethod - def _run_os(*args): + def _run_os(*args) -> None: """ Execute a command as a OS terminal. 
@@ -111,7 +114,7 @@ def _run_os(*args):
 
         Examples
         --------
-        >>> DocBuilder()._run_os('python', '--version')
+        >>> DocBuilder()._run_os("python", "--version")
         """
         subprocess.check_call(args, stdout=sys.stdout, stderr=sys.stderr)
 
@@ -123,14 +126,14 @@ def _sphinx_build(self, kind: str):
 
         Parameters
         ----------
-        kind : {'html', 'latex'}
+        kind : {'html', 'latex', 'linkcheck'}
 
         Examples
         --------
-        >>> DocBuilder(num_jobs=4)._sphinx_build('html')
+        >>> DocBuilder(num_jobs=4)._sphinx_build("html")
         """
-        if kind not in ("html", "latex"):
-            raise ValueError(f"kind must be html or latex, not {kind}")
+        if kind not in ("html", "latex", "linkcheck"):
+            raise ValueError(f"kind must be html, latex or linkcheck, not {kind}")
 
         cmd = ["sphinx-build", "-b", kind]
         if self.num_jobs:
@@ -147,7 +150,7 @@ def _sphinx_build(self, kind: str):
         ]
         return subprocess.call(cmd)
 
-    def _open_browser(self, single_doc_html):
+    def _open_browser(self, single_doc_html) -> None:
         """
         Open a browser tab showing single
         """
@@ -159,16 +162,16 @@ def _get_page_title(self, page):
         Open the rst file `page` and extract its title.
         """
         fname = os.path.join(SOURCE_PATH, f"{page}.rst")
-        option_parser = docutils.frontend.OptionParser(
-            components=(docutils.parsers.rst.Parser,)
+        doc = docutils.utils.new_document(
+            "",
+            docutils.frontend.get_default_settings(docutils.parsers.rst.Parser),
         )
-        doc = docutils.utils.new_document("", option_parser.get_default_values())
-        with open(fname) as f:
+        with open(fname, encoding="utf-8") as f:
             data = f.read()
 
         parser = docutils.parsers.rst.Parser()
         # do not generate any warning when parsing the rst
-        with open(os.devnull, "a") as f:
+        with open(os.devnull, "a", encoding="utf-8") as f:
             doc.reporter.stream = f
             parser.parse(data, doc)
 
@@ -181,12 +184,12 @@ def _get_page_title(self, page):
 
         return title.astext()
 
-    def _add_redirects(self):
+    def _add_redirects(self) -> None:
         """
         Create in the build directory an html file with a redirect,
         for every row in REDIRECTS_FILE.
         """
-        with open(REDIRECTS_FILE) as mapping_fd:
+        with open(REDIRECTS_FILE, encoding="utf-8") as mapping_fd:
             reader = csv.reader(mapping_fd)
             for row in reader:
                 if not row or row[0].strip().startswith("#"):
@@ -209,7 +212,7 @@ def _add_redirects(self):
                 # sphinx specific stuff
                 title = "this page"
 
-            with open(path, "w") as moved_page_fd:
+            with open(path, "w", encoding="utf-8") as moved_page_fd:
                 html = f"""\
@@ -231,14 +234,15 @@ def html(self):
         ret_code = self._sphinx_build("html")
         zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip")
         if os.path.exists(zip_fname):
-            os.remove(zip_fname)
+            os.remove(zip_fname)  # noqa: TID251
 
         if ret_code == 0:
             if self.single_doc_html is not None:
-                self._open_browser(self.single_doc_html)
+                if not self.no_browser:
+                    self._open_browser(self.single_doc_html)
             else:
                 self._add_redirects()
-        if self.whatsnew:
+        if self.whatsnew and not self.no_browser:
             self._open_browser(os.path.join("whatsnew", "index.html"))
 
         return ret_code
@@ -256,11 +260,9 @@ def latex(self, force=False):
             for i in range(3):
                 self._run_os("pdflatex", "-interaction=nonstopmode", "pandas.tex")
             raise SystemExit(
-                "You should check the file "
-                '"build/latex/pandas.pdf" for problems.'
+                'You should check the file "build/latex/pandas.pdf" for problems.'
             )
-        else:
-            self._run_os("make")
+        self._run_os("make")
         return ret_code
 
     def latex_forced(self):
@@ -270,25 +272,31 @@ def latex_forced(self):
         return self.latex(force=True)
 
     @staticmethod
-    def clean():
+    def clean() -> None:
        """
         Clean documentation generated files.
         """
         shutil.rmtree(BUILD_PATH, ignore_errors=True)
         shutil.rmtree(os.path.join(SOURCE_PATH, "reference", "api"), ignore_errors=True)
 
-    def zip_html(self):
+    def zip_html(self) -> None:
         """
         Compress HTML documentation into a zip file.
         """
         zip_fname = os.path.join(BUILD_PATH, "html", "pandas.zip")
         if os.path.exists(zip_fname):
-            os.remove(zip_fname)
+            os.remove(zip_fname)  # noqa: TID251
         dirname = os.path.join(BUILD_PATH, "html")
         fnames = os.listdir(dirname)
         os.chdir(dirname)
         self._run_os("zip", zip_fname, "-r", "-q", *fnames)
 
+    def linkcheck(self):
+        """
+        Check for broken links in the documentation.
+        """
+        return self._sphinx_build("linkcheck")
+
 
 def main():
     cmds = [method for method in dir(DocBuilder) if not method.startswith("_")]
@@ -322,7 +330,7 @@ def main():
         help=(
             "filename (relative to the 'source' folder) of section or method name to "
             "compile, e.g. 'development/contributing.rst', "
-            "'ecosystem.rst', 'pandas.DataFrame.join'"
+            "'pandas.DataFrame.join'"
         ),
     )
     argparser.add_argument(
@@ -334,8 +342,7 @@ def main():
         dest="verbosity",
         default=0,
         help=(
-            "increase verbosity (can be repeated), "
-            "passed to the sphinx build command"
+            "increase verbosity (can be repeated), passed to the sphinx build command"
         ),
     )
     argparser.add_argument(
@@ -344,6 +351,12 @@ def main():
         action="store_true",
         help="fail if warnings are raised",
     )
+    argparser.add_argument(
+        "--no-browser",
+        help="Don't open browser",
+        default=False,
+        action="store_true",
+    )
     args = argparser.parse_args()
 
     if args.command not in cmds:
@@ -369,6 +382,7 @@ def main():
         args.single,
         args.verbosity,
         args.warnings_are_errors,
+        args.no_browser,
     )
     return getattr(builder, args.command)()
diff --git a/doc/redirects.csv b/doc/redirects.csv
index 90ddf6c4dc582..c11e4e242f128 100644
--- a/doc/redirects.csv
+++ b/doc/redirects.csv
@@ -45,6 +45,7 @@ contributing_docstring,development/contributing_docstring
 developer,development/developer
 extending,development/extending
 internals,development/internals
+development/meeting,community
 
 # api moved function
 reference/api/pandas.io.json.json_normalize,pandas.json_normalize
@@ -99,8 +100,6 @@ generated/pandas.api.extensions.register_series_accessor,../reference/api/pandas
 generated/pandas.api.types.infer_dtype,../reference/api/pandas.api.types.infer_dtype
 generated/pandas.api.types.is_bool_dtype,../reference/api/pandas.api.types.is_bool_dtype
 generated/pandas.api.types.is_bool,../reference/api/pandas.api.types.is_bool
-generated/pandas.api.types.is_categorical_dtype,../reference/api/pandas.api.types.is_categorical_dtype
-generated/pandas.api.types.is_categorical,../reference/api/pandas.api.types.is_categorical
 generated/pandas.api.types.is_complex_dtype,../reference/api/pandas.api.types.is_complex_dtype
 generated/pandas.api.types.is_complex,../reference/api/pandas.api.types.is_complex
 generated/pandas.api.types.is_datetime64_any_dtype,../reference/api/pandas.api.types.is_datetime64_any_dtype
@@ -119,7 +118,6 @@ generated/pandas.api.types.is_int64_dtype,../reference/api/pandas.api.types.is_i
 generated/pandas.api.types.is_integer_dtype,../reference/api/pandas.api.types.is_integer_dtype
 generated/pandas.api.types.is_integer,../reference/api/pandas.api.types.is_integer
 generated/pandas.api.types.is_interval_dtype,../reference/api/pandas.api.types.is_interval_dtype
-generated/pandas.api.types.is_interval,../reference/api/pandas.api.types.is_interval
 generated/pandas.api.types.is_iterator,../reference/api/pandas.api.types.is_iterator
 generated/pandas.api.types.is_list_like,../reference/api/pandas.api.types.is_list_like
 generated/pandas.api.types.is_named_tuple,../reference/api/pandas.api.types.is_named_tuple
@@ -127,7 +125,6 @@ generated/pandas.api.types.is_number,../reference/api/pandas.api.types.is_number
 generated/pandas.api.types.is_numeric_dtype,../reference/api/pandas.api.types.is_numeric_dtype
 generated/pandas.api.types.is_object_dtype,../reference/api/pandas.api.types.is_object_dtype
 generated/pandas.api.types.is_period_dtype,../reference/api/pandas.api.types.is_period_dtype
-generated/pandas.api.types.is_period,../reference/api/pandas.api.types.is_period
 generated/pandas.api.types.is_re_compilable,../reference/api/pandas.api.types.is_re_compilable
 generated/pandas.api.types.is_re,../reference/api/pandas.api.types.is_re
 generated/pandas.api.types.is_scalar,../reference/api/pandas.api.types.is_scalar
@@ -185,7 +182,6 @@ generated/pandas.core.groupby.DataFrameGroupBy.filter,../reference/api/pandas.co
 generated/pandas.core.groupby.DataFrameGroupBy.hist,../reference/api/pandas.core.groupby.DataFrameGroupBy.hist
 generated/pandas.core.groupby.DataFrameGroupBy.idxmax,../reference/api/pandas.core.groupby.DataFrameGroupBy.idxmax
 generated/pandas.core.groupby.DataFrameGroupBy.idxmin,../reference/api/pandas.core.groupby.DataFrameGroupBy.idxmin
-generated/pandas.core.groupby.DataFrameGroupBy.mad,../reference/api/pandas.core.groupby.DataFrameGroupBy.mad
 generated/pandas.core.groupby.DataFrameGroupBy.pct_change,../reference/api/pandas.core.groupby.DataFrameGroupBy.pct_change
 generated/pandas.core.groupby.DataFrameGroupBy.plot,../reference/api/pandas.core.groupby.DataFrameGroupBy.plot
 generated/pandas.core.groupby.DataFrameGroupBy.quantile,../reference/api/pandas.core.groupby.DataFrameGroupBy.quantile
@@ -195,41 +191,39 @@ generated/pandas.core.groupby.DataFrameGroupBy.shift,../reference/api/pandas.cor
 generated/pandas.core.groupby.DataFrameGroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size
 generated/pandas.core.groupby.DataFrameGroupBy.skew,../reference/api/pandas.core.groupby.DataFrameGroupBy.skew
 generated/pandas.core.groupby.DataFrameGroupBy.take,../reference/api/pandas.core.groupby.DataFrameGroupBy.take
-generated/pandas.core.groupby.DataFrameGroupBy.tshift,../reference/api/pandas.core.groupby.DataFrameGroupBy.tshift
-generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.GroupBy.agg
-generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.GroupBy.aggregate
-generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.GroupBy.all
-generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.GroupBy.any
-generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.GroupBy.apply
-generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.GroupBy.bfill
-generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.GroupBy.count
-generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.GroupBy.cumcount
-generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.GroupBy.ffill
-generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.GroupBy.first
-generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.GroupBy.get_group
-generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.GroupBy.groups
-generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.GroupBy.head
-generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.GroupBy.indices
-generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.GroupBy.__iter__
-generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.GroupBy.last
-generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.GroupBy.max
-generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.GroupBy.mean
-generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.GroupBy.median
-generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.GroupBy.min
-generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.GroupBy.ngroup
-generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.GroupBy.nth
-generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.GroupBy.ohlc
-generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.GroupBy.pct_change
-generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.GroupBy.pipe
-generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.GroupBy.prod
-generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.GroupBy.rank
-generated/pandas.core.groupby.GroupBy.sem,../reference/api/pandas.core.groupby.GroupBy.sem
-generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.GroupBy.size
-generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.GroupBy.std
-generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.GroupBy.sum
-generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.GroupBy.tail
-generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.GroupBy.transform
-generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.GroupBy.var
+generated/pandas.core.groupby.GroupBy.agg,../reference/api/pandas.core.groupby.DataFrameGroupBy.agg
+generated/pandas.core.groupby.GroupBy.aggregate,../reference/api/pandas.core.groupby.DataFrameGroupBy.aggregate
+generated/pandas.core.groupby.GroupBy.all,../reference/api/pandas.core.groupby.DataFrameGroupBy.all
+generated/pandas.core.groupby.GroupBy.any,../reference/api/pandas.core.groupby.DataFrameGroupBy.any
+generated/pandas.core.groupby.GroupBy.apply,../reference/api/pandas.core.groupby.DataFrameGroupBy.apply
+generated/pandas.core.groupby.GroupBy.bfill,../reference/api/pandas.core.groupby.DataFrameGroupBy.bfill
+generated/pandas.core.groupby.GroupBy.count,../reference/api/pandas.core.groupby.DataFrameGroupBy.count
+generated/pandas.core.groupby.GroupBy.cumcount,../reference/api/pandas.core.groupby.DataFrameGroupBy.cumcount
+generated/pandas.core.groupby.GroupBy.ffill,../reference/api/pandas.core.groupby.DataFrameGroupBy.ffill
+generated/pandas.core.groupby.GroupBy.first,../reference/api/pandas.core.groupby.DataFrameGroupBy.first
+generated/pandas.core.groupby.GroupBy.get_group,../reference/api/pandas.core.groupby.DataFrameGroupBy.get_group
+generated/pandas.core.groupby.GroupBy.groups,../reference/api/pandas.core.groupby.DataFrameGroupBy.groups
+generated/pandas.core.groupby.GroupBy.head,../reference/api/pandas.core.groupby.DataFrameGroupBy.head
+generated/pandas.core.groupby.GroupBy.indices,../reference/api/pandas.core.groupby.DataFrameGroupBy.indices
+generated/pandas.core.groupby.GroupBy.__iter__,../reference/api/pandas.core.groupby.DataFrameGroupBy.__iter__
+generated/pandas.core.groupby.GroupBy.last,../reference/api/pandas.core.groupby.DataFrameGroupBy.last
+generated/pandas.core.groupby.GroupBy.max,../reference/api/pandas.core.groupby.DataFrameGroupBy.max
+generated/pandas.core.groupby.GroupBy.mean,../reference/api/pandas.core.groupby.DataFrameGroupBy.mean
+generated/pandas.core.groupby.GroupBy.median,../reference/api/pandas.core.groupby.DataFrameGroupBy.median
+generated/pandas.core.groupby.GroupBy.min,../reference/api/pandas.core.groupby.DataFrameGroupBy.min
+generated/pandas.core.groupby.GroupBy.ngroup,../reference/api/pandas.core.groupby.DataFrameGroupBy.ngroup
+generated/pandas.core.groupby.GroupBy.nth,../reference/api/pandas.core.groupby.DataFrameGroupBy.nth
+generated/pandas.core.groupby.GroupBy.ohlc,../reference/api/pandas.core.groupby.DataFrameGroupBy.ohlc
+generated/pandas.core.groupby.GroupBy.pct_change,../reference/api/pandas.core.groupby.DataFrameGroupBy.pct_change
+generated/pandas.core.groupby.GroupBy.pipe,../reference/api/pandas.core.groupby.DataFrameGroupBy.pipe
+generated/pandas.core.groupby.GroupBy.prod,../reference/api/pandas.core.groupby.DataFrameGroupBy.prod
+generated/pandas.core.groupby.GroupBy.rank,../reference/api/pandas.core.groupby.DataFrameGroupBy.rank
+generated/pandas.core.groupby.GroupBy.size,../reference/api/pandas.core.groupby.DataFrameGroupBy.size
+generated/pandas.core.groupby.GroupBy.std,../reference/api/pandas.core.groupby.DataFrameGroupBy.std
+generated/pandas.core.groupby.GroupBy.sum,../reference/api/pandas.core.groupby.DataFrameGroupBy.sum
+generated/pandas.core.groupby.GroupBy.tail,../reference/api/pandas.core.groupby.DataFrameGroupBy.tail
+generated/pandas.core.groupby.GroupBy.transform,../reference/api/pandas.core.groupby.DataFrameGroupBy.transform
+generated/pandas.core.groupby.GroupBy.var,../reference/api/pandas.core.groupby.DataFrameGroupBy.var
 generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing
 generated/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing,../reference/api/pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing
 generated/pandas.core.groupby.SeriesGroupBy.nlargest,../reference/api/pandas.core.groupby.SeriesGroupBy.nlargest
@@ -240,11 +234,10 @@ generated/pandas.core.groupby.SeriesGroupBy.value_counts,../reference/api/pandas
 generated/pandas.core.resample.Resampler.aggregate,../reference/api/pandas.core.resample.Resampler.aggregate
 generated/pandas.core.resample.Resampler.apply,../reference/api/pandas.core.resample.Resampler.apply
 generated/pandas.core.resample.Resampler.asfreq,../reference/api/pandas.core.resample.Resampler.asfreq
-generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.resample.Resampler.backfill
+generated/pandas.core.resample.Resampler.backfill,../reference/api/pandas.core.resample.Resampler.bfill
 generated/pandas.core.resample.Resampler.bfill,../reference/api/pandas.core.resample.Resampler.bfill
 generated/pandas.core.resample.Resampler.count,../reference/api/pandas.core.resample.Resampler.count
 generated/pandas.core.resample.Resampler.ffill,../reference/api/pandas.core.resample.Resampler.ffill
-generated/pandas.core.resample.Resampler.fillna,../reference/api/pandas.core.resample.Resampler.fillna
 generated/pandas.core.resample.Resampler.first,../reference/api/pandas.core.resample.Resampler.first
 generated/pandas.core.resample.Resampler.get_group,../reference/api/pandas.core.resample.Resampler.get_group
 generated/pandas.core.resample.Resampler.groups,../reference/api/pandas.core.resample.Resampler.groups
@@ -259,7 +252,6 @@ generated/pandas.core.resample.Resampler.min,../reference/api/pandas.core.resamp
 generated/pandas.core.resample.Resampler.nearest,../reference/api/pandas.core.resample.Resampler.nearest
 generated/pandas.core.resample.Resampler.nunique,../reference/api/pandas.core.resample.Resampler.nunique
 generated/pandas.core.resample.Resampler.ohlc,../reference/api/pandas.core.resample.Resampler.ohlc
-generated/pandas.core.resample.Resampler.pad,../reference/api/pandas.core.resample.Resampler.pad
 generated/pandas.core.resample.Resampler.pipe,../reference/api/pandas.core.resample.Resampler.pipe
 generated/pandas.core.resample.Resampler.prod,../reference/api/pandas.core.resample.Resampler.prod
 generated/pandas.core.resample.Resampler.quantile,../reference/api/pandas.core.resample.Resampler.quantile
@@ -317,9 +309,7 @@ generated/pandas.DataFrame.aggregate,../reference/api/pandas.DataFrame.aggregate
 generated/pandas.DataFrame.align,../reference/api/pandas.DataFrame.align
 generated/pandas.DataFrame.all,../reference/api/pandas.DataFrame.all
 generated/pandas.DataFrame.any,../reference/api/pandas.DataFrame.any
-generated/pandas.DataFrame.append,../reference/api/pandas.DataFrame.append
 generated/pandas.DataFrame.apply,../reference/api/pandas.DataFrame.apply
-generated/pandas.DataFrame.applymap,../reference/api/pandas.DataFrame.applymap
 generated/pandas.DataFrame.as_blocks,../reference/api/pandas.DataFrame.as_blocks
 generated/pandas.DataFrame.asfreq,../reference/api/pandas.DataFrame.asfreq
 generated/pandas.DataFrame.as_matrix,../reference/api/pandas.DataFrame.as_matrix
@@ -332,7 +322,6 @@ generated/pandas.DataFrame.axes,../reference/api/pandas.DataFrame.axes
 generated/pandas.DataFrame.between_time,../reference/api/pandas.DataFrame.between_time
 generated/pandas.DataFrame.bfill,../reference/api/pandas.DataFrame.bfill
 generated/pandas.DataFrame.blocks,../reference/api/pandas.DataFrame.blocks
-generated/pandas.DataFrame.bool,../reference/api/pandas.DataFrame.bool
 generated/pandas.DataFrame.boxplot,../reference/api/pandas.DataFrame.boxplot
 generated/pandas.DataFrame.clip,../reference/api/pandas.DataFrame.clip
 generated/pandas.DataFrame.clip_lower,../reference/api/pandas.DataFrame.clip_lower
@@ -400,10 +389,8 @@ generated/pandas.DataFrame.isna,../reference/api/pandas.DataFrame.isna
 generated/pandas.DataFrame.isnull,../reference/api/pandas.DataFrame.isnull
 generated/pandas.DataFrame.items,../reference/api/pandas.DataFrame.items
 generated/pandas.DataFrame.__iter__,../reference/api/pandas.DataFrame.__iter__
-generated/pandas.DataFrame.iteritems,../reference/api/pandas.DataFrame.iteritems
 generated/pandas.DataFrame.iterrows,../reference/api/pandas.DataFrame.iterrows
 generated/pandas.DataFrame.itertuples,../reference/api/pandas.DataFrame.itertuples
-generated/pandas.DataFrame.ix,../reference/api/pandas.DataFrame.ix
 generated/pandas.DataFrame.join,../reference/api/pandas.DataFrame.join
 generated/pandas.DataFrame.keys,../reference/api/pandas.DataFrame.keys
 generated/pandas.DataFrame.kurt,../reference/api/pandas.DataFrame.kurt
@@ -412,9 +399,7 @@ generated/pandas.DataFrame.last,../reference/api/pandas.DataFrame.last
 generated/pandas.DataFrame.last_valid_index,../reference/api/pandas.DataFrame.last_valid_index
 generated/pandas.DataFrame.le,../reference/api/pandas.DataFrame.le
 generated/pandas.DataFrame.loc,../reference/api/pandas.DataFrame.loc
-generated/pandas.DataFrame.lookup,../reference/api/pandas.DataFrame.lookup
 generated/pandas.DataFrame.lt,../reference/api/pandas.DataFrame.lt
-generated/pandas.DataFrame.mad,../reference/api/pandas.DataFrame.mad
 generated/pandas.DataFrame.mask,../reference/api/pandas.DataFrame.mask
 generated/pandas.DataFrame.max,../reference/api/pandas.DataFrame.max
 generated/pandas.DataFrame.mean,../reference/api/pandas.DataFrame.mean
@@ -486,7 +471,6 @@ generated/pandas.DataFrame.shape,../reference/api/pandas.DataFrame.shape
 generated/pandas.DataFrame.shift,../reference/api/pandas.DataFrame.shift
 generated/pandas.DataFrame.size,../reference/api/pandas.DataFrame.size
 generated/pandas.DataFrame.skew,../reference/api/pandas.DataFrame.skew
-generated/pandas.DataFrame.slice_shift,../reference/api/pandas.DataFrame.slice_shift
 generated/pandas.DataFrame.sort_index,../reference/api/pandas.DataFrame.sort_index
 generated/pandas.DataFrame.sort_values,../reference/api/pandas.DataFrame.sort_values
 generated/pandas.DataFrame.squeeze,../reference/api/pandas.DataFrame.squeeze
@@ -496,7 +480,6 @@ generated/pandas.DataFrame.style,../reference/api/pandas.DataFrame.style
 generated/pandas.DataFrame.sub,../reference/api/pandas.DataFrame.sub
 generated/pandas.DataFrame.subtract,../reference/api/pandas.DataFrame.subtract
 generated/pandas.DataFrame.sum,../reference/api/pandas.DataFrame.sum
-generated/pandas.DataFrame.swapaxes,../reference/api/pandas.DataFrame.swapaxes
 generated/pandas.DataFrame.swaplevel,../reference/api/pandas.DataFrame.swaplevel
 generated/pandas.DataFrame.tail,../reference/api/pandas.DataFrame.tail
 generated/pandas.DataFrame.take,../reference/api/pandas.DataFrame.take
@@ -507,7 +490,6 @@ generated/pandas.DataFrame.to_csv,../reference/api/pandas.DataFrame.to_csv
 generated/pandas.DataFrame.to_dict,../reference/api/pandas.DataFrame.to_dict
 generated/pandas.DataFrame.to_excel,../reference/api/pandas.DataFrame.to_excel
 generated/pandas.DataFrame.to_feather,../reference/api/pandas.DataFrame.to_feather
-generated/pandas.DataFrame.to_gbq,../reference/api/pandas.DataFrame.to_gbq
 generated/pandas.DataFrame.to_hdf,../reference/api/pandas.DataFrame.to_hdf
 generated/pandas.DataFrame.to,../reference/api/pandas.DataFrame.to
 generated/pandas.DataFrame.to_json,../reference/api/pandas.DataFrame.to_json
@@ -527,7 +509,6 @@ generated/pandas.DataFrame.transform,../reference/api/pandas.DataFrame.transform
 generated/pandas.DataFrame.transpose,../reference/api/pandas.DataFrame.transpose
 generated/pandas.DataFrame.truediv,../reference/api/pandas.DataFrame.truediv
 generated/pandas.DataFrame.truncate,../reference/api/pandas.DataFrame.truncate
-generated/pandas.DataFrame.tshift,../reference/api/pandas.DataFrame.tshift
 generated/pandas.DataFrame.tz_convert,../reference/api/pandas.DataFrame.tz_convert
 generated/pandas.DataFrame.tz_localize,../reference/api/pandas.DataFrame.tz_localize
 generated/pandas.DataFrame.unstack,../reference/api/pandas.DataFrame.unstack
@@ -574,7 +555,6 @@ generated/pandas.DatetimeIndex.strftime,../reference/api/pandas.DatetimeIndex.st
 generated/pandas.DatetimeIndex.time,../reference/api/pandas.DatetimeIndex.time
 generated/pandas.DatetimeIndex.timetz,../reference/api/pandas.DatetimeIndex.timetz
 generated/pandas.DatetimeIndex.to_frame,../reference/api/pandas.DatetimeIndex.to_frame
-generated/pandas.DatetimeIndex.to_perioddelta,../reference/api/pandas.DatetimeIndex.to_perioddelta
 generated/pandas.DatetimeIndex.to_period,../reference/api/pandas.DatetimeIndex.to_period
 generated/pandas.DatetimeIndex.to_pydatetime,../reference/api/pandas.DatetimeIndex.to_pydatetime
 generated/pandas.DatetimeIndex.to_series,../reference/api/pandas.DatetimeIndex.to_series
@@ -636,7 +616,6 @@ generated/pandas.Index.argmax,../reference/api/pandas.Index.argmax
 generated/pandas.Index.argmin,../reference/api/pandas.Index.argmin
 generated/pandas.Index.argsort,../reference/api/pandas.Index.argsort
 generated/pandas.Index.array,../reference/api/pandas.Index.array
-generated/pandas.Index.asi8,../reference/api/pandas.Index.asi8
 generated/pandas.Index.asof,../reference/api/pandas.Index.asof
 generated/pandas.Index.asof_locs,../reference/api/pandas.Index.asof_locs
 generated/pandas.Index.astype,../reference/api/pandas.Index.astype
@@ -661,7 +640,6 @@ generated/pandas.Index.get_indexer_non_unique,../reference/api/pandas.Index.get_
 generated/pandas.Index.get_level_values,../reference/api/pandas.Index.get_level_values
 generated/pandas.Index.get_loc,../reference/api/pandas.Index.get_loc
 generated/pandas.Index.get_slice_bound,../reference/api/pandas.Index.get_slice_bound
-generated/pandas.Index.get_value,../reference/api/pandas.Index.get_value
 generated/pandas.Index.groupby,../reference/api/pandas.Index.groupby
 generated/pandas.Index.has_duplicates,../reference/api/pandas.Index.has_duplicates
 generated/pandas.Index.hasnans,../reference/api/pandas.Index.hasnans
@@ -671,7 +649,6 @@ generated/pandas.Index.identical,../reference/api/pandas.Index.identical
 generated/pandas.Index.inferred_type,../reference/api/pandas.Index.inferred_type
 generated/pandas.Index.insert,../reference/api/pandas.Index.insert
 generated/pandas.Index.intersection,../reference/api/pandas.Index.intersection
-generated/pandas.Index.is_all_dates,../reference/api/pandas.Index.is_all_dates
 generated/pandas.Index.is_boolean,../reference/api/pandas.Index.is_boolean
 generated/pandas.Index.is_categorical,../reference/api/pandas.Index.is_categorical
 generated/pandas.Index.is_floating,../reference/api/pandas.Index.is_floating
@@ -680,15 +657,12 @@ generated/pandas.Index.isin,../reference/api/pandas.Index.isin
 generated/pandas.Index.is_integer,../reference/api/pandas.Index.is_integer
 generated/pandas.Index.is_interval,../reference/api/pandas.Index.is_interval
 generated/pandas.Index.is_lexsorted_for_tuple,../reference/api/pandas.Index.is_lexsorted_for_tuple
-generated/pandas.Index.is_mixed,../reference/api/pandas.Index.is_mixed
 generated/pandas.Index.is_monotonic_decreasing,../reference/api/pandas.Index.is_monotonic_decreasing
-generated/pandas.Index.is_monotonic,../reference/api/pandas.Index.is_monotonic
 generated/pandas.Index.is_monotonic_increasing,../reference/api/pandas.Index.is_monotonic_increasing
 generated/pandas.Index.isna,../reference/api/pandas.Index.isna
 generated/pandas.Index.isnull,../reference/api/pandas.Index.isnull
 generated/pandas.Index.is_numeric,../reference/api/pandas.Index.is_numeric
 generated/pandas.Index.is_object,../reference/api/pandas.Index.is_object
-generated/pandas.Index.is_type_compatible,../reference/api/pandas.Index.is_type_compatible
 generated/pandas.Index.is_unique,../reference/api/pandas.Index.is_unique
 generated/pandas.Index.item,../reference/api/pandas.Index.item
 generated/pandas.Index.join,../reference/api/pandas.Index.join
@@ -711,7 +685,6 @@ generated/pandas.Index.rename,../reference/api/pandas.Index.rename
generated/pandas.Index.repeat,../reference/api/pandas.Index.repeat generated/pandas.Index.searchsorted,../reference/api/pandas.Index.searchsorted generated/pandas.Index.set_names,../reference/api/pandas.Index.set_names -generated/pandas.Index.set_value,../reference/api/pandas.Index.set_value generated/pandas.Index.shape,../reference/api/pandas.Index.shape generated/pandas.Index.shift,../reference/api/pandas.Index.shift generated/pandas.Index.size,../reference/api/pandas.Index.size @@ -726,11 +699,10 @@ generated/pandas.Index.summary,../reference/api/pandas.Index.summary generated/pandas.Index.symmetric_difference,../reference/api/pandas.Index.symmetric_difference generated/pandas.Index.take,../reference/api/pandas.Index.take generated/pandas.Index.T,../reference/api/pandas.Index.T -generated/pandas.Index.to_flat_index,../reference/api/pandas.Index.to_flat_index +generated/pandas.Index.to_flat_index,../reference/api/pandas.MultiIndex.to_flat_index generated/pandas.Index.to_frame,../reference/api/pandas.Index.to_frame generated/pandas.Index.to_list,../reference/api/pandas.Index.to_list generated/pandas.Index.tolist,../reference/api/pandas.Index.tolist -generated/pandas.Index.to_native_types,../reference/api/pandas.Index.to_native_types generated/pandas.Index.to_numpy,../reference/api/pandas.Index.to_numpy generated/pandas.Index.to_series,../reference/api/pandas.Index.to_series generated/pandas.Index.transpose,../reference/api/pandas.Index.transpose @@ -741,11 +713,11 @@ generated/pandas.Index.values,../reference/api/pandas.Index.values generated/pandas.Index.view,../reference/api/pandas.Index.view generated/pandas.Index.where,../reference/api/pandas.Index.where generated/pandas.infer_freq,../reference/api/pandas.infer_freq -generated/pandas.Interval.inclusive,../reference/api/pandas.Interval.inclusive +generated/pandas.Interval.closed,../reference/api/pandas.Interval.closed generated/pandas.Interval.closed_left,../reference/api/pandas.Interval.closed_left generated/pandas.Interval.closed_right,../reference/api/pandas.Interval.closed_right generated/pandas.Interval,../reference/api/pandas.Interval -generated/pandas.IntervalIndex.inclusive,../reference/api/pandas.IntervalIndex.inclusive +generated/pandas.IntervalIndex.closed,../reference/api/pandas.IntervalIndex.closed generated/pandas.IntervalIndex.contains,../reference/api/pandas.IntervalIndex.contains generated/pandas.IntervalIndex.from_arrays,../reference/api/pandas.IntervalIndex.from_arrays generated/pandas.IntervalIndex.from_breaks,../reference/api/pandas.IntervalIndex.from_breaks @@ -761,7 +733,6 @@ generated/pandas.IntervalIndex.mid,../reference/api/pandas.IntervalIndex.mid generated/pandas.IntervalIndex.overlaps,../reference/api/pandas.IntervalIndex.overlaps generated/pandas.IntervalIndex.right,../reference/api/pandas.IntervalIndex.right generated/pandas.IntervalIndex.set_closed,../reference/api/pandas.IntervalIndex.set_closed -generated/pandas.IntervalIndex.set_inclusive,../reference/api/pandas.IntervalIndex.set_inclusive generated/pandas.IntervalIndex.to_tuples,../reference/api/pandas.IntervalIndex.to_tuples generated/pandas.IntervalIndex.values,../reference/api/pandas.IntervalIndex.values generated/pandas.Interval.left,../reference/api/pandas.Interval.left @@ -773,7 +744,6 @@ generated/pandas.Interval.overlaps,../reference/api/pandas.Interval.overlaps generated/pandas.interval_range,../reference/api/pandas.interval_range generated/pandas.Interval.right,../reference/api/pandas.Interval.right 
generated/pandas.io.formats.style.Styler.apply,../reference/api/pandas.io.formats.style.Styler.apply -generated/pandas.io.formats.style.Styler.applymap,../reference/api/pandas.io.formats.style.Styler.applymap generated/pandas.io.formats.style.Styler.background_gradient,../reference/api/pandas.io.formats.style.Styler.background_gradient generated/pandas.io.formats.style.Styler.bar,../reference/api/pandas.io.formats.style.Styler.bar generated/pandas.io.formats.style.Styler.clear,../reference/api/pandas.io.formats.style.Styler.clear @@ -912,7 +882,6 @@ generated/pandas.read_csv,../reference/api/pandas.read_csv generated/pandas.read_excel,../reference/api/pandas.read_excel generated/pandas.read_feather,../reference/api/pandas.read_feather generated/pandas.read_fwf,../reference/api/pandas.read_fwf -generated/pandas.read_gbq,../reference/api/pandas.read_gbq generated/pandas.read_hdf,../reference/api/pandas.read_hdf generated/pandas.read,../reference/api/pandas.read generated/pandas.read_json,../reference/api/pandas.read_json @@ -934,7 +903,6 @@ generated/pandas.Series.aggregate,../reference/api/pandas.Series.aggregate generated/pandas.Series.align,../reference/api/pandas.Series.align generated/pandas.Series.all,../reference/api/pandas.Series.all generated/pandas.Series.any,../reference/api/pandas.Series.any -generated/pandas.Series.append,../reference/api/pandas.Series.append generated/pandas.Series.apply,../reference/api/pandas.Series.apply generated/pandas.Series.argmax,../reference/api/pandas.Series.argmax generated/pandas.Series.argmin,../reference/api/pandas.Series.argmin @@ -955,7 +923,6 @@ generated/pandas.Series.between,../reference/api/pandas.Series.between generated/pandas.Series.between_time,../reference/api/pandas.Series.between_time generated/pandas.Series.bfill,../reference/api/pandas.Series.bfill generated/pandas.Series.blocks,../reference/api/pandas.Series.blocks -generated/pandas.Series.bool,../reference/api/pandas.Series.bool generated/pandas.Series.cat.add_categories,../reference/api/pandas.Series.cat.add_categories generated/pandas.Series.cat.as_ordered,../reference/api/pandas.Series.cat.as_ordered generated/pandas.Series.cat.as_unordered,../reference/api/pandas.Series.cat.as_unordered @@ -1083,7 +1050,6 @@ generated/pandas.Series.interpolate,../reference/api/pandas.Series.interpolate generated/pandas.Series.is_copy,../reference/api/pandas.Series.is_copy generated/pandas.Series.isin,../reference/api/pandas.Series.isin generated/pandas.Series.is_monotonic_decreasing,../reference/api/pandas.Series.is_monotonic_decreasing -generated/pandas.Series.is_monotonic,../reference/api/pandas.Series.is_monotonic generated/pandas.Series.is_monotonic_increasing,../reference/api/pandas.Series.is_monotonic_increasing generated/pandas.Series.isna,../reference/api/pandas.Series.isna generated/pandas.Series.isnull,../reference/api/pandas.Series.isnull @@ -1091,8 +1057,6 @@ generated/pandas.Series.is_unique,../reference/api/pandas.Series.is_unique generated/pandas.Series.item,../reference/api/pandas.Series.item generated/pandas.Series.items,../reference/api/pandas.Series.items generated/pandas.Series.__iter__,../reference/api/pandas.Series.__iter__ -generated/pandas.Series.iteritems,../reference/api/pandas.Series.iteritems -generated/pandas.Series.ix,../reference/api/pandas.Series.ix generated/pandas.Series.keys,../reference/api/pandas.Series.keys generated/pandas.Series.kurt,../reference/api/pandas.Series.kurt generated/pandas.Series.kurtosis,../reference/api/pandas.Series.kurtosis @@ -1101,7 
+1065,6 @@ generated/pandas.Series.last_valid_index,../reference/api/pandas.Series.last_val generated/pandas.Series.le,../reference/api/pandas.Series.le generated/pandas.Series.loc,../reference/api/pandas.Series.loc generated/pandas.Series.lt,../reference/api/pandas.Series.lt -generated/pandas.Series.mad,../reference/api/pandas.Series.mad generated/pandas.Series.map,../reference/api/pandas.Series.map generated/pandas.Series.mask,../reference/api/pandas.Series.mask generated/pandas.Series.max,../reference/api/pandas.Series.max @@ -1143,7 +1106,6 @@ generated/pandas.Series.ptp,../reference/api/pandas.Series.ptp generated/pandas.Series.quantile,../reference/api/pandas.Series.quantile generated/pandas.Series.radd,../reference/api/pandas.Series.radd generated/pandas.Series.rank,../reference/api/pandas.Series.rank -generated/pandas.Series.ravel,../reference/api/pandas.Series.ravel generated/pandas.Series.rdiv,../reference/api/pandas.Series.rdiv generated/pandas.Series.rdivmod,../reference/api/pandas.Series.rdivmod generated/pandas.Series.real,../reference/api/pandas.Series.real @@ -1175,7 +1137,6 @@ generated/pandas.Series.shape,../reference/api/pandas.Series.shape generated/pandas.Series.shift,../reference/api/pandas.Series.shift generated/pandas.Series.size,../reference/api/pandas.Series.size generated/pandas.Series.skew,../reference/api/pandas.Series.skew -generated/pandas.Series.slice_shift,../reference/api/pandas.Series.slice_shift generated/pandas.Series.sort_index,../reference/api/pandas.Series.sort_index generated/pandas.Series.sort_values,../reference/api/pandas.Series.sort_values generated/pandas.Series.sparse.density,../reference/api/pandas.Series.sparse.density @@ -1243,7 +1204,6 @@ generated/pandas.Series.str.zfill,../reference/api/pandas.Series.str.zfill generated/pandas.Series.sub,../reference/api/pandas.Series.sub generated/pandas.Series.subtract,../reference/api/pandas.Series.subtract generated/pandas.Series.sum,../reference/api/pandas.Series.sum -generated/pandas.Series.swapaxes,../reference/api/pandas.Series.swapaxes generated/pandas.Series.swaplevel,../reference/api/pandas.Series.swaplevel generated/pandas.Series.tail,../reference/api/pandas.Series.tail generated/pandas.Series.take,../reference/api/pandas.Series.take @@ -1270,7 +1230,6 @@ generated/pandas.Series.transform,../reference/api/pandas.Series.transform generated/pandas.Series.transpose,../reference/api/pandas.Series.transpose generated/pandas.Series.truediv,../reference/api/pandas.Series.truediv generated/pandas.Series.truncate,../reference/api/pandas.Series.truncate -generated/pandas.Series.tshift,../reference/api/pandas.Series.tshift generated/pandas.Series.tz_convert,../reference/api/pandas.Series.tz_convert generated/pandas.Series.tz_localize,../reference/api/pandas.Series.tz_localize generated/pandas.Series.unique,../reference/api/pandas.Series.unique @@ -1280,7 +1239,6 @@ generated/pandas.Series.valid,../reference/api/pandas.Series.valid generated/pandas.Series.value_counts,../reference/api/pandas.Series.value_counts generated/pandas.Series.values,../reference/api/pandas.Series.values generated/pandas.Series.var,../reference/api/pandas.Series.var -generated/pandas.Series.view,../reference/api/pandas.Series.view generated/pandas.Series.where,../reference/api/pandas.Series.where generated/pandas.Series.xs,../reference/api/pandas.Series.xs generated/pandas.set_option,../reference/api/pandas.set_option @@ -1344,8 +1302,6 @@ generated/pandas.Timestamp.daysinmonth,../reference/api/pandas.Timestamp.daysinm 
generated/pandas.Timestamp.dst,../reference/api/pandas.Timestamp.dst generated/pandas.Timestamp.floor,../reference/api/pandas.Timestamp.floor generated/pandas.Timestamp.fold,../reference/api/pandas.Timestamp.fold -generated/pandas.Timestamp.freq,../reference/api/pandas.Timestamp.freq -generated/pandas.Timestamp.freqstr,../reference/api/pandas.Timestamp.freqstr generated/pandas.Timestamp.fromisoformat,../reference/api/pandas.Timestamp.fromisoformat generated/pandas.Timestamp.fromordinal,../reference/api/pandas.Timestamp.fromordinal generated/pandas.Timestamp.fromtimestamp,../reference/api/pandas.Timestamp.fromtimestamp @@ -1413,3 +1369,66 @@ generated/pandas.wide_to_long,../reference/api/pandas.wide_to_long # Cached searches reference/api/pandas.DataFrame.from_csv,pandas.read_csv + +# GroupBy -> DataFrameGroupBy +reference/api/pandas.core.groupby.GroupBy.__iter__,pandas.core.groupby.DataFrameGroupBy.__iter__ +reference/api/pandas.core.groupby.GroupBy.agg,pandas.core.groupby.DataFrameGroupBy.agg +reference/api/pandas.core.groupby.GroupBy.aggregate,pandas.core.groupby.DataFrameGroupBy.aggregate +reference/api/pandas.core.groupby.GroupBy.all,pandas.core.groupby.DataFrameGroupBy.all +reference/api/pandas.core.groupby.GroupBy.any,pandas.core.groupby.DataFrameGroupBy.any +reference/api/pandas.core.groupby.GroupBy.apply,pandas.core.groupby.DataFrameGroupBy.apply +reference/api/pandas.core.groupby.GroupBy.bfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.groupby.GroupBy.count,pandas.core.groupby.DataFrameGroupBy.count +reference/api/pandas.core.groupby.GroupBy.cumcount,pandas.core.groupby.DataFrameGroupBy.cumcount +reference/api/pandas.core.groupby.GroupBy.cummax,pandas.core.groupby.DataFrameGroupBy.cummax +reference/api/pandas.core.groupby.GroupBy.cummin,pandas.core.groupby.DataFrameGroupBy.cummin +reference/api/pandas.core.groupby.GroupBy.cumprod,pandas.core.groupby.DataFrameGroupBy.cumprod +reference/api/pandas.core.groupby.GroupBy.cumsum,pandas.core.groupby.DataFrameGroupBy.cumsum +reference/api/pandas.core.groupby.GroupBy.ffill,pandas.core.groupby.DataFrameGroupBy.ffill +reference/api/pandas.core.groupby.GroupBy.first,pandas.core.groupby.DataFrameGroupBy.first +reference/api/pandas.core.groupby.GroupBy.get_group,pandas.core.groupby.DataFrameGroupBy.get_group +reference/api/pandas.core.groupby.GroupBy.groups,pandas.core.groupby.DataFrameGroupBy.groups +reference/api/pandas.core.groupby.GroupBy.head,pandas.core.groupby.DataFrameGroupBy.head +reference/api/pandas.core.groupby.GroupBy.indices,pandas.core.groupby.DataFrameGroupBy.indices +reference/api/pandas.core.groupby.GroupBy.last,pandas.core.groupby.DataFrameGroupBy.last +reference/api/pandas.core.groupby.GroupBy.max,pandas.core.groupby.DataFrameGroupBy.max +reference/api/pandas.core.groupby.GroupBy.mean,pandas.core.groupby.DataFrameGroupBy.mean +reference/api/pandas.core.groupby.GroupBy.median,pandas.core.groupby.DataFrameGroupBy.median +reference/api/pandas.core.groupby.GroupBy.min,pandas.core.groupby.DataFrameGroupBy.min +reference/api/pandas.core.groupby.GroupBy.ngroup,pandas.core.groupby.DataFrameGroupBy.ngroup +reference/api/pandas.core.groupby.GroupBy.nth,pandas.core.groupby.DataFrameGroupBy.nth +reference/api/pandas.core.groupby.GroupBy.ohlc,pandas.core.groupby.DataFrameGroupBy.ohlc +reference/api/pandas.core.groupby.GroupBy.pct_change,pandas.core.groupby.DataFrameGroupBy.pct_change +reference/api/pandas.core.groupby.GroupBy.pipe,pandas.core.groupby.DataFrameGroupBy.pipe 
+reference/api/pandas.core.groupby.GroupBy.prod,pandas.core.groupby.DataFrameGroupBy.prod +reference/api/pandas.core.groupby.GroupBy.rank,pandas.core.groupby.DataFrameGroupBy.rank +reference/api/pandas.core.groupby.GroupBy.sem,pandas.core.groupby.DataFrameGroupBy.sem +reference/api/pandas.core.groupby.GroupBy.size,pandas.core.groupby.DataFrameGroupBy.size +reference/api/pandas.core.groupby.GroupBy.std,pandas.core.groupby.DataFrameGroupBy.std +reference/api/pandas.core.groupby.GroupBy.sum,pandas.core.groupby.DataFrameGroupBy.sum +reference/api/pandas.core.groupby.GroupBy.tail,pandas.core.groupby.DataFrameGroupBy.tail +reference/api/pandas.core.groupby.GroupBy.transform,pandas.core.groupby.DataFrameGroupBy.transform +reference/api/pandas.core.groupby.GroupBy.var,pandas.core.groupby.DataFrameGroupBy.var + +# Renamed or alias doc page was removed +reference/api/pandas.DataFrame.subtract,pandas.DataFrame.sub +reference/api/pandas.DataFrame.multiply,pandas.DataFrame.mul +reference/api/pandas.DataFrame.divide,pandas.DataFrame.div +reference/api/pandas.Series.subtract,pandas.Series.sub +reference/api/pandas.Series.multiply,pandas.Series.mul +reference/api/pandas.Series.divide,pandas.Series.div +reference/api/pandas.Series.tolist,pandas.Series.to_list +reference/api/pandas.Series.transpose,pandas.Series.T +reference/api/pandas.Index.transpose,pandas.Index.T +reference/api/pandas.Index.notnull,pandas.Index.notna +reference/api/pandas.Index.tolist,pandas.Index.to_list +reference/api/pandas.core.groupby.DataFrameGroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.groupby.GroupBy.backfill,pandas.core.groupby.DataFrameGroupBy.bfill +reference/api/pandas.core.resample.Resampler.backfill,pandas.core.resample.Resampler.bfill + +# EWM -> ExponentialMovingWindow +reference/api/pandas.core.window.ewm.EWM.corr,pandas.core.window.ewm.ExponentialMovingWindow.corr +reference/api/pandas.core.window.ewm.EWM.cov,pandas.core.window.ewm.ExponentialMovingWindow.cov +reference/api/pandas.core.window.ewm.EWM.mean,pandas.core.window.ewm.ExponentialMovingWindow.mean +reference/api/pandas.core.window.ewm.EWM.std,pandas.core.window.ewm.ExponentialMovingWindow.std +reference/api/pandas.core.window.ewm.EWM.var,pandas.core.window.ewm.ExponentialMovingWindow.var diff --git a/doc/scripts/eval_performance.py b/doc/scripts/eval_performance.py new file mode 100644 index 0000000000000..0383d4d598d55 --- /dev/null +++ b/doc/scripts/eval_performance.py @@ -0,0 +1,108 @@ +from timeit import repeat as timeit + +import numpy as np +import seaborn as sns + +from pandas import DataFrame + +setup_common = """from pandas import DataFrame +import numpy as np +df = DataFrame(np.random.randn(%d, 3), columns=list('abc')) +%s""" + +setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" + + +def bench_with(n, times=10, repeat=3, engine="numexpr"): + return ( + np.array( + timeit( + f"df.eval(s, engine={engine!r})", + setup=setup_common % (n, setup_with), + repeat=repeat, + number=times, + ) + ) + / times + ) + + +setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'" + + +def bench_subset(n, times=20, repeat=3, engine="numexpr"): + return ( + np.array( + timeit( + f"df.query(s, engine={engine!r})", + setup=setup_common % (n, setup_subset), + repeat=repeat, + number=times, + ) + ) + / times + ) + + +def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False): + r = np.logspace(mn, mx, num=num).round().astype(int) + + ev = DataFrame(np.empty((num, len(engines))), columns=engines) 
+ qu = ev.copy(deep=True) + + ev["size"] = qu["size"] = r + + for engine in engines: + for i, n in enumerate(r): + if verbose & (i % 10 == 0): + print(f"engine: {engine!r}, i == {i:d}") + ev_times = bench_with(n, times=1, repeat=1, engine=engine) + ev.loc[i, engine] = np.mean(ev_times) + qu_times = bench_subset(n, times=1, repeat=1, engine=engine) + qu.loc[i, engine] = np.mean(qu_times) + + return ev, qu + + +def plot_perf(df, engines, title, filename=None) -> None: + from matplotlib.pyplot import figure + + sns.set() + sns.set_palette("Set2") + + fig = figure(figsize=(4, 3), dpi=120) + ax = fig.add_subplot(111) + + for engine in engines: + ax.loglog(df["size"], df[engine], label=engine, lw=2) + + ax.set_xlabel("Number of Rows") + ax.set_ylabel("Time (s)") + ax.set_title(title) + ax.legend(loc="best") + ax.tick_params(top=False, right=False) + + fig.tight_layout() + + if filename is not None: + fig.savefig(filename) + + +if __name__ == "__main__": + import os + + pandas_dir = os.path.dirname( + os.path.dirname(os.path.abspath(os.path.dirname(__file__))) + ) + static_path = os.path.join(pandas_dir, "doc", "source", "_static") + + join = lambda p: os.path.join(static_path, p) + + fn = join("eval-query-perf-data.h5") + + engines = "python", "numexpr" + + ev, qu = bench(verbose=True) # only this one + + plot_perf(ev, engines, "DataFrame.eval()", filename=join("eval-perf.png")) + plot_perf(qu, engines, "DataFrame.query()", filename=join("query-perf.png")) diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css index 84eafa308175c..b02311eb66080 100644 --- a/doc/source/_static/css/getting_started.css +++ b/doc/source/_static/css/getting_started.css @@ -10,6 +10,14 @@ font-size: 0.9rem; } +.gs-data-header { + background-color: var(--pst-color-on-surface); +} + +.gs-data-list { + background-color: var(--pst-color-on-background); +} + .gs-data-title .badge { margin: 10px; padding: 5px; @@ -57,45 +65,33 @@ margin-top: -5px; } .gs-callout-remember { - border-left-color: #f0ad4e; + border-left-color: var(--pst-color-secondary); align-items: center; font-size: 1.2rem; } .gs-callout-remember h4 { - color: #f0ad4e; + color: var(--pst-color-secondary); } /* reference to user guide */ .gs-torefguide { align-items: center; font-size: 0.9rem; + background-color: var(--pst-color-on-background); + border-radius: .25rem; + padding: 2px; } .gs-torefguide .badge { - background-color: #130654; - margin: 10px 10px 10px 0px; + background-color: var(--pst-color-primary); + margin: 10px 10px 10px 10px; padding: 5px; } -.gs-torefguide a { - margin-left: 5px; - color: #130654; - border-bottom: 1px solid #FFCA00f3; - box-shadow: 0px -10px 0px #FFCA00f3 inset; -} - .gs-torefguide p { margin-top: 1rem; } -.gs-torefguide a:hover { - margin-left: 5px; - color: grey; - text-decoration: none; - border-bottom: 1px solid #b2ff80f3; - box-shadow: 0px -10px 0px #b2ff80f3 inset; -} - /* question-task environment */ ul.task-bullet, ol.custom-bullet{ @@ -113,14 +109,14 @@ ul.task-bullet > li:before { margin-left:-2em; background-position:center; background-repeat:no-repeat; - background-color: #130654; + background-color: var(--pst-color-primary); border-radius: 50%; background-size:100%; background-image:url('../question_mark_noback.svg'); } ul.task-bullet > li { - border-left: 1px solid #130654; + border-left: 1px solid var(--pst-color-primary); padding-left:1em; } @@ -132,7 +128,7 @@ ul.task-bullet > li > p:first-child { /* Getting started index page */ .comparison-card { - 
background:#FFF; + background-color: var(--pst-color-background); border-radius:0; padding: 30px 10px 10px 10px; margin: 10px 0px; @@ -142,19 +138,26 @@ ul.task-bullet > li > p:first-child { margin: 0px; } -.comparison-card .card-img-top { +.comparison-card .sd-card-img-top { margin: 10px; margin-bottom: 20px; - height: 72px; + height: 52px; + background: none !important; } -.comparison-card-excel .card-img-top, .comparison-card-stata .card-img-top, .comparison-card-sas .card-img-top { - height: 52px; +.comparison-card .sd-btn-secondary { + background-color: #6c757d !important; + border-color: #6c757d !important; +} + +.comparison-card .sd-btn-secondary:hover { + background-color: #5a6268 !important; + border-color: #545b62 !important; } .comparison-card .card-footer { border: none; - background-color:white; + background-color: var(--pst-color-background); } .install-block { @@ -163,25 +166,34 @@ ul.task-bullet > li > p:first-child { .install-card .card-header { border: none; - background-color:white; + background-color: transparent; padding: 1rem 1rem 0rem 1rem; } .install-card .card-header p.card-text { - color: #150458; font-size: 1.1rem; font-weight: bold; } .install-card .card-footer { border: none; - background-color:white; + background-color: transparent; } .install-card pre { margin: 0 1em 1em 1em; } +.install-card .sd-btn-secondary { + background-color: #6c757d !important; + border-color: #6c757d !important; +} + +.install-card .sd-btn-secondary:hover { + background-color: #5a6268 !important; + border-color: #545b62 !important; +} + .custom-button { background-color:#DCDCDC; border: none; @@ -236,16 +248,18 @@ ul.task-bullet > li > p:first-child { } .tutorial-card .card-header { + --bs-card-cap-color: var(--pst-color-text-base); cursor: pointer; - background-color: white; + background-color: var(--pst-color-surface); + border: 1px solid var(--pst-color-border) } .tutorial-card .card-body { - background-color: #F0F0F0; + background-color: var(--pst-color-on-background); } .tutorial-card .badge { - background-color: #130654; + background-color: var(--pst-color-primary); margin: 10px 10px 10px 10px; padding: 5px; } @@ -254,8 +268,9 @@ ul.task-bullet > li > p:first-child { margin: 0px; } + .tutorial-card .gs-badge-link a { - color: white; + color: var(--pst-color-primary-text); text-decoration: none; } diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index 452c7d20ff5df..1145177898737 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -5,40 +5,48 @@ --pst-color-info: 23, 162, 184; } +table { + width: auto; /* Override fit-content which breaks Styler user guide ipynb */ +} + /* Main index page overview cards */ .intro-card { - background: #fff; - border-radius: 0; padding: 30px 10px 20px 10px; - margin: 10px 0px; } -.intro-card p.card-text { - margin: 0px; -} - -.intro-card .card-img-top { +.intro-card .sd-card-img-top { margin: 10px; height: 52px; + background: none !important; } -.intro-card .card-header { - border: none; - background-color:white; - color: #150458 !important; +.intro-card .sd-card-title { + color: var(--pst-color-primary); font-size: var(--pst-font-size-h5); - font-weight: bold; - padding: 2.5rem 0rem 0.5rem 0rem; + padding: 1rem 0rem 0.5rem 0rem; } -.intro-card .card-footer { - border: none; - background-color:white; +.intro-card .sd-card-footer { + border: none !important; } -.intro-card .card-footer p.card-text{ +.intro-card .sd-card-footer p.sd-card-text { max-width: 220px; margin-left: 
auto; margin-right: auto; } + +.intro-card .sd-btn-secondary { + background-color: #6c757d !important; + border-color: #6c757d !important; +} + +.intro-card .sd-btn-secondary:hover { + background-color: #5a6268 !important; + border-color: #545b62 !important; +} + +.card, .card img { + background-color: var(--pst-color-background); +} diff --git a/doc/source/_static/eval-perf-small.png b/doc/source/_static/eval-perf-small.png deleted file mode 100644 index d86018363ffdc..0000000000000 Binary files a/doc/source/_static/eval-perf-small.png and /dev/null differ diff --git a/doc/source/_static/eval-perf.png b/doc/source/_static/eval-perf.png index 14c69c1b85d9e..ed92337c1d995 100644 Binary files a/doc/source/_static/eval-perf.png and b/doc/source/_static/eval-perf.png differ diff --git a/doc/source/_static/index_api.svg b/doc/source/_static/index_api.svg index 70bf0d3504b1a..69f7ba1d2d114 100644 --- a/doc/source/_static/index_api.svg +++ b/doc/source/_static/index_api.svg @@ -64,29 +64,29 @@ inkscape:connector-curvature="0" id="path899" d="M 324.96812,187.09499 H 303.0455 v 72.1639 h 22.67969" - style="fill:none;stroke:#150458;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> + style="fill:none;stroke:#459DB9;stroke-width:10;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> diff --git a/doc/source/_static/index_getting_started.svg b/doc/source/_static/index_getting_started.svg index d00e462427193..2d36622cb7e55 100644 --- a/doc/source/_static/index_getting_started.svg +++ b/doc/source/_static/index_getting_started.svg @@ -58,7 +58,7 @@ id="layer1" transform="translate(2.9219487,-8.5995374)"> diff --git a/doc/source/_static/index_user_guide.svg b/doc/source/_static/index_user_guide.svg index a567103af5918..bd170535170a3 100644 --- a/doc/source/_static/index_user_guide.svg +++ b/doc/source/_static/index_user_guide.svg @@ -58,7 +58,7 @@ id="layer1" transform="translate(141.8903,-20.32143)"> + + + + + + + + \ No newline at end of file diff --git a/doc/source/_static/logo_sql.svg b/doc/source/_static/logo_sql.svg index 4a5b7d0b1b943..38b3b2c726214 100644 --- a/doc/source/_static/logo_sql.svg +++ b/doc/source/_static/logo_sql.svg @@ -58,10 +58,10 @@ d="m 18.846017,1.608 c -0.497,-0.326 -1.193,-0.615 -2.069,-0.858 -1.742,-0.484 -4.05,-0.75 -6.498,-0.75 -2.4480004,0 -4.7560004,0.267 -6.4980004,0.75 -0.877,0.243 -1.573,0.532 -2.069,0.858 -0.619,0.407 -0.93299996,0.874 -0.93299996,1.391 v 12 c 0,0.517 0.31399996,0.985 0.93299996,1.391 0.497,0.326 1.193,0.615 2.069,0.858 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.751 0.877,-0.243 1.573,-0.532 2.069,-0.858 0.619,-0.406 0.933,-0.874 0.933,-1.391 v -12 c 0,-0.517 -0.314,-0.985 -0.933,-1.391 z M 4.0490166,1.713 c 1.658,-0.46 3.87,-0.714 6.2300004,-0.714 2.36,0 4.573,0.254 6.23,0.714 1.795,0.499 2.27,1.059 2.27,1.286 0,0.227 -0.474,0.787 -2.27,1.286 -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 
-2.27,-1.059 -2.27,-1.286 0,-0.227 0.474,-0.787 2.27,-1.286 z M 16.509017,16.285 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 v -2.566 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.751 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z m 0,-4 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 V 8.433 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.75 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z m 0,-4 c -1.658,0.46 -3.87,0.714 -6.23,0.714 -2.3600004,0 -4.5730004,-0.254 -6.2300004,-0.714 -1.795,-0.499 -2.27,-1.059 -2.27,-1.286 V 4.433 c 0.492,0.309 1.164,0.583 2.002,0.816 1.742,0.484 4.05,0.75 6.4980004,0.75 2.448,0 4.756,-0.267 6.498,-0.75 0.838,-0.233 1.511,-0.507 2.002,-0.816 v 2.566 c 0,0.227 -0.474,0.787 -2.27,1.286 z" id="path2" inkscape:connector-curvature="0" - style="fill:#000000" /> + style="fill:#888888" /> str: # this method gives an error/warning for the accessors, therefore # overriding it (accessor has no arguments) return "" @@ -581,7 +595,7 @@ class AccessorCallableDocumenter(AccessorLevelDocumenter, MethodDocumenter): priority = 0.5 def format_name(self): - return MethodDocumenter.format_name(self).rstrip(".__call__") + return MethodDocumenter.format_name(self).removesuffix(".__call__") class PandasAutosummary(Autosummary): @@ -626,7 +640,7 @@ def get_items(self, names): # based on numpy doc/source/conf.py -def linkcode_resolve(domain, info): +def linkcode_resolve(domain, info) -> str | None: """ Determine the URL corresponding to Python object """ @@ -653,12 +667,20 @@ def linkcode_resolve(domain, info): try: fn = inspect.getsourcefile(inspect.unwrap(obj)) except TypeError: - fn = None + try: # property + fn = inspect.getsourcefile(inspect.unwrap(obj.fget)) + except (AttributeError, TypeError): + fn = None if not fn: return None try: source, lineno = inspect.getsourcelines(obj) + except TypeError: + try: # property + source, lineno = inspect.getsourcelines(obj.fget) + except (AttributeError, TypeError): + lineno = None except OSError: lineno = None @@ -680,12 +702,12 @@ def linkcode_resolve(domain, info): # remove the docstring of the flags attribute (inherited from numpy ndarray) # because these give doc build errors (see GH issue 5331) -def remove_flags_docstring(app, what, name, obj, options, lines): +def remove_flags_docstring(app, what, name, obj, options, lines) -> None: if what == "attribute" and name.endswith(".flags"): del lines[:] -def process_class_docstrings(app, what, name, obj, options, lines): +def process_class_docstrings(app, what, name, obj, options, lines) -> None: """ For those classes for which we use :: @@ -737,7 +759,7 @@ def process_class_docstrings(app, what, name, obj, options, lines): ] -def process_business_alias_docstrings(app, what, name, obj, options, lines): +def process_business_alias_docstrings(app, what, name, obj, options, lines) -> None: """ Starting with sphinx 3.4, the "autodoc-process-docstring" event also gets called for alias classes. 
This results in numpydoc adding the @@ -760,7 +782,7 @@ def process_business_alias_docstrings(app, what, name, obj, options, lines): suppress_warnings.append("ref.ref") -def rstjinja(app, docname, source): +def rstjinja(app, docname, source) -> None: """ Render our pages as a jinja template for fancy templating goodness. """ @@ -773,7 +795,7 @@ def rstjinja(app, docname, source): source[0] = rendered -def setup(app): +def setup(app) -> None: app.connect("source-read", rstjinja) app.connect("autodoc-process-docstring", remove_flags_docstring) app.connect("autodoc-process-docstring", process_class_docstrings) @@ -783,3 +805,46 @@ def setup(app): app.add_autodocumenter(AccessorMethodDocumenter) app.add_autodocumenter(AccessorCallableDocumenter) app.add_directive("autosummary", PandasAutosummary) + + +# Ignore list for broken links, found in CI runs of broken-linkcheck.yml + +linkcheck_ignore = [ + "^http://$", + "^https://$", + *[ + re.escape(link) + for link in [ + "http://scatterci.github.io/pydata/pandas", + "http://specs.frictionlessdata.io/json-table-schema/", + "https://crates.io/crates/calamine", + "https://devguide.python.org/setup/#macos", + "https://en.wikipedia.org/wiki/Imputation_statistics", + "https://en.wikipedia.org/wiki/Imputation_(statistics", + "https://github.com/noatamir/pandas-dev", + "https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1", + "https://github.com/pandas-dev/pandas/blob/v0.20.2/pandas/core/generic.py#L568", + "https://github.com/pandas-dev/pandas/blob/v0.20.2/pandas/core/frame.py#L1495", + "https://github.com/pandas-dev/pandas/issues/174151", + "https://gitpod.io/#https://github.com/USERNAME/pandas", + "https://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/", + "https://matplotlib.org/api/axes_api.html#matplotlib.axes.Axes.table", + "https://nipunbatra.github.io/blog/visualisation/2013/05/01/aggregation-timeseries.html", + "https://nbviewer.ipython.org/gist/metakermit/5720498", + "https://numpy.org/doc/stable/user/basics.byteswapping.html", + "https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking", + "https://pandas.pydata.org/pandas-docs/stable/ecosystem.html", + "https://sqlalchemy.readthedocs.io/en/latest/dialects/index.html", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000245912.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000214639.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002283942.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a000245965.htm", + "https://support.sas.com/documentation/cdl/en/imlug/66845/HTML/default/viewer.htm#imlug_langref_sect455.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002284668.htm", + "https://support.sas.com/documentation/cdl/en/lrdict/64316/HTML/default/viewer.htm#a002978282.htm", + "https://wesmckinney.com/blog/update-on-upcoming-pandas-v0-10-new-file-parser-other-performance-wins/", + "https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022", + "pandas.zip", + ] + ], +] diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst new file mode 100644 index 0000000000000..ab8294b8f135a --- /dev/null +++ b/doc/source/development/community.rst @@ -0,0 +1,123 @@ +..
_community: + +===================== +Contributor community +===================== + +pandas is a community-driven open source project developed by a large group +of `contributors `_ +and a smaller group of `maintainers `_. +The pandas leadership has made a strong commitment to creating an open, +inclusive, and positive community. Please read the pandas `Code of Conduct +`_ for guidance on how to +interact with others in a way that makes the community thrive. + +We offer several meetings and communication channels to share knowledge and +connect with others within the pandas community. + +Community meeting +----------------- + +The pandas Community Meeting is a regular sync meeting for the project's +maintainers which is open to the community. Everyone is welcome to attend and +contribute to conversations. + +The meetings take place on the second and fourth Wednesdays of each month at 18:00 UTC. + +The minutes of past meetings are available in `this Google Document `__. + + +New contributor meeting +----------------------- + +On the third Wednesday of the month, we hold meetings to welcome and support +new contributors in our community. + +| 👋 you all are invited +| 💬 everyone can present (add yourself to the hackMD agenda) +| 👀 anyone can sit in and listen + +Attendees are new and experienced contributors, as well as a few maintainers. +We aim to answer questions about getting started, or help with work in +progress when possible, as well as get to know each other and share our +learnings and experiences. + +The agenda for the next meeting and minutes of past meetings are available in +`this HackMD `__. + +Calendar +-------- + +This calendar shows all the community meetings. Our community meetings are +ideal for anyone wanting to contribute to pandas, or just curious to know how +current development is going. + +.. raw:: html + + + +You can subscribe to this calendar with the following links: + +* `iCal `__ +* `Google calendar `__ + +Additionally, we'll sometimes have one-off meetings on specific topics. +These will be published on the same calendar. + +`GitHub issue tracker `_ +---------------------------------------------------------------------- + +The pandas contributor community conducts conversations mainly via this channel. +Any community member can open issues to: + +- Report bugs, e.g. "I noticed the behavior of a certain function is + incorrect" +- Request features, e.g. "I would like this error message to be more readable" +- Request documentation improvements, e.g. "I found this section unclear" +- Ask questions, e.g. "I noticed the behavior of a certain function + changed between versions. Is this expected?". + + Ideally, your questions should be related to how pandas works rather + than how you use pandas. `StackOverflow `_ is + better suited for answering usage questions, and we ask that all usage + questions are first asked on StackOverflow. Thank you for respecting our + time and wishes. 🙇 + +Maintainers and frequent contributors might also open issues to discuss the +ongoing development of the project. For example: + +- Report issues with the CI, GitHub Actions, or the performance of pandas +- Open issues relating to the internals +- Start roadmap discussion aligning on proposals for what to do in future + releases or changes to the API. 
+- Open issues relating to the project's website, logo, or governance + +The developer mailing list +-------------------------- + +The pandas mailing list `pandas-dev@python.org `_ is used for long form +conversations and to engage people in the wider community who might not +be active on the issue tracker but we would like to include in discussions. + +Join the mailing list and view the archives `here `_. + +.. _community.slack: + +Community slack +--------------- + +We have a chat platform for contributors, maintainers and potential +contributors. This is not a space for user questions, rather for questions about +contributing to pandas. The slack is a private space, specifically meant for +people who are hesitant to bring up their questions or ideas on a large public +mailing list or GitHub. + +If this sounds like the right place for you, you are welcome to join using +`this link `_! +Please remember to follow our `Code of Conduct `_, +and be aware that our admins are monitoring for irrelevant messages and will remove folks who use +our +slack for spam, advertisements and messages not related to the pandas contributing community. And +please remember that slack is not meant to replace the mailing list or issue tracker - all important +announcements and conversations should still happen there. diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index e76197e302ca4..4d99f282aa695 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -9,215 +9,181 @@ Contributing to pandas .. contents:: Table of contents: :local: -Where to start? -=============== All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. -If you are brand new to pandas or open-source development, we recommend going -through the `GitHub "issues" tab `_ -to find issues that interest you. There are a number of issues listed under `Docs -`_ -and `good first issue -`_ -where you could start out. Once you've found an interesting issue, you can -return here to get your development environment setup. - -When you start working on an issue, it's a good idea to assign the issue to yourself, -so nobody else duplicates the work on it. GitHub restricts assigning issues to maintainers -of the project only. In most projects, and until recently in pandas, contributors added a -comment letting others know they are working on an issue. While this is ok, you need to -check each issue individually, and it's not possible to find the unassigned ones. - -For this reason, we implemented a workaround consisting of adding a comment with the exact -text ``take``. When you do it, a GitHub action will automatically assign you the issue -(this will take seconds, and may require refreshing the page to see it). -By doing this, it's possible to filter the list of issues and find only the unassigned ones. - -So, a good way to find an issue to start contributing to pandas is to check the list of -`unassigned good first issues `_ -and assign yourself one you like by writing a comment with the exact text ``take``. - -If for whatever reason you are not able to continue working with the issue, please try to -unassign it, so other people know it's available again. You can check the list of -assigned issues, since people may not be working in them anymore. 
If you want to work on one +that is assigned, feel free to kindly ask the current assignee if you can take it +(please allow at least a week of inactivity before considering work in the issue discontinued). - -Feel free to ask questions on the `mailing list -`_ or on `Gitter`_. - .. _contributing.bug_reports: Bug reports and enhancement requests ==================================== -Bug reports are an important part of making pandas more stable. Having a complete bug report -will allow others to reproduce the bug and provide insight into fixing. See -`this stackoverflow article `_ and -`this blogpost `_ -for tips on writing a good bug report. - -Trying the bug-producing code out on the *main* branch is often a worthwhile exercise -to confirm the bug still exists. It is also worth searching existing bug reports and pull requests -to see if the issue has already been reported and/or fixed. +Bug reports and enhancement requests are an important part of making pandas more stable and +are curated through GitHub issues. When reporting an issue or request, please select the `appropriate +category and fill out the issue form fully `_ +to ensure others and the core development team can fully understand the scope of the issue. -Bug reports must: - -#. Include a short, self-contained Python snippet reproducing the problem. - You can format the code nicely by using `GitHub Flavored Markdown - `_:: +The issue will then show up to the pandas community and be open to comments/ideas from others. - ```python - >>> from pandas import DataFrame - >>> df = DataFrame(...) - ... - ``` +Finding an issue to contribute to +================================= -#. Include the full version string of pandas and its dependencies. You can use the built-in function:: +If you are brand new to pandas or open-source development, we recommend searching +the `GitHub "issues" tab `_ +to find issues that interest you. Unassigned issues labeled `Docs +`_ +and `good first issue +`_ +are typically good for newer contributors. - >>> import pandas as pd - >>> pd.show_versions() +Once you've found an interesting issue, it's a good idea to assign the issue to yourself, +so nobody else duplicates the work on it. On the GitHub issue, write a comment with the exact +text ``take`` to automatically assign yourself the issue +(this will take seconds and may require refreshing the page to see it). -#. Explain why the current behavior is wrong/not desired and what you expect instead. +If for whatever reason you are not able to continue working on the issue, please +unassign it, so other people know it's available again. You can check the list of +assigned issues, since people may not be working on them anymore. If you want to work on one +that is assigned, feel free to kindly ask the current assignee if you can take it +(please allow at least a week of inactivity before considering work on the issue discontinued). -The issue will then show up to the pandas community and be open to comments/ideas from others. +We have several :ref:`contributor community ` communication channels, which you are +welcome to join, and ask questions as you figure things out. Among them are regular meetings for +new contributors, dev meetings, a dev mailing list, and a Slack for the contributor community. +All pandas contributors are welcome to these spaces, where they can connect with each other.
Even +maintainers who have been with us for a long time felt just like you when they started out, and +are happy to welcome you and support you as you get to know how we work, and where things are. +Take a look at the next sections to learn more. .. _contributing.github: -Working with the code -===================== - -Now that you have an issue you want to fix, enhancement to add, or documentation to improve, -you need to learn how to work with GitHub and the pandas code base. +Submitting a pull request +========================= .. _contributing.version_control: Version control, Git, and GitHub -------------------------------- -To the new user, working with Git is one of the more daunting aspects of contributing to pandas. -It can very quickly become overwhelming, but sticking to the guidelines below will help keep the process -straightforward and mostly trouble free. As always, if you are having difficulties please -feel free to ask for help. - -The code is hosted on `GitHub `_. To -contribute you will need to sign up for a `free GitHub account +pandas is hosted on `GitHub `_, and to +contribute, you will need to sign up for a `free GitHub account `_. We use `Git `_ for version control to allow many people to work together on the project. -Some great resources for learning Git: +If you are new to Git, you can reference some of these resources for learning Git. Feel free to reach out +to the :ref:`contributor community ` for help if needed: + +* `Git documentation `_. + +Also, the project follows a forking workflow further described on this page whereby +contributors fork the repository, make changes and then create a pull request. +So please be sure to read and follow all the instructions in this guide. -* the `GitHub help pages `_. -* the `NumPy documentation `_. -* Matthew Brett's `Pydagogue `_. +If you are new to contributing to projects through forking on GitHub, +take a look at the `GitHub documentation for contributing to projects `_. +GitHub provides a quick tutorial using a test repository that may help you become more familiar +with forking a repository, cloning a fork, creating a feature branch, pushing changes and +making pull requests. + +Below are some useful resources for learning more about forking and pull requests on GitHub: + +* the `GitHub documentation for forking a repo `_. +* the `GitHub documentation for collaborating with pull requests `_. +* the `GitHub documentation for working with forks `_. Getting started with Git ------------------------ -`GitHub has instructions `__ for installing git, +`GitHub has instructions `__ for installing git, setting up your SSH key, and configuring git. All these steps need to be completed before you can work seamlessly between your local repository and GitHub. .. _contributing.forking: -Forking -------- +Create a fork of pandas +----------------------- -You will need your own fork to work on the code. Go to the `pandas project -page `_ and hit the ``Fork`` button. You will -want to clone your fork to your machine:: +You will need your own copy of pandas (aka fork) to work on the code. Go to the `pandas project +page `_ and hit the ``Fork`` button. Please uncheck the box to copy only the main branch before selecting ``Create Fork``. +You will want to clone your fork to your machine + +.. 
code-block:: shell git clone https://github.com/your-user-name/pandas.git pandas-yourname cd pandas-yourname git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream This creates the directory ``pandas-yourname`` and connects your repository to the upstream (main project) *pandas* repository. -Note that performing a shallow clone (with ``--depth==N``, for some ``N`` greater -or equal to 1) might break some tests and features as ``pd.show_versions()`` -as the version number cannot be computed anymore. +.. note:: -Creating a branch ------------------ + Performing a shallow clone (with ``--depth==N``, for some ``N`` greater + or equal to 1) might break some tests and features as ``pd.show_versions()`` + as the version number cannot be computed anymore. -You want your main branch to reflect only production-ready code, so create a -feature branch for making your changes. For example:: +Creating a feature branch +------------------------- - git branch shiny-new-feature - git checkout shiny-new-feature +Your local ``main`` branch should always reflect the current state of the pandas repository. +First ensure it's up-to-date with the main pandas repository. -The above can be simplified to:: +.. code-block:: shell - git checkout -b shiny-new-feature + git checkout main + git pull upstream main --ff-only -This changes your working directory to the shiny-new-feature branch. Keep any -changes in this branch specific to one bug or feature so it is clear -what the branch brings to pandas. You can have many shiny-new-features -and switch in between them using the git checkout command. +Then, create a feature branch for making your changes. For example -When creating this branch, make sure your main branch is up to date with -the latest upstream main version. To update your local main branch, you -can do:: + git checkout -b shiny-new-feature +This changes your working branch from ``main`` to the ``shiny-new-feature`` branch. Keep any +changes in this branch specific to one bug or feature so it is clear +what the branch brings to pandas. You can have many feature branches +and switch in between them using the ``git checkout`` command. When you want to update the feature branch with changes in main after you created the branch, check the section on :ref:`updating a PR `. -Contributing your changes to pandas -===================================== .. _contributing.commit-code: -Committing your code -------------------- +Making code changes +------------------- -Keep style fixes to a separate commit to make your pull request more readable. +Before modifying any code, ensure you follow the :ref:`contributing environment ` +guidelines to set up an appropriate development environment. -Once you've made changes, you can see them by typing:: +Once you have made code changes, you can see all the changes you've made by running + +.. code-block:: shell git status -If you have created a new file, it is not being tracked by git. Add it by typing:: +For files you intend to modify or add, run + +.. code-block:: shell - git add path/to/file-to-be-added.py + git add path/to/file-to-be-added-or-changed.py -Doing 'git status' again should give something like:: +Running ``git status`` again should display - # On branch shiny-new-feature - # - # modified: /relative/path/to/file-you-added.py - # +.. code-block:: shell -Finally, commit your changes to your local repository with an explanatory message.
pandas -uses a convention for commit message prefixes and layout. Here are -some common prefixes along with general guidelines for when to use them: + On branch shiny-new-feature -* ENH: Enhancement, new functionality -* BUG: Bug fix -* DOC: Additions/updates to documentation -* TST: Additions/updates to tests -* BLD: Updates to the build process/scripts -* PERF: Performance improvement -* TYP: Type annotations -* CLN: Code cleanup + modified: /relative/path/to/file-to-be-added-or-changed.py -The following defines how a commit message should be structured. Please reference the -relevant GitHub issues in your commit message using GH1234 or #1234. Either style -is fine, but the former is generally preferred: -* a subject line with ``< 80`` chars. -* One blank line. -* Optionally, a commit message body. +Finally, commit your changes to your local repository with an explanatory commit +message -Now you can commit your changes in your local repository:: +.. code-block:: shell - git commit -m + git commit -m "your commit message goes here" .. _contributing.push-code: @@ -225,17 +191,23 @@ Pushing your changes -------------------- When you want your changes to appear publicly on your GitHub page, push your -forked feature branch's commits:: +forked feature branch's commits + +.. code-block:: shell git push origin shiny-new-feature Here ``origin`` is the default name given to your remote repository on GitHub. -You can see the remote repositories:: +You can see the remote repositories + +.. code-block:: shell git remote -v If you added the upstream repository as described above you will see something -like:: +like + +.. code-block:: shell origin git@github.com:yourname/pandas.git (fetch) origin git@github.com:yourname/pandas.git (push) @@ -245,33 +217,32 @@ like:: Now your code is on GitHub, but it is not yet a part of the pandas project. For that to happen, a pull request needs to be submitted on GitHub. -Review your code ----------------- -When you're ready to ask for a code review, file a pull request. Before you do, once -again make sure that you have followed all the guidelines outlined in this document -regarding code style, tests, performance tests, and documentation. You should also -double check your branch changes against the branch it was based on: +Making a pull request +--------------------- -#. Navigate to your repository on GitHub -- https://github.com/your-user-name/pandas -#. Click on ``Branches`` -#. Click on the ``Compare`` button for your feature branch -#. Select the ``base`` and ``compare`` branches, if necessary. This will be ``main`` and - ``shiny-new-feature``, respectively. +Once you have finished your code changes, they will need to follow the +:ref:`pandas contribution guidelines ` to be successfully accepted. -Finally, make the pull request ------------------------------- -If everything looks good, you are ready to make a pull request. A pull request is how -code from a local repository becomes available to the GitHub community and can be looked -at and eventually merged into the main version. This pull request and its associated -changes will eventually be committed to the main branch and available in the next -release. To submit a pull request: +If everything looks good, you are ready to make a pull request. A pull request is how +code from your local repository becomes available to the GitHub community to be reviewed +and merged into the project, appearing in the next release. To submit a pull request: #. Navigate to your repository on GitHub -#. 
Click on the ``Pull Request`` button +#. Click on the ``Compare & pull request`` button #. You can then click on ``Commits`` and ``Files Changed`` to make sure everything looks okay one last time +#. Write a descriptive title that includes prefixes. pandas uses a convention for title + prefixes. Here are some common ones along with general guidelines for when to use them: + + * ENH: Enhancement, new functionality + * BUG: Bug fix + * DOC: Additions/updates to documentation + * TST: Additions/updates to tests + * BLD: Updates to the build process/scripts + * PERF: Performance improvement + * TYP: Type annotations + * CLN: Code cleanup + #. Write a description of your changes in the ``Preview Discussion`` tab #. Click ``Send Pull Request``. @@ -284,20 +255,13 @@ Updating your pull request -------------------------- Based on the review you get on your pull request, you will probably need to make -some changes to the code. In that case, you can make them in your branch, -add a new commit to that branch, push it to GitHub, and the pull request will be -automatically updated. Pushing them to GitHub again is done by:: +some changes to the code. You can follow the :ref:`code committing steps ` +again to address any feedback and update your pull request. - git push origin shiny-new-feature - -This will automatically update your pull request with the latest code and restart the -:any:`Continuous Integration ` tests. +It is also important that updates in the pandas ``main`` branch are reflected in your pull request. +To update your feature branch with changes in the pandas ``main`` branch, run: -Another reason you might need to update your pull request is to solve conflicts -with changes that have been merged into the main branch since you opened your -pull request. - -To do this, you need to "merge upstream main" in your branch:: +.. code-block:: shell git checkout shiny-new-feature git fetch upstream @@ -309,58 +273,62 @@ default commit message will open, and you can simply save and quit this file. If there are merge conflicts, you need to solve those conflicts. See for example at https://help.github.com/articles/resolving-a-merge-conflict-using-the-command-line/ for an explanation on how to do this. -Once the conflicts are merged and the files where the conflicts were solved are -added, you can run ``git commit`` to save those fixes. -If you have uncommitted changes at the moment you want to update the branch with -main, you will need to ``stash`` them prior to updating (see the -`stash docs `__). -This will effectively store your changes and they can be reapplied after updating. +Once the conflicts are resolved, run: -After the feature branch has been update locally, you can now update your pull -request by pushing to the branch on GitHub:: +#. ``git add -u`` to stage any files you've updated; +#. ``git commit`` to finish the merge. - git push origin shiny-new-feature +.. note:: -Autofixing formatting errors ----------------------------- + If you have uncommitted changes at the moment you want to update the branch with + ``main``, you will need to ``stash`` them prior to updating (see the + `stash docs `__). + This will effectively store your changes and they can be reapplied after updating. -We use several styling checks (e.g. ``black``, ``flake8``, ``isort``) which are run after -you make a pull request. 
+After the feature branch has been updated locally, you can now update your pull request by pushing to the branch on GitHub: -To automatically fix formatting errors on each commit you make, you can -set up pre-commit yourself. First, create a Python :ref:`environment -` and then set up :ref:`pre-commit `. +.. code-block:: shell -Delete your merged branch (optional) ------------------------------------- + git push origin shiny-new-feature -Once your feature branch is accepted into upstream, you'll probably want to get rid of -the branch. First, merge upstream main into your branch so git knows it is safe to -delete your branch:: +Any ``git push`` will automatically update your pull request with your branch's changes +and restart the :ref:`Continuous Integration ` checks. - git fetch upstream - git checkout main - git merge upstream/main +.. _contributing.update-dev: -Then you can do:: +Updating the development environment ------------------------------------ - git branch -d shiny-new-feature +It is important to periodically update your local ``main`` branch with updates from the pandas ``main`` +branch and update your development environment to reflect any changes to the various packages that +are used during development. -Make sure you use a lower-case ``-d``, or else git won't warn you if your feature -branch has not actually been merged. +If using :ref:`conda `, run: -The branch will still exist on GitHub, so to delete it there do:: +.. code-block:: shell - git push origin --delete shiny-new-feature + git checkout main + git fetch upstream + git merge upstream/main + conda activate pandas-dev + conda env update -f environment.yml --prune + +If using :ref:`pip `, run: -.. _Gitter: https://gitter.im/pydata/pandas +.. code-block:: shell + git checkout main + git fetch upstream + git merge upstream/main + # activate the virtual environment based on your platform + python -m pip install --upgrade -r requirements-dev.txt Tips for a successful pull request ================================== -If you have made it to the `Review your code`_ phase, one of the core contributors may +If you have made it to the `Making a pull request`_ phase, one of the core contributors may take a look. Please note however that a handful of people are responsible for reviewing all of the contributions, which can often lead to bottlenecks. diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index c74c44fb1d5f0..45d4e24b0df51 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -18,40 +18,32 @@ tools will be run to check your code for stylistic errors. Generating any warnings will cause the test to fail. Thus, good style is a requirement for submitting code to pandas. -There is a tool in pandas to help contributors verify their changes before -contributing them to the project:: +There are a couple of tools in pandas to help contributors verify their changes +before contributing to the project - ./ci/code_checks.sh - -The script validates the doctests, formatting in docstrings, and -imported modules. It is possible to run the checks independently by using the -parameters ``docstring``, ``code``, and ``doctests`` -(e.g. ``./ci/code_checks.sh doctests``). +- ``./ci/code_checks.sh``: a script that validates the doctests, formatting in docstrings, + and imported modules. It is possible to run the checks independently by using the + parameters ``docstrings``, ``code``, and ``doctests`` + (e.g. ``./ci/code_checks.sh doctests``; see the sketch below); +- ``pre-commit``, which we go into detail on in the next section.
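For instance, a sketch of invoking just one of the independent checks named above (any of the listed parameters can be substituted):

.. code-block:: shell

    # run only the docstring validation portion of the script
    ./ci/code_checks.sh docstrings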
In addition, because a lot of people use our library, it is important that we do not make sudden changes to the code that could have the potential to break a lot of user code; that is, we need it to be as *backwards compatible* as possible to avoid mass breakages. -In addition to ``./ci/code_checks.sh``, some extra checks (including static type -checking) are run by ``pre-commit`` - see :ref:`here ` -for how to run them. .. _contributing.pre-commit: Pre-commit ---------- Additionally, :ref:`Continuous Integration ` will run code formatting checks -like ``black``, ``flake8`` (including a `pandas-dev-flaker `_ plugin), -``isort``, and ``cpplint`` and more using `pre-commit hooks `_ +like ``ruff``, +``isort``, and ``clang-format`` and more using `pre-commit hooks `_. Any warnings from these checks will cause the :ref:`Continuous Integration ` to fail; therefore, it is helpful to run the check yourself before submitting code. This -can be done by installing ``pre-commit``:: - - pip install pre-commit - -and then running:: +can be done by installing ``pre-commit`` (which should already have happened if you followed the instructions +in :ref:`Setting up your development environment `) and then running:: pre-commit install @@ -63,17 +55,22 @@ remain up-to-date with our code checks as they change. Note that if needed, you can skip these checks with ``git commit --no-verify``. If you don't want to use ``pre-commit`` as part of your workflow, you can still use it -to run its checks with:: +to run its checks with one of the following:: pre-commit run --files + pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files without needing to have done ``pre-commit install`` beforehand. -If you want to run checks on all recently committed files on upstream/main you can use:: +Finally, we also have some slow pre-commit checks, which don't run on each commit +but which do run during continuous integration. You can trigger them manually with:: - pre-commit run --from-ref=upstream/main --to-ref=HEAD --all-files + pre-commit run --hook-stage manual --all-files -without needing to have done ``pre-commit install`` beforehand. +.. note:: + + You may want to periodically run ``pre-commit gc`` to clean up repos + which are no longer used. .. note:: @@ -84,6 +81,12 @@ without needing to have done ``pre-commit install`` beforehand. you may run into issues if you're using conda. To solve this, you can downgrade ``virtualenv`` to version ``20.0.33``. +.. note:: + + If you have recently merged in main from the upstream branch, some of the + dependencies used by ``pre-commit`` may have changed. Make sure to + :ref:`update your development environment `. + Optional dependencies --------------------- @@ -122,6 +125,7 @@ Otherwise, you need to do it manually: .. code-block:: python import warnings + from pandas.util._exceptions import find_stack_level def old_func(): @@ -130,7 +134,11 @@ Otherwise, you need to do it manually: .. deprecated:: 1.1.0 Use new_func instead. """ - warnings.warn('Use new_func instead.', FutureWarning, stacklevel=2) + warnings.warn( + 'Use new_func instead.', + FutureWarning, + stacklevel=find_stack_level(), + ) new_func() @@ -154,43 +162,9 @@ pandas strongly encourages the use of :pep:`484` style type hints. New development Style guidelines ~~~~~~~~~~~~~~~~ -Type imports should follow the ``from typing import ...`` convention. 
Some types do not need to be imported since :pep:`585` some builtin constructs, such as ``list`` and ``tuple``, can directly be used for type annotations. So rather than - -.. code-block:: python - - import typing - - primes: typing.List[int] = [] - -You should write - -.. code-block:: python - - primes: list[int] = [] - -``Optional`` should be avoided in favor of the shorter ``| None``, so instead of - -.. code-block:: python - - from typing import Union - - maybe_primes: list[Union[int, None]] = [] - -or - -.. code-block:: python - - from typing import Optional - - maybe_primes: list[Optional[int]] = [] - -You should write - -.. code-block:: python - - from __future__ import annotations # noqa: F404 - - maybe_primes: list[int | None] = [] +Type imports should follow the ``from typing import ...`` convention. +Your code may be automatically re-written to use some modern constructs (e.g. using the built-in ``list`` instead of ``typing.List``) +by the :ref:`pre-commit checks `. In some cases in the code base classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that without your annotation. For example, if you come across a definition like @@ -224,7 +198,7 @@ In some cases you may be tempted to use ``cast`` from the typing module when you obj = cast(str, obj) # Mypy complains without this! return obj.upper() -The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable +The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 `_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable .. code-block:: python @@ -256,13 +230,21 @@ This module will ultimately house types for repeatedly used concepts like "path- Validating type hints ~~~~~~~~~~~~~~~~~~~~~ -pandas uses `mypy `_ and `pyright `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running +pandas uses `mypy `_ and `pyright `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are consistent by running .. code-block:: shell - pre-commit run --hook-stage manual --all-files + pre-commit run --hook-stage manual --all-files mypy + pre-commit run --hook-stage manual --all-files pyright + pre-commit run --hook-stage manual --all-files pyright_reportGeneralTypeIssues + # the following might fail if the installed pandas version does not correspond to your local git version + pre-commit run --hook-stage manual --all-files stubtest + +in your python environment. -in your activated python environment. A recent version of ``numpy`` (>=1.22.0) is required for type validation. +.. warning:: + + * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. 
Please see :ref:`how to set up the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. .. _contributing.ci: @@ -271,7 +253,7 @@ Testing type hints in code using pandas .. warning:: - * Pandas is not yet a py.typed library (:pep:`561`)! + * pandas is not yet a py.typed library (:pep:`561`)! The primary purpose of locally declaring pandas as a py.typed library is to test and improve the pandas-builtin type annotations. @@ -316,6 +298,12 @@ So, before actually writing any code, you should write your tests. Often the te taken from the original GitHub issue. However, it is always worth considering additional use cases and writing corresponding tests. +We use `code coverage `_ to help understand +the amount of code which is covered by a test. We recommend striving to ensure code +you add or change within pandas is covered by a test. Please see our +`code coverage dashboard through Codecov `_ +for more information. + Adding tests is one of the most common requests after code is pushed to pandas. Therefore, it is worth getting in the habit of writing tests ahead of time so this is never an issue. @@ -324,7 +312,22 @@ Writing tests All tests should go into the ``tests`` subdirectory of the specific package. This folder contains many current examples of tests, and we suggest looking to these for -inspiration. Ideally, there should be one, and only one, obvious place for a test to reside. +inspiration. + +As a general tip, you can use the search functionality in your integrated development +environment (IDE) or the ``git grep`` command in a terminal to find test files in which the method +is called. If you are unsure of the best location to put your test, take your best guess, +but note that reviewers may request that you move the test to a different location. + +To use git grep, you can run the following command in a terminal: + +``git grep "function_name("`` + +This will search through all files in your repository for the text ``function_name(``. +This can be a useful way to quickly locate the function in the +codebase and determine the best location to add a test for it. + +Ideally, there should be one, and only one, obvious place for a test to reside. Until we reach that ideal, these are some rules of thumb for where a test should be located. @@ -341,7 +344,7 @@ be located. - tests.scalar - tests.tseries.offsets -2. Does your test depend only on code in pd._libs? +2. Does your test depend only on code in ``pd._libs``? This test likely belongs in one of: - tests.libs @@ -459,6 +462,12 @@ be located. - tests.io + .. note:: + + This includes ``to_string`` but excludes ``__repr__``, which is + tested in ``tests.frame.test_repr`` and ``tests.series.test_repr``. + Other classes often have a ``test_formats`` file. + C) Otherwise This test likely belongs in one of: @@ -478,7 +487,7 @@ be located. 8) Is your test for one of the pandas-provided ExtensionArrays (``Categorical``, ``DatetimeArray``, ``TimedeltaArray``, ``PeriodArray``, ``IntervalArray``, - ``PandasArray``, ``FloatArray``, ``BoolArray``, ``StringArray``)? + ``NumpyExtensionArray``, ``FloatArray``, ``BoolArray``, ``StringArray``)? 
This test likely belongs in one of: - tests.arrays @@ -531,7 +540,7 @@ If a test is known to fail but the manner in which it fails is not meant to be captured, use ``pytest.mark.xfail``. It is common to use this method for a test that exhibits buggy behavior or a non-implemented feature. If the failing test has flaky behavior, use the argument ``strict=False``. This -will make it so pytest does not fail if the test happens to pass. +will make it so pytest does not fail if the test happens to pass. Using ``strict=False`` is highly undesirable; please use it only as a last resort. Prefer the decorator ``@pytest.mark.xfail`` and the argument ``pytest.param`` over usage within a test so that the test is appropriately marked during the @@ -543,7 +552,7 @@ xfail during the testing phase. To do so, use the ``request`` fixture: def test_xfail(request): mark = pytest.mark.xfail(raises=TypeError, reason="Indicate why here") - request.node.add_marker(mark) + request.applymarker(mark) xfail is not to be used for tests involving failure due to invalid user arguments. For these tests, we need to verify the correct exception type and error message is being raised, using ``pytest.raises`` instead. @@ -554,11 +563,12 @@ is being raised, using ``pytest.raises`` instead. Testing a warning ^^^^^^^^^^^^^^^^^ -Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning. +Use ``tm.assert_produces_warning`` as a context manager to check that a block of code raises a warning +and specify the warning message using the ``match`` argument. .. code-block:: python - with tm.assert_produces_warning(DeprecationWarning): + with tm.assert_produces_warning(DeprecationWarning, match="the warning message"): pd.deprecated_function() If a warning should specifically not happen in a block of code, pass ``False`` into the context manager. @@ -579,16 +589,6 @@ ignore the error. def test_thing(self): pass -If you need finer-grained control, you can use Python's -`warnings module `__ -to control whether a warning is ignored or raised at different places within -a single test. - -.. code-block:: python - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - Testing an exception ^^^^^^^^^^^^^^^^^^^^ @@ -603,41 +603,36 @@ with the specific exception subclass (i.e. never use :py:class:`Exception`) and Testing involving files ^^^^^^^^^^^^^^^^^^^^^^^ -The ``tm.ensure_clean`` context manager creates a temporary file for testing, -with a generated filename (or your filename if provided), that is automatically -deleted when the context block is exited. +The ``temp_file`` pytest fixture creates a temporary file :py:class:`pathlib.Path` object for testing: .. code-block:: python - with tm.ensure_clean('my_file_path') as path: - # do something with the path + def test_something(temp_file): + pd.DataFrame([1]).to_csv(str(temp_file)) + +Please reference `pytest's documentation `_ +for the file retention policy.
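A slightly fuller sketch of the same fixture (the test name and frame contents here are hypothetical, and the usual test-module imports ``import pandas as pd`` and ``import pandas._testing as tm`` are assumed) that writes a file and reads it back:

.. code-block:: python

    def test_csv_roundtrip(temp_file):
        # write a small frame through the temporary path and read it back
        df = pd.DataFrame({"a": [1, 2, 3]})
        df.to_csv(temp_file, index=False)
        result = pd.read_csv(temp_file)
        tm.assert_frame_equal(result, df)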
Testing involving network connectivity ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and -lack of ownership of the server that is being connected to. If network connectivity is absolutely required, use the -``tm.network`` decorator. - -.. code-block:: python - - @tm.network # noqa - def test_network(): - result = package.call_to_internet() - -If the test requires data from a specific website, specify ``check_before_test=True`` and the site in the decorator. +A unit test should not access a public data set over the internet due to flakiness of network connections and +lack of ownership of the server that is being connected to. To mock this interaction, use the ``httpserver`` fixture from the +`pytest-localserver plugin `_ with synthetic data. .. code-block:: python - @tm.network("https://www.somespecificsite.com", check_before_test=True) - def test_network(): - result = pd.read_html("https://www.somespecificsite.com") + @pytest.mark.network + @pytest.mark.single_cpu + def test_network(httpserver): + httpserver.serve_content(content="content") + result = pd.read_html(httpserver.url) Example ^^^^^^^ Here is an example of a self-contained set of tests in a file ``pandas/tests/test_cool_feature.py`` -that illustrate multiple features that we like to use. Please remember to add the Github Issue Number +that illustrate multiple features that we like to use. Please remember to add the GitHub Issue Number as a comment to a new test. .. code-block:: python @@ -754,6 +749,7 @@ preferred if the inputs or logic are simple, with Hypothesis tests reserved for cases with complex logic or where there are too many combinations of options or subtle interactions to test (or think of!) all of them. +.. _contributing.running_tests: Running the test suite ---------------------- @@ -763,6 +759,14 @@ install pandas) by typing:: pytest pandas +.. note:: + + If a handful of tests don't pass, it may not be an issue with your pandas installation. + Some tests (e.g. some SQLAlchemy ones) require additional setup, others might start + failing because a non-pinned library released a new version, and others might be flaky + if run in parallel. As long as you can import pandas from your locally built version, + your installation is probably fine and you can start contributing! + Often it is worth running only a subset of tests first around your changes before running the entire suite. @@ -776,25 +780,71 @@ Or with one of the following constructs:: pytest pandas/tests/[test-module].py pytest pandas/tests/[test-module].py::[TestClass] pytest pandas/tests/[test-module].py::[TestClass]::[test_method] -Using `pytest-xdist `_, one can -speed up local testing on multicore machines. To use this feature, you will -need to install ``pytest-xdist`` via:: +Using `pytest-xdist `_, which is +included in our 'pandas-dev' environment, one can speed up local testing on +multicore machines. The ``-n`` flag can then be specified when running +pytest to parallelize a test run across the specified number of cores, or ``auto`` to +utilize all the available cores on your machine. + +.. code-block:: bash + + # Utilize 4 cores + pytest -n 4 pandas + + # Utilizes all available cores + pytest -n auto pandas + +If you'd like to speed things along further, a more advanced use of this +command would look like this
+.. code-block:: bash + + pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX + +In addition to the multithreaded performance increase, this improves test +speed by skipping some tests using the ``-m`` mark flag: + +- slow: any test taking long (think seconds rather than milliseconds) +- network: tests requiring network connectivity +- db: tests requiring a database (mysql or postgres) +- single_cpu: tests that should run on a single cpu only + +You might want to enable the following option if it's relevant for you: + +- arm_slow: any test taking long on arm64 architecture + +These markers are defined `in this toml file `_, under ``[tool.pytest.ini_options]`` in a list called ``markers``, in case +you want to check if new ones have been created which are of interest to you. + +The ``-r`` report flag will display short summary info (see the `pytest +documentation `_). Here we are displaying the number of: + +- s: skipped tests +- x: xfailed tests +- X: xpassed tests - pip install pytest-xdist +The summary is optional and can be removed if you don't need the added +information. Using the parallelization option can significantly reduce the +time it takes to locally run tests before submitting a pull request. -Two scripts are provided to assist with this. These scripts distribute -testing across 4 threads. +If you require assistance with the results, +which has happened in the past, please set a seed before running the command +and open a bug report; that way we can reproduce it. Here's an example +for setting a seed on Windows -On Unix variants, one can type:: +.. code-block:: bash - test_fast.sh + set PYTHONHASHSEED=314159265 + pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX -On Windows, one can type:: +On Unix, use - test_fast.bat +.. code-block:: bash -This can significantly reduce the time it takes to locally run tests before -submitting a pull request. + export PYTHONHASHSEED=314159265 + pytest pandas -n 4 -m "not slow and not network and not db and not single_cpu" -r sxX For more, see the `pytest `_ documentation. @@ -814,7 +864,7 @@ performance regressions. pandas is in the process of migrating to `asv benchmarks `__ to enable easy monitoring of the performance of critical pandas operations. These benchmarks are all found in the ``pandas/asv_bench`` directory, and the -test results can be found `here `__. +test results can be found `here `__. To use all features of asv, you will need either ``conda`` or ``virtualenv``. For more details please check the `asv installation @@ -906,9 +956,9 @@ directive is used. The sphinx syntax for that is: .. code-block:: rst - .. versionadded:: 1.1.0 + .. versionadded:: 2.1.0 -This will put the text *New in version 1.1.0* wherever you put the sphinx +This will put the text *New in version 2.1.0* wherever you put the sphinx directive. This should also be put in the docstring when adding a new function or method (`example `__) or a new keyword argument (`example `__). diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index a87d8d5ad44bf..59d7957275e15 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -67,8 +67,6 @@ case of pandas, the NumPy docstring convention is followed. 
These conventions are explained in this document: * `numpydoc docstring guide `_ - (which is based in the original `Guide to NumPy/SciPy documentation - `_) numpydoc is a Sphinx extension to support the NumPy docstring convention. @@ -144,7 +142,7 @@ backticks. The following are considered inline code: With several mistakes in the docstring. - It has a blank like after the signature ``def func():``. + It has a blank line after the signature ``def func():``. The text 'Some function' should go in the line after the opening quotes of the docstring, not in the same line. @@ -654,9 +652,9 @@ A simple example could be: Examples -------- - >>> s = pd.Series(['Ant', 'Bear', 'Cow', 'Dog', 'Falcon', + >>> ser = pd.Series(['Ant', 'Bear', 'Cow', 'Dog', 'Falcon', ... 'Lion', 'Monkey', 'Rabbit', 'Zebra']) - >>> s.head() + >>> ser.head() 0 Ant 1 Bear 2 Cow @@ -666,7 +664,7 @@ A simple example could be: With the ``n`` parameter, we can change the number of returned rows: - >>> s.head(n=3) + >>> ser.head(n=3) 0 Ant 1 Bear 2 Cow @@ -697,10 +695,10 @@ and avoiding aliases. Avoid excessive imports, but if needed, imports from the standard library go first, followed by third-party libraries (like matplotlib). -When illustrating examples with a single ``Series`` use the name ``s``, and if +When illustrating examples with a single ``Series`` use the name ``ser``, and if illustrating with a single ``DataFrame`` use the name ``df``. For indices, ``idx`` is the preferred name. If a set of homogeneous ``Series`` or -``DataFrame`` is used, name them ``s1``, ``s2``, ``s3``... or ``df1``, +``DataFrame`` is used, name them ``ser1``, ``ser2``, ``ser3``... or ``df1``, ``df2``, ``df3``... If the data is not homogeneous, and more than one structure is needed, name them with something meaningful, for example ``df_main`` and ``df_to_join``. @@ -733,8 +731,8 @@ positional arguments ``head(3)``. Examples -------- - >>> s = pd.Series([1, 2, 3]) - >>> s.mean() + >>> ser = pd.Series([1, 2, 3]) + >>> ser.mean() 2 """ pass @@ -746,8 +744,8 @@ positional arguments ``head(3)``. Examples -------- - >>> s = pd.Series([1, np.nan, 3]) - >>> s.fillna(0) + >>> ser = pd.Series([1, np.nan, 3]) + >>> ser.fillna(0) [1, 0, 3] """ pass @@ -758,10 +756,10 @@ positional arguments ``head(3)``. Examples -------- - >>> s = pd.Series([380., 370., 24., 26], + >>> ser = pd.Series([380., 370., 24., 26], ... name='max_speed', ... index=['falcon', 'falcon', 'parrot', 'parrot']) - >>> s.groupby_mean() + >>> ser.groupby_mean() index falcon 375.0 parrot 25.0 @@ -778,8 +776,8 @@ positional arguments ``head(3)``. Examples -------- - >>> s = pd.Series('Antelope', 'Lion', 'Zebra', np.nan) - >>> s.contains(pattern='a') + >>> ser = pd.Series('Antelope', 'Lion', 'Zebra', np.nan) + >>> ser.contains(pattern='a') 0 False 1 False 2 True @@ -802,7 +800,7 @@ positional arguments ``head(3)``. We can fill missing values in the output using the ``na`` parameter: - >>> s.contains(pattern='a', na=False) + >>> ser.contains(pattern='a', na=False) 0 False 1 False 2 True @@ -922,8 +920,8 @@ plot will be generated automatically when building the documentation. .. plot:: :context: close-figs - >>> s = pd.Series([1, 2, 3]) - >>> s.plot() + >>> ser = pd.Series([1, 2, 3]) + >>> ser.plot() """ pass @@ -941,8 +939,8 @@ Each shared docstring will have a base template with variables, like Finally, docstrings can also be appended to with the ``doc`` decorator. In this example, we'll create a parent docstring normally (this is like -``pandas.core.generic.NDFrame``. 
Then we'll have two children (like -``pandas.core.series.Series`` and ``pandas.core.frame.DataFrame``). We'll +``pandas.core.generic.NDFrame``). Then we'll have two children (like +``pandas.Series`` and ``pandas.DataFrame``). We'll substitute the class names in this docstring. .. code-block:: python @@ -997,5 +995,5 @@ mapping function names to docstrings. Wherever possible, we prefer using ``doc``, since the docstring-writing processes is slightly closer to normal. See ``pandas.core.generic.NDFrame.fillna`` for an example template, and -``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna`` +``pandas.Series.fillna`` and ``pandas.core.generic.frame.fillna`` for the filled versions. diff --git a/doc/source/development/contributing_documentation.rst b/doc/source/development/contributing_documentation.rst index fac6a91ce82f2..443470e6c50f9 100644 --- a/doc/source/development/contributing_documentation.rst +++ b/doc/source/development/contributing_documentation.rst @@ -14,7 +14,7 @@ experts. If something in the docs doesn't make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help the next person. Please visit the `issues page `__ for a full list of issues that are currently open regarding the -Pandas documentation. +pandas documentation. @@ -127,6 +127,7 @@ for some tips and tricks to get the doctests passing. When doing a PR with a docstring update, it is good to post the output of the validation script in a comment on github. +.. _contributing.howto-build-docs: How to build the pandas documentation --------------------------------------- @@ -188,9 +189,7 @@ to speed up the documentation build. You can override this:: python make.py html --num-jobs 4 Open the following file in a web browser to see the full documentation you -just built:: - - doc/build/html/index.html +just built ``doc/build/html/index.html``. And you'll have the satisfaction of seeing your new and improved documentation! diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index c881770aa7584..98bd4b00d016b 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -10,111 +10,52 @@ To test out code changes, you'll need to build pandas from source, which requires a C/C++ compiler and Python environment. If you're making documentation changes, you can skip to :ref:`contributing to the documentation ` but if you skip creating the development environment you won't be able to build the documentation -locally before pushing your changes. +locally before pushing your changes. It's recommended to also install the :ref:`pre-commit hooks `. -.. contents:: Table of contents: - :local: +.. toctree:: + :maxdepth: 2 + :hidden: + contributing_gitpod.rst -Creating an environment using Docker --------------------------------------- +Step 1: install a C compiler +---------------------------- -Instead of manually setting up a development environment, you can use `Docker -`_ to automatically create the environment with just several -commands. pandas provides a ``DockerFile`` in the root directory to build a Docker image -with a full pandas development environment. - -**Docker Commands** - -Build the Docker image:: - - # Build the image pandas-yourname-env - docker build --tag pandas-yourname-env . 
- # Or build the image by passing your GitHub username to use your own fork - docker build --build-arg gh_username=yourname --tag pandas-yourname-env . - -Run Container:: - - # Run a container and bind your local repo to the container - docker run -it -w /home/pandas --rm -v path-to-local-pandas-repo:/home/pandas pandas-yourname-env - -.. note:: - If you bind your local repo for the first time, you have to build the C extensions afterwards. - Run the following command inside the container:: - - python setup.py build_ext -j 4 - - You need to rebuild the C extensions anytime the Cython code in ``pandas/_libs`` changes. - This most frequently occurs when changing or merging branches. - -*Even easier, you can integrate Docker with the following IDEs:* - -**Visual Studio Code** - -You can use the DockerFile to launch a remote session with Visual Studio Code, -a popular free IDE, using the ``.devcontainer.json`` file. -See https://code.visualstudio.com/docs/remote/containers for details. - -**PyCharm (Professional)** - -Enable Docker support and use the Services tool window to build and manage images as well as -run and interact with containers. -See https://www.jetbrains.com/help/pycharm/docker.html for details. - -Creating an environment without Docker ---------------------------------------- - -Installing a C compiler -~~~~~~~~~~~~~~~~~~~~~~~ - -pandas uses C extensions (mostly written using Cython) to speed up certain -operations. To install pandas from source, you need to compile these C -extensions, which means you need a C compiler. This process depends on which -platform you're using. - -If you have setup your environment using ``conda``, the packages ``c-compiler`` -and ``cxx-compiler`` will install a fitting compiler for your platform that is -compatible with the remaining conda packages. On Windows and macOS, you will -also need to install the SDKs as they have to be distributed separately. -These packages will automatically be installed by using the ``pandas`` -``environment.yml`` file. +How to do this will depend on your platform. If you choose to use ``Docker`` or ``GitPod`` +in the next step, then you can skip this step. **Windows** -You will need `Build Tools for Visual Studio 2019 -`_. +You will need `Build Tools for Visual Studio 2022 +`_. -.. warning:: - You DO NOT need to install Visual Studio 2019. - You only need "Build Tools for Visual Studio 2019" found by - scrolling down to "All downloads" -> "Tools for Visual Studio 2019". - In the installer, select the "C++ build tools" workload. - -You can install the necessary components on the commandline using -`vs_buildtools.exe `_: +.. note:: + You DO NOT need to install Visual Studio 2022. + You only need "Build Tools for Visual Studio 2022" found by + scrolling down to "All downloads" -> "Tools for Visual Studio". + In the installer, select the "Desktop development with C++" Workloads. -.. code:: + If you encounter an error indicating ``cl.exe`` is not found when building with Meson, + reopen the installer and also select the optional component + **MSVC v142 - VS 2019 C++ x64/x86 build tools** in the right pane for installation. 
- vs_buildtools.exe --quiet --wait --norestart --nocache ^ - --installPath C:\BuildTools ^ - --add "Microsoft.VisualStudio.Workload.VCTools;includeRecommended" ^ - --add Microsoft.VisualStudio.Component.VC.v141 ^ - --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 ^ - --add Microsoft.VisualStudio.Component.Windows10SDK.17763 +Alternatively, you can install the necessary components on the commandline using +`vs_BuildTools.exe `_ -To setup the right paths on the commandline, call -``"C:\BuildTools\VC\Auxiliary\Build\vcvars64.bat" -vcvars_ver=14.16 10.0.17763.0``. +Alternatively, you could use the `WSL `_ +and consult the ``Linux`` instructions below. **macOS** -To use the ``conda``-based compilers, you will need to install the -Developer Tools using ``xcode-select --install``. Otherwise -information about compiler installation can be found here: +To use the :ref:`conda `-based compilers, you will need to install the +Developer Tools using ``xcode-select --install``. + +If you prefer to use a different compiler, general information can be found here: https://devguide.python.org/setup/#macos **Linux** -For Linux-based ``conda`` installations, you won't have to install any +For Linux-based :ref:`conda ` installations, you won't have to install any additional components outside of the conda environment. The instructions below are only needed if your setup isn't based on conda environments. @@ -128,75 +69,41 @@ which compilers (and versions) are installed on your system:: `GCC (GNU Compiler Collection) `_, is a widely used compiler, which supports C and a number of other languages. If GCC is listed -as an installed compiler nothing more is required. If no C compiler is -installed (or you wish to install a newer version) you can install a compiler -(GCC in the example code below) with:: - - # for recent Debian/Ubuntu: - sudo apt install build-essential - # for Red Had/RHEL/CentOS/Fedora - yum groupinstall "Development Tools" +as an installed compiler nothing more is required. -For other Linux distributions, consult your favorite search engine for -compiler installation instructions. +If no C compiler is installed, or you wish to upgrade, or you're using a different +Linux distribution, consult your favorite search engine for compiler installation/update +instructions. -Let us know if you have any difficulties by opening an issue or reaching out on `Gitter `_. +Let us know if you have any difficulties by opening an issue or reaching out on our contributor +community :ref:`Slack `. -Creating a Python environment -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Step 2: create an isolated environment +---------------------------------------- -Now create an isolated pandas development environment: +Before we begin, please: -* Install either `Anaconda `_, `miniconda - `_, or `miniforge `_ -* Make sure your conda is up to date (``conda update conda``) * Make sure that you have :any:`cloned the repository ` -* ``cd`` to the pandas source directory - -We'll now kick off a three-step process: - -1. Install the build dependencies -2. Build and install pandas -3. Install the optional dependencies - -.. code-block:: none - - # Create and activate the build environment - conda env create -f environment.yml - conda activate pandas-dev - - # or with older versions of Anaconda: - source activate pandas-dev - - # Build and install pandas - python setup.py build_ext -j 4 - python -m pip install -e . 
--no-build-isolation --no-use-pep517 - -At this point you should be able to import pandas from your locally built version:: - - $ python - >>> import pandas - >>> print(pandas.__version__) - 0.22.0.dev0+29.g4ad6d4d74 - -This will create the new environment, and not touch any of your existing environments, -nor any existing Python installation. +* ``cd`` to the pandas source directory you just created with the clone command -To view your environments:: +.. _contributing.conda: - conda info -e +Option 1: using conda (recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To return to your root environment:: +* Install miniforge to get `conda `_ +* Create and activate the ``pandas-dev`` conda environment using the following commands: - conda deactivate +.. code-block:: bash -See the full conda docs `here `__. + conda env create --file environment.yml + conda activate pandas-dev +.. _contributing.pip: -Creating a Python environment (pip) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Option 2: using pip +~~~~~~~~~~~~~~~~~~~ -If you aren't using conda for your development environment, follow these instructions. You'll need to have at least the :ref:`minimum Python version ` that pandas supports. You also need to have ``setuptools`` 51.0.0 or later to build pandas. @@ -215,10 +122,6 @@ You also need to have ``setuptools`` 51.0.0 or later to build pandas. # Install the build dependencies python -m pip install -r requirements-dev.txt - # Build and install pandas - python setup.py build_ext -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - **Unix**/**macOS with pyenv** Consult the docs for setting up pyenv `here `__. @@ -227,11 +130,10 @@ Consult the docs for setting up pyenv `here `__. # Create a virtual environment # Use an ENV_DIR of your choice. We'll use ~/Users//.pyenv/versions/pandas-dev - pyenv virtualenv # For instance: - pyenv virtualenv 3.9.10 pandas-dev + pyenv virtualenv 3.10 pandas-dev # Activate the virtualenv pyenv activate pandas-dev @@ -239,19 +141,15 @@ Consult the docs for setting up pyenv `here `__. # Now install the build dependencies in the cloned pandas repo python -m pip install -r requirements-dev.txt - # Build and install pandas - python setup.py build_ext -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 - **Windows** Below is a brief overview on how to set-up a virtual environment with Powershell under Windows. For details please refer to the -`official virtualenv user guide `__ +`official virtualenv user guide `__. -Use an ENV_DIR of your choice. We'll use ~\\virtualenvs\\pandas-dev where -'~' is the folder pointed to by either $env:USERPROFILE (Powershell) or -%USERPROFILE% (cmd.exe) environment variable. Any parent directories +Use an ENV_DIR of your choice. We'll use ``~\\virtualenvs\\pandas-dev`` where +``~`` is the folder pointed to by either ``$env:USERPROFILE`` (Powershell) or +``%USERPROFILE%`` (cmd.exe) environment variable. Any parent directories should already exist. .. code-block:: powershell @@ -265,6 +163,151 @@ should already exist. # Install the build dependencies python -m pip install -r requirements-dev.txt +Option 3: using Docker +~~~~~~~~~~~~~~~~~~~~~~ + +pandas provides a ``DockerFile`` in the root directory to build a Docker image +with a full pandas development environment. + +**Docker Commands** + +Build the Docker image:: + + # Build the image + docker build -t pandas-dev . 
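If you would like to confirm that the image was created before running it (an optional sanity check, not part of the original steps), you can list it:

.. code-block:: shell

    # lists images whose repository name matches pandas-dev
    docker images pandas-dev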
+ +Run Container:: + + # Run a container and bind your local repo to the container + # This command assumes you are running from your local repo + # but if not alter ${PWD} to match your local repo path + docker run -it --rm -v ${PWD}:/home/pandas pandas-dev + +*Even easier, you can integrate Docker with the following IDEs:* + +**Visual Studio Code** + +You can use the DockerFile to launch a remote session with Visual Studio Code, +a popular free IDE, using the ``.devcontainer.json`` file. +See https://code.visualstudio.com/docs/remote/containers for details. + +**PyCharm (Professional)** + +Enable Docker support and use the Services tool window to build and manage images as well as +run and interact with containers. +See https://www.jetbrains.com/help/pycharm/docker.html for details. + +Option 4: using Gitpod +~~~~~~~~~~~~~~~~~~~~~~ + +Gitpod is an open-source platform that automatically creates the correct development +environment right in your browser, reducing the need to install local development +environments and deal with incompatible dependencies. + +If you are a Windows user, unfamiliar with using the command line or building pandas +for the first time, it is often faster to build with Gitpod. Here are the in-depth instructions +for :ref:`building pandas with GitPod `. + +Step 3: build and install pandas +-------------------------------- + +There are currently two supported ways of building pandas: pip/meson and setuptools (``setup.py``). +Historically, pandas has only supported using setuptools to build pandas. However, this method +requires a lot of convoluted code in ``setup.py`` and also has many issues in compiling pandas in parallel +due to limitations in setuptools. + +The newer build system invokes the meson backend through pip (via a `PEP 517 `_ build). +It automatically uses all available cores on your CPU, and also avoids the need for manual rebuilds by +rebuilding automatically whenever pandas is imported (with an editable install). + +For these reasons, you should compile pandas with meson. +Because the meson build system is newer, you may find bugs/minor issues as it matures. You can report these bugs +`here `_. + +To compile pandas with meson, run:: + # Build and install pandas - python setup.py build_ext -j 4 - python -m pip install -e . --no-build-isolation --no-use-pep517 + # By default, this will print verbose output + # showing the "rebuild" taking place on import (see section below for explanation) + # If you do not want to see this, omit everything after --no-build-isolation + python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true + +.. note:: + The version number is pulled from the latest repository tag. Be sure to fetch the latest tags from upstream + before building:: + + # set the upstream repository, if not done already, and fetch the latest tags + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream --tags + +**Build options** + +It is possible to pass options from the pip frontend to the meson backend if you would like to configure your +install. Occasionally, you'll want to use this to adjust the build directory, and/or toggle debug/optimization levels. + +You can pass a build directory to pandas by appending ``-Cbuilddir="your builddir here"`` to your pip command. +This option allows you to configure where meson stores your built C extensions, and allows for fast rebuilds. + +Sometimes, it might be useful to compile pandas with debugging symbols when debugging C extensions. +Appending ``-Csetup-args="-Ddebug=true"`` will do the trick.
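For example, a sketch of an editable install with debug symbols enabled, mirroring the meson invocation shown earlier (adjust the flags to your needs):

.. code-block:: shell

    # hypothetical debug build; the C extensions become debuggable, at the cost of slower binaries
    python -m pip install -ve . --no-build-isolation -Csetup-args="-Ddebug=true"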
+With pip, it is possible to chain together multiple config settings (for example specifying both a build directory +and building with debug symbols would look like +``-Cbuilddir="your builddir here" -Csetup-args="-Dbuildtype=debug"``). + +**Compiling pandas with setup.py** + +.. note:: + This method of compiling pandas will be deprecated and removed very soon, as the meson backend matures. + +To compile pandas with setuptools, run:: + + python setup.py develop + +.. note:: + If pandas is already installed (via meson), you have to uninstall it first:: + + python -m pip uninstall pandas + +This is because ``python setup.py develop`` will not uninstall the loader script that ``meson-python`` +uses to import the extension from the build folder, which may cause errors such as a +``FileNotFoundError`` to be raised. + +.. note:: + You will need to repeat this step each time the C extensions change, for example + if you modified any file in ``pandas/_libs`` or if you did a fetch and merge from ``upstream/main``. + +**Checking the build** + +At this point you should be able to import pandas from your locally built version:: + + $ python + >>> import pandas + >>> print(pandas.__version__) # note: the exact output may differ + 2.0.0.dev0+880.g2b9e661fbb.dirty + + +At this point you may want to try +`running the test suite `_. + +**Keeping up to date with the latest build** + +When building pandas with meson, importing pandas will automatically trigger a rebuild, even when C/Cython files are modified. +By default, no output will be produced by this rebuild (the import will just take longer). If you would like to see meson's +output when importing pandas, you can set the environment variable ``MESONPY_EDITABLE_VERBOSE``. For example, this would be:: + + # On Linux/macOS + MESONPY_EDITABLE_VERBOSE=1 python + + # Windows + set MESONPY_EDITABLE_VERBOSE=1 # Only need to set this once per session + python + +If you would like to see this verbose output every time, you can set the ``editable-verbose`` config setting to ``true`` like so:: + + python -m pip install -ve . -Ceditable-verbose=true + +.. tip:: + If you ever find yourself wondering whether setuptools or meson was used to build your pandas, + you can check the value of ``pandas._built_with_meson``, which will be true if meson was used + to compile pandas. diff --git a/doc/source/development/contributing_gitpod.rst b/doc/source/development/contributing_gitpod.rst new file mode 100644 index 0000000000000..b70981b4d307d --- /dev/null +++ b/doc/source/development/contributing_gitpod.rst @@ -0,0 +1,274 @@ +.. _contributing-gitpod: + +Using Gitpod for pandas development +=================================== + +This section of the documentation will guide you through: + +* using Gitpod for your pandas development environment +* creating a personal fork of the pandas repository on GitHub +* a quick tour of pandas and VSCode +* working on the pandas documentation in Gitpod + +Gitpod +------ + +`Gitpod`_ is an open-source platform for automated and ready-to-code +development environments. It enables developers to describe their dev +environment as code and start instant and fresh development environments for +each new task directly from the browser. This reduces the need to install local +development environments and deal with incompatible dependencies. 
+ + +Gitpod GitHub integration +------------------------- + +To be able to use Gitpod, you will need to have the Gitpod app installed on your +GitHub account, so if +you do not have an account yet, you will need to create one first. + +To get started, just log in at `Gitpod`_ and grant the appropriate permissions to GitHub. + +We have built a Python 3.10 environment and all development dependencies will +install when the environment starts. + + +Forking the pandas repository +----------------------------- + +The best way to work on pandas as a contributor is by making a fork of the +repository first. + +#. Browse to the `pandas repository on GitHub`_ and `create your own fork`_. + +#. Browse to your fork. Your fork will have a URL like + https://github.com/noatamir/pandas-dev, except with your GitHub username in place of + ``noatamir``. + +Starting Gitpod +--------------- +Once you have authenticated to Gitpod through GitHub, you can install the +`Gitpod Chromium or Firefox browser extension `_ +which will add a **Gitpod** button next to the **Code** button in the +repository: + +.. image:: ./gitpod-imgs/pandas-github.png + :alt: pandas repository with Gitpod button screenshot + +#. If you install the extension, you can click the **Gitpod** button to start + a new workspace. + +#. Alternatively, if you do not want to install the browser extension, you can + visit https://gitpod.io/#https://github.com/USERNAME/pandas replacing + ``USERNAME`` with your GitHub username. + +#. In both cases, this will open a new tab on your web browser and start + building your development environment. Please note this can take a few + minutes. + +#. Once the build is complete, you will be directed to your workspace, + including the VSCode editor and all the dependencies you need to work on + pandas. The first time you start your workspace, you will notice that there + might be some actions running. This will ensure that you have a development + version of pandas installed. + +#. When your workspace is ready, you can :ref:`test the build` by + entering:: + + $ python -m pytest pandas + + Note that this command takes a while to run, so once you've confirmed it's running you may want to cancel it using Ctrl-C. + +Quick workspace tour +-------------------- +Gitpod uses VSCode as the editor. If you have not used this editor before, you +can check the Getting started `VSCode docs`_ to familiarize yourself with it. + +Your workspace will look similar to the image below: + +.. image:: ./gitpod-imgs/gitpod-workspace.png + :alt: Gitpod workspace screenshot + +We have marked some important sections in the editor: + +#. Your current Python interpreter - by default, this is ``pandas-dev`` and + should be displayed in the status bar and on your terminal. You do not need + to activate the conda environment as this will always be activated for you. +#. Your current branch is always displayed in the status bar. You can also use + this button to change or create branches. +#. GitHub Pull Requests extension - you can use this to work with Pull Requests + from your workspace. +#. Marketplace extensions - we have added some essential extensions to the pandas + Gitpod. Still, you can also install other extensions or syntax highlighting + themes for your user, and these will be preserved for you. +#. Your workspace directory - by default, it is ``/workspace/pandas-dev``. **Do not + change this** as this is the only directory preserved in Gitpod. 
+
+We have also pre-installed a few tools and VSCode extensions to help with the
+development experience:
+
+* `VSCode rst extension `_
+* `Markdown All in One `_
+* `VSCode GitLens extension `_
+* `VSCode Git Graph extension `_
+
+Development workflow with Gitpod
+--------------------------------
+The :ref:`contributing` section of this documentation contains
+information regarding the pandas development workflow. Make sure to check this
+before working on your contributions.
+
+When using Gitpod, git is pre-configured for you:
+
+#. You do not need to configure your git username and email, as this should be
+   done for you when you authenticated through GitHub, unless you are using the
+   GitHub feature to keep your email address private. You can check the git
+   configuration with the command ``git config --list`` in your terminal. Use
+   ``git config --global user.email "your-secret-email@users.noreply.github.com"``
+   to set your email address to the one you use to make commits with your GitHub
+   profile.
+#. As you started your workspace from your own pandas fork, you will by default
+   have both ``upstream`` and ``origin`` added as remotes. You can verify this by
+   typing ``git remote`` on your terminal or by clicking on the **branch name**
+   on the status bar (see image below).
+
+   .. image:: ./gitpod-imgs/pandas-gitpod-branches.png
+      :alt: Gitpod workspace branches plugin screenshot
+
+Rendering the pandas documentation
+----------------------------------
+You can find the detailed documentation on how rendering the documentation with
+Sphinx works in the :ref:`contributing.howto-build-docs` section. To build the full
+docs you need to run the following commands in the ``doc`` directory::
+
+   $ cd doc
+   $ python make.py html
+
+Alternatively you can build a single page with::
+
+   python make.py --single development/contributing_gitpod.rst
+
+You have two main options to render the documentation in Gitpod.
+
+Option 1: using Liveserve
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+#. View the documentation in ``pandas/doc/build/html``.
+#. To see the rendered version of a page, you can right-click on the ``.html``
+   file and click on **Open with Live Serve**. Alternatively, you can open the
+   file in the editor and click on the **Go live** button on the status bar.
+
+   .. image:: ./gitpod-imgs/vscode-statusbar.png
+      :alt: Gitpod workspace VSCode start live serve screenshot
+
+#. A simple browser will open to the right-hand side of the editor. We recommend
+   closing it and clicking on the **Open in browser** button in the pop-up.
+#. To stop the server click on the **Port: 5500** button on the status bar.
+
+Option 2: using the rst extension
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A quick and easy way to see live changes in a ``.rst`` file as you work on it
+is to use the rst extension with docutils.
+
+.. note:: This will generate a simple live preview of the document without the
+   ``html`` theme, and some backlinks might not be added correctly. But it is an
+   easy and lightweight way to get instant feedback on your work, without
+   building the html files.
+
+#. Open any of the source documentation files located in ``doc/source`` in the
+   editor.
+#. Open the VSCode Command Palette with :kbd:`Cmd-Shift-P` on Mac or
+   :kbd:`Ctrl-Shift-P` on Linux and Windows. Start typing "restructured"
+   and choose either "Open preview" or "Open preview to the Side".
+
+   .. image:: ./gitpod-imgs/vscode-rst.png
+      :alt: Gitpod workspace VSCode open rst screenshot
+
+#. As you work on the document, you will see a live rendering of it in the editor.
+
+   .. image:: ./gitpod-imgs/rst-rendering.png
+      :alt: Gitpod workspace VSCode rst rendering screenshot
+
+If you want to see the final output with the ``html`` theme you will need to
+rebuild the docs with ``make html`` and use Live Serve as described in option 1.
+
+FAQ's and troubleshooting
+-------------------------
+
+How long is my Gitpod workspace kept for?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Your stopped workspace will be kept for 14 days and deleted afterwards if you do
+not use it.
+
+Can I come back to a previous workspace?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Yes, let's say you stepped away for a while and you want to carry on working on
+your pandas contributions. You need to visit https://gitpod.io/workspaces and
+click on the workspace you want to spin up again. All your changes will be there
+as you last left them.
+
+Can I install additional VSCode extensions?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Absolutely! Any extensions you install will be installed in your own workspace
+and preserved.
+
+I registered on Gitpod but I still cannot see a ``Gitpod`` button in my repositories.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Head to https://gitpod.io/integrations and make sure you are logged in.
+Hover over GitHub and click on the three buttons that appear on the right.
+Click on edit permissions and make sure you have ``user:email``,
+``read:user``, and ``public_repo`` checked. Click on **Update Permissions**
+and confirm the changes in the GitHub application page.
+
+.. image:: ./gitpod-imgs/gitpod-edit-permissions-gh.png
+   :alt: Gitpod integrations - edit GH permissions screenshot
+
+How long does my workspace stay active if I'm not using it?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you keep your workspace open in a browser tab but don't interact with it,
+it will shut down after 30 minutes. If you close the browser tab, it will
+shut down after 3 minutes.
+
+My terminal is blank - there is no cursor and it's completely unresponsive
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Unfortunately, this is a known issue on Gitpod's side. You can work around this
+issue in two ways:
+
+#. Create a new Gitpod workspace altogether.
+#. Head to your `Gitpod dashboard `_ and locate
+   the running workspace. Hover on it and click on the **three dots menu**
+   and then click on **Stop**. When the workspace is completely stopped you
+   can click on its name to restart it again.
+
+.. image:: ./gitpod-imgs/gitpod-dashboard-stop.png
+   :alt: Gitpod dashboard and workspace menu screenshot
+
+I authenticated through GitHub but I still cannot commit to the repository through Gitpod.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Head to https://gitpod.io/integrations and make sure you are logged in.
+Hover over GitHub and click on the three buttons that appear on the right.
+Click on edit permissions and make sure you have ``public_repo`` checked.
+Click on **Update Permissions** and confirm the changes in the
+GitHub application page.
+
+.. image:: ./gitpod-imgs/gitpod-edit-permissions-repo.png
+   :alt: Gitpod integrations - edit GH repository permissions screenshot
+
+Acknowledgments
+---------------
+
+This page is lightly adapted from the `NumPy`_ project.
+
+.. _Gitpod: https://www.gitpod.io/
+.. _pandas repository on GitHub: https://github.com/pandas-dev/pandas
+.. _create your own fork: https://help.github.com/en/articles/fork-a-repo
+.. _VSCode docs: https://code.visualstudio.com/docs/getstarted/tips-and-tricks
+.. _NumPy: https://www.numpy.org/
diff --git a/doc/source/development/copy_on_write.rst b/doc/source/development/copy_on_write.rst
new file mode 100644
index 0000000000000..9a2309b8a77a3
--- /dev/null
+++ b/doc/source/development/copy_on_write.rst
@@ -0,0 +1,42 @@
+.. _copy_on_write_dev:
+
+{{ header }}
+
+*************
+Copy on write
+*************
+
+Copy on Write is a mechanism to simplify the indexing API and improve
+performance by avoiding copies where possible.
+CoW means that any DataFrame or Series derived from another in any way always
+behaves as a copy. An explanation on how to use Copy on Write efficiently can be
+found :ref:`here `.
+
+Reference tracking
+------------------
+
+To be able to determine if we have to make a copy when writing into a DataFrame,
+we have to be aware if the values are shared with another DataFrame. pandas
+internally keeps track of all ``Blocks`` that share values with another block to
+be able to tell when a copy needs to be triggered. The reference tracking
+mechanism is implemented on the Block level.
+
+We use a custom reference tracker object, ``BlockValuesRefs``, that keeps
+track of every block whose values share memory with each other. The reference
+is held through a weak reference. Every pair of blocks that share some memory should
+point to the same ``BlockValuesRefs`` object. If one block goes out of
+scope, the reference to this block dies. As a consequence, the reference tracker
+object always knows how many blocks are alive and share memory.
+
+Whenever a :class:`DataFrame` or :class:`Series` object is sharing data with another
+object, it is required that each of those objects has its own BlockManager and Block
+objects. In other words, one Block instance (that is held by a DataFrame, not
+necessarily by intermediate objects) should always be uniquely used for only
+a single DataFrame/Series object. For example, when you want to use the same
+Block for another object, you can create a shallow copy of the Block instance
+with ``block.copy(deep=False)`` (which will create a new Block instance with
+the same underlying values and which will correctly set up the references).
+
+We can ask the reference tracking object if there is another block alive that shares
+data with us before writing into the values, and trigger a copy before
+writing if there is in fact another block alive.
diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst
index 7ba2091e18853..0ea1c112cb55b 100644
--- a/doc/source/development/debugging_extensions.rst
+++ b/doc/source/development/debugging_extensions.rst
@@ -6,88 +6,58 @@ Debugging C extensions
 ======================
 
-Pandas uses select C extensions for high performance IO operations. In case you need to debug segfaults or general issues with those extensions, the following steps may be helpful.
+pandas uses Cython and C/C++ `extension modules `_ to optimize performance. Unfortunately, the standard Python debugger does not allow you to step into these extensions. Cython extensions can be debugged with the `Cython debugger `_ and C/C++ extensions can be debugged using the tools shipped with your platform's compiler.
 
-First, be sure to compile the extensions with the appropriate flags to generate debug symbols and remove optimizations. This can be achieved as follows:
+For Python developers with limited or no C/C++ experience this can seem a daunting task. Core developer Will Ayd has written a 3 part blog series to help guide you from the standard Python debugger into these other tools:
 
-.. code-block:: sh
-
-   python setup.py build_ext --inplace -j4 --with-debugging-symbols
-
-Using a debugger
-================
-
-Assuming you are on a Unix-like operating system, you can use either lldb or gdb to debug. The choice between either is largely dependent on your compilation toolchain - typically you would use lldb if using clang and gdb if using gcc. For macOS users, please note that ``gcc`` is on modern systems an alias for ``clang``, so if using Xcode you usually opt for lldb. Regardless of which debugger you choose, please refer to your operating systems instructions on how to install.
-
-After installing a debugger you can create a script that hits the extension module you are looking to debug. For demonstration purposes, let's assume you have a script called ``debug_testing.py`` with the following contents:
-
-.. code-block:: python
-
-   import pandas as pd
-
-   pd.DataFrame([[1, 2]]).to_json()
-
-Place the ``debug_testing.py`` script in the project root and launch a Python process under your debugger. If using lldb:
-
-.. code-block:: sh
-
-   lldb python
-
-If using gdb:
+   1. `Fundamental Python Debugging Part 1 - Python `_
+   2. `Fundamental Python Debugging Part 2 - Python Extensions `_
+   3. `Fundamental Python Debugging Part 3 - Cython Extensions `_
 
-.. code-block:: sh
-
-   gdb python
+Debugging locally
+-----------------
 
-Before executing our script, let's set a breakpoint in our JSON serializer in its entry function called ``objToJSON``. The lldb syntax would look as follows:
-
-.. code-block:: sh
+By default building pandas from source will generate a release build. To generate a development build you can type::
 
-   breakpoint set --name objToJSON
-
-Similarly for gdb:
-
-.. code-block:: sh
-
-   break objToJSON
+   pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug"
 
 .. note::
-   You may get a warning that this breakpoint cannot be resolved in lldb. gdb may give a similar warning and prompt you to make the breakpoint on a future library load, which you should say yes to. This should only happen on the very first invocation as the module you wish to debug has not yet been loaded into memory.
-
-Now go ahead and execute your script:
+   conda environments update CFLAGS/CPPFLAGS with flags that are geared towards generating releases. If using conda, you may need to set ``CFLAGS="$CFLAGS -O0"`` and ``CPPFLAGS="$CPPFLAGS -O0"`` to ensure optimizations are turned off for debugging.
 
-.. code-block:: sh
+By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit it altogether if you do not care to separate build types.
 
-   run .py
+Using Docker
+------------
 
-Code execution will halt at the breakpoint defined or at the occurrence of any segfault. LLDB's `GDB to LLDB command map `_ provides a listing of debugger command that you can execute using either debugger.
+To simplify the debugging process, pandas has created a Docker image with a debug build of Python and the gdb/Cython debuggers pre-installed.
 You may either ``docker pull pandas/pandas-debug`` to get access to this image or build it from the ``tooling/debug`` folder locally.
 
-Another option to execute the entire test suite under lldb would be to run the following:
+You can then mount your pandas repository into this image via:
 
 .. code-block:: sh
 
-   lldb -- python -m pytest
+   docker run --rm -it -w /data -v ${PWD}:/data pandas/pandas-debug
 
-Or for gdb
+Inside the image, you can use meson to build/install pandas and place the build artifacts into a ``debug`` folder using a command as follows:
 
 .. code-block:: sh
 
-   gdb --args python -m pytest
+   python -m pip install -ve . --no-build-isolation -Cbuilddir="debug" -Csetup-args="-Dbuildtype=debug"
 
-Once the process launches, simply type ``run`` and the test suite will begin, stopping at any segmentation fault that may occur.
+If you plan to use cygdb, the files required by that application are placed within the build folder, so you have to first ``cd`` to the build folder and then start that application.
 
-Checking memory leaks with valgrind
-===================================
+.. code-block:: sh
 
-You can use `Valgrind `_ to check for and log memory leaks in extensions. For instance, to check for a memory leak in a test from the suite you can run:
+   cd debug
+   cygdb
 
-.. code-block:: sh
+Within the debugger you can use `cygdb commands `_ to navigate Cython extensions.
 
-   PYTHONMALLOC=malloc valgrind --leak-check=yes --track-origins=yes --log-file=valgrind-log.txt python -m pytest
+Editor support
+--------------
 
-Note that code execution under valgrind will take much longer than usual. While you can run valgrind against extensions compiled with any optimization level, it is suggested to have optimizations turned off from compiled extensions to reduce the amount of false positives. The ``--with-debugging-symbols`` flag passed during package setup will do this for you automatically.
+The meson build system generates a `compilation database `_ automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-definition and error checking support as you type.
 
-.. note::
+How each language server / IDE chooses to look for the compilation database may vary. When in doubt you may want to create a symlink at the root of the project that points to the compilation database in your build directory. Assuming you used *debug* as your directory name, you can run::
 
-   For best results, you should run use a Python installation configured with Valgrind support (--with-valgrind)
+   ln -s debug/compile_commands.json .
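+
+If you want to double check that the database you are pointing your tooling at really
+came from a debug build, one quick sanity check is to look for the debug flags in one
+of its entries. The snippet below is only a sketch, not part of the pandas tooling: it
+assumes you used *debug* as your build directory, and it handles both the ``command``
+and ``arguments`` forms that compilation database entries may use::
+
+   import json
+   from pathlib import Path
+
+   # each entry describes how a single translation unit is compiled
+   entry = json.loads(Path("debug/compile_commands.json").read_text())[0]
+   flags = entry.get("command") or " ".join(entry.get("arguments", []))
+   print(entry["file"])
+   print("has debug symbols (-g):", "-g" in flags)
+   print("optimizations disabled (-O0):", "-O0" in flags)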
diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst
index 6de237b70f08d..c5c4b7c449ce7 100644
--- a/doc/source/development/developer.rst
+++ b/doc/source/development/developer.rst
@@ -99,7 +99,7 @@ Column metadata
 * Boolean: ``'bool'``
 * Integers: ``'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64'``
 * Floats: ``'float16', 'float32', 'float64'``
-* Date and Time Types: ``'datetime', 'datetimetz'``, ``'timedelta'``
+* Date and Time Types: ``'datetime', 'datetimetz', 'timedelta'``
 * String: ``'unicode', 'bytes'``
 * Categorical: ``'categorical'``
 * Other Python objects: ``'object'``
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index c7286616672b9..e67829b8805eb 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -60,7 +60,7 @@ Now users can access your methods using the ``geo`` namespace:
 
 This can be a convenient way to extend pandas objects without subclassing them.
 If you write a custom accessor, make a pull request adding it to our
-:ref:`ecosystem` page.
+`ecosystem `_ page.
 
 We highly recommend validating the data in your accessor's ``__init__``.
 In our ``GeoAccessor``, we validate that the data contains the expected columns,
@@ -91,7 +91,7 @@ objects). Many methods like :func:`pandas.isna` will dispatch to the extension
 type's implementation.
 
 If you're building a library that implements the interface, please publicize it
-on :ref:`ecosystem.extensions`.
+on `the ecosystem page `_.
 
 The interface consists of two classes.
 
@@ -99,7 +99,7 @@ The interface consists of two classes.
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 A :class:`pandas.api.extensions.ExtensionDtype` is similar to a ``numpy.dtype`` object. It describes the
-data type. Implementors are responsible for a few unique items like the name.
+data type. Implementers are responsible for a few unique items like the name.
 
 One particularly important item is the ``type`` property. This should be the
 class that is the scalar type for your data. For example, if you were writing an
@@ -450,7 +450,7 @@ Below is an example to define two original properties, "internal_cache" as a tem
 Plotting backends
 -----------------
 
-Starting in 0.25 pandas can be extended with third-party plotting backends. The
+pandas can be extended with third-party plotting backends. The
 main idea is letting users select a plotting backend different than the provided
 one based on Matplotlib. For example:
 
@@ -488,3 +488,49 @@ registers the default "matplotlib" backend as follows.
 
 More information on how to implement a third-party plotting backend can be found at
 https://github.com/pandas-dev/pandas/blob/main/pandas/plotting/__init__.py#L1.
+
+.. _extending.pandas_priority:
+
+Arithmetic with 3rd party types
+-------------------------------
+
+In order to control how arithmetic works between a custom type and a pandas type,
+implement ``__pandas_priority__``. Similar to numpy's ``__array_priority__``
+semantics, arithmetic methods on :class:`DataFrame`, :class:`Series`, and :class:`Index`
+objects will delegate to ``other``, if it has an attribute ``__pandas_priority__`` with a higher value.
+
+By default, pandas objects try to operate with other objects, even if they are not types known to pandas:
+
+.. code-block:: python
+
+   >>> pd.Series([1, 2]) + [10, 20]
+   0    11
+   1    22
+   dtype: int64
+
+In the example above, if ``[10, 20]`` were a custom type that can be understood as a list, pandas objects would still operate with it in the same way.
+
+In some cases, it is useful to delegate the operation to the other type. For example, suppose I implement a
+custom list object, and I want the result of adding my custom list to a pandas :class:`Series` to be an instance of my list
+and not a :class:`Series`, as seen in the previous example. This is now possible by defining the ``__pandas_priority__`` attribute
+of my custom list and setting it to a higher value than the priority of the pandas objects I want to operate with.
+
+The ``__pandas_priority__`` of :class:`DataFrame`, :class:`Series`, and :class:`Index` are ``4000``, ``3000``, and ``2000`` respectively. The base ``ExtensionArray.__pandas_priority__`` is ``1000``.
+
+.. code-block:: python
+
+   class CustomList(list):
+       __pandas_priority__ = 5000
+
+       def __radd__(self, other):
+           # return `self` and not the addition for simplicity
+           return self
+
+   custom = CustomList()
+   series = pd.Series([1, 2, 3])
+
+   # Series refuses to add custom, since it's an unknown type with higher priority
+   assert series.__add__(custom) is NotImplemented
+
+   # This will cause the custom class `__radd__` to be used instead
+   assert series + custom is custom
diff --git a/doc/source/development/gitpod-imgs/gitpod-dashboard-stop.png b/doc/source/development/gitpod-imgs/gitpod-dashboard-stop.png
new file mode 100644
index 0000000000000..b64790a986646
Binary files /dev/null and b/doc/source/development/gitpod-imgs/gitpod-dashboard-stop.png differ
diff --git a/doc/source/development/gitpod-imgs/gitpod-edit-permissions-gh.png b/doc/source/development/gitpod-imgs/gitpod-edit-permissions-gh.png
new file mode 100644
index 0000000000000..ec21a9064c83d
Binary files /dev/null and b/doc/source/development/gitpod-imgs/gitpod-edit-permissions-gh.png differ
diff --git a/doc/source/development/gitpod-imgs/gitpod-edit-permissions-repo.png b/doc/source/development/gitpod-imgs/gitpod-edit-permissions-repo.png
new file mode 100644
index 0000000000000..8bfaff81cfb69
Binary files /dev/null and b/doc/source/development/gitpod-imgs/gitpod-edit-permissions-repo.png differ
diff --git a/doc/source/development/gitpod-imgs/gitpod-workspace.png b/doc/source/development/gitpod-imgs/gitpod-workspace.png
new file mode 100644
index 0000000000000..daf763e9adb05
Binary files /dev/null and b/doc/source/development/gitpod-imgs/gitpod-workspace.png differ
diff --git a/doc/source/development/gitpod-imgs/pandas-github.png b/doc/source/development/gitpod-imgs/pandas-github.png
new file mode 100644
index 0000000000000..010b0fc5ea33d
Binary files /dev/null and b/doc/source/development/gitpod-imgs/pandas-github.png differ
diff --git a/doc/source/development/gitpod-imgs/pandas-gitpod-branches.png b/doc/source/development/gitpod-imgs/pandas-gitpod-branches.png
new file mode 100644
index 0000000000000..f95c66056ca37
Binary files /dev/null and b/doc/source/development/gitpod-imgs/pandas-gitpod-branches.png differ
diff --git a/doc/source/development/gitpod-imgs/rst-rendering.png b/doc/source/development/gitpod-imgs/rst-rendering.png
new file mode 100644
index 0000000000000..b613c621c398b
Binary files /dev/null and b/doc/source/development/gitpod-imgs/rst-rendering.png differ
diff --git a/doc/source/development/gitpod-imgs/vscode-rst.png b/doc/source/development/gitpod-imgs/vscode-rst.png
new file mode 100644
index 0000000000000..5b574c115a2b7
Binary files /dev/null and b/doc/source/development/gitpod-imgs/vscode-rst.png differ
diff --git a/doc/source/development/gitpod-imgs/vscode-statusbar.png b/doc/source/development/gitpod-imgs/vscode-statusbar.png
new file mode 100644
index 0000000000000..dad25369fedfd
Binary files /dev/null and b/doc/source/development/gitpod-imgs/vscode-statusbar.png differ
diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst
index 1dbe162cd1a6b..aa7e7845bfa7a 100644
--- a/doc/source/development/index.rst
+++ b/doc/source/development/index.rst
@@ -18,9 +18,9 @@ Development
    contributing_codebase
    maintaining
    internals
+   copy_on_write
    debugging_extensions
    extending
    developer
    policies
-   roadmap
-   meeting
+   community
diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst
index cec385dd087db..e3468746ce177 100644
--- a/doc/source/development/internals.rst
+++ b/doc/source/development/internals.rst
@@ -15,56 +15,31 @@ Indexing
 
 In pandas there are a few objects implemented which can serve as valid
 containers for the axis labels:
 
-* ``Index``: the generic "ordered set" object, an ndarray of object dtype
+* :class:`Index`: the generic "ordered set" object, an ndarray of object dtype
   assuming nothing about its contents. The labels must be hashable (and
   likely immutable) and unique. Populates a dict of label to location in
   Cython to do ``O(1)`` lookups.
-* ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer
-  data, such as time stamps
-* ``Float64Index``: a version of ``Index`` highly optimized for 64-bit float data
-* ``MultiIndex``: the standard hierarchical index object
-* ``DatetimeIndex``: An Index object with ``Timestamp`` boxed elements (impl are the int64 values)
-* ``TimedeltaIndex``: An Index object with ``Timedelta`` boxed elements (impl are the in64 values)
-* ``PeriodIndex``: An Index object with Period elements
+* :class:`MultiIndex`: the standard hierarchical index object
+* :class:`DatetimeIndex`: An Index object with :class:`Timestamp` boxed elements (impl are the int64 values)
+* :class:`TimedeltaIndex`: An Index object with :class:`Timedelta` boxed elements (impl are the int64 values)
+* :class:`PeriodIndex`: An Index object with Period elements
 
 There are functions that make the creation of a regular index easy:
 
-* ``date_range``: fixed frequency date range generated from a time rule or
+* :func:`date_range`: fixed frequency date range generated from a time rule or
   DateOffset. An ndarray of Python datetime objects
-* ``period_range``: fixed frequency date range generated from a time rule or
-  DateOffset. An ndarray of ``Period`` objects, representing timespans
-
-The motivation for having an ``Index`` class in the first place was to enable
-different implementations of indexing. This means that it's possible for you,
-the user, to implement a custom ``Index`` subclass that may be better suited to
-a particular application than the ones provided in pandas.
-
-From an internal implementation point of view, the relevant methods that an
-``Index`` must define are one or more of the following (depending on how
-incompatible the new object internals are with the ``Index`` functions):
-
-* ``get_loc``: returns an "indexer" (an integer, or in some cases a
-  slice object) for a label
-* ``slice_locs``: returns the "range" to slice between two labels
-* ``get_indexer``: Computes the indexing vector for reindexing / data
-  alignment purposes. See the source / docstrings for more on this
-* ``get_indexer_non_unique``: Computes the indexing vector for reindexing / data
-  alignment purposes when the index is non-unique. See the source / docstrings
-  for more on this
-* ``reindex``: Does any pre-conversion of the input index then calls
-  ``get_indexer``
-* ``union``, ``intersection``: computes the union or intersection of two
-  Index objects
-* ``insert``: Inserts a new label into an Index, yielding a new object
-* ``delete``: Delete a label, yielding a new object
-* ``drop``: Deletes a set of labels
-* ``take``: Analogous to ndarray.take
+* :func:`period_range`: fixed frequency date range generated from a time rule or
+  DateOffset. An ndarray of :class:`Period` objects, representing timespans
+
+.. warning::
+
+   Custom :class:`Index` subclasses are not supported; custom behavior should be implemented using the :class:`ExtensionArray` interface instead.
 
 MultiIndex
 ~~~~~~~~~~
 
-Internally, the ``MultiIndex`` consists of a few things: the **levels**, the
-integer **codes** (until version 0.24 named *labels*), and the level **names**:
+Internally, the :class:`MultiIndex` consists of a few things: the **levels**, the
+integer **codes**, and the level **names**:
 
 .. ipython:: python
 
@@ -80,13 +55,13 @@ You can probably guess that the codes determine which unique element is
 identified with that location at each layer of the index. It's important to
 note that sortedness is determined **solely** from the integer codes and does
 not check (or care) whether the levels themselves are sorted. Fortunately, the
-constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
-if you compute the levels and codes yourself, please be careful.
+constructors :meth:`~MultiIndex.from_tuples` and :meth:`~MultiIndex.from_arrays` ensure
+that this is true, but if you compute the levels and codes yourself, please be careful.
 
 Values
 ~~~~~~
 
-pandas extends NumPy's type system with custom types, like ``Categorical`` or
+pandas extends NumPy's type system with custom types, like :class:`Categorical` or
 datetimes with a timezone, so we have multiple notions of "values". For 1-D
 containers (``Index`` classes and ``Series``) we have the following
 convention:
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index 1bff2eccd3d27..c572559dcc3e0 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -44,6 +44,9 @@ reading.
 Issue triage
 ------------
 
+Triage is an important first step in addressing issues reported by the community, and even
+partial contributions are a great way to help maintain pandas. Only remove the "Needs Triage"
+tag once all of the steps below have been completed.
 
 Here's a typical workflow for triaging a newly opened issue.
 
@@ -67,9 +70,9 @@ Here's a typical workflow for triaging a newly opened issue.
 3. **Is this a duplicate issue?**
 
    We have many open issues. If a new issue is clearly a duplicate, label the
-   new issue as "Duplicate" assign the milestone "No Action", and close the issue
-   with a link to the original issue. Make sure to still thank the reporter, and
-   encourage them to chime in on the original issue, and perhaps try to fix it.
+   new issue as "Duplicate" and close the issue with a link to the original issue.
+   Make sure to still thank the reporter, and encourage them to chime in on the
+   original issue, and perhaps try to fix it.
   If the new issue provides relevant information, such as a better or slightly
   different example, add it to the original issue as a comment or an edit to
   the original post.
 
@@ -81,7 +84,7 @@ Here's a typical workflow for triaging a newly opened issue.
   example. See https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports
   for a good explanation. If the example is not reproducible, or if it's
   *clearly* not minimal, feel free to ask the reporter if they can provide
-   and example or simplify the provided one. Do acknowledge that writing
+   an example or simplify the provided one. Do acknowledge that writing
   minimal reproducible examples is hard work. If the reporter is struggling,
   you can try to write one yourself and we'll edit the original post to include it.
 
@@ -90,6 +93,13 @@ Here's a typical workflow for triaging a newly opened issue.
   If a reproducible example is provided, but you see a simplification,
   edit the original post with your simpler reproducible example.
 
+   If this is a regression report, post the result of a ``git bisect`` run.
+   More info on this can be found in the :ref:`maintaining.regressions` section.
+
+   Ensure the issue exists on the main branch and that it has the "Needs Triage" tag
+   until all steps have been completed. Add a comment to the issue once you have
+   verified it exists on the main branch, so others know it has been confirmed.
+
 5. **Is this a clearly defined feature request?**
 
   Generally, pandas prefers to discuss and design new features in issues, before
@@ -97,8 +107,9 @@ Here's a typical workflow for triaging a newly opened issue.
   for the new feature. Having them write a full docstring is a good way to
   pin down specifics.
 
-   We'll need a discussion from several pandas maintainers before deciding whether
-   the proposal is in scope for pandas.
+   Tag new feature requests with "Needs Discussion", as we'll need a discussion
+   from several pandas maintainers before deciding whether the proposal is in
+   scope for pandas.
 
 6. **Is this a usage question?**
 
@@ -117,9 +128,53 @@ Here's a typical workflow for triaging a newly opened issue.
   If the issue is clearly defined and the fix seems relatively straightforward,
   label the issue as "Good first issue".
 
-   Typically, new issues will be assigned the "Contributions welcome" milestone,
-   unless it's know that this issue should be addressed in a specific release (say
-   because it's a large regression).
+   If the issue is a regression report, add the "Regression" label and the next patch
+   release milestone.
+
+   Once you have completed the above, make sure to remove the "Needs Triage" label.
+
+.. _maintaining.regressions:
+
+Investigating regressions
+-------------------------
+
+Regressions are bugs that unintentionally break previously working code. The common way
+to investigate regressions is by using
+`git bisect `_,
+which finds the first commit that introduced the bug.
+
+For example: a user reports that ``pd.Series([1, 1]).sum()`` returns ``3``
+in pandas version ``1.5.0`` while in version ``1.4.0`` it returned ``2``. To begin,
+create a file ``t.py`` in your pandas directory, which contains
+
+.. code-block:: python
+
+   import pandas as pd
+   assert pd.Series([1, 1]).sum() == 2
+
+and then run::
+
+   git bisect start
+   git bisect good v1.4.0
+   git bisect bad v1.5.0
+   git bisect run bash -c "python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true; python t.py"
+
+This finds the first commit that changed the behavior. The C extensions have to be
+rebuilt at every step, so the search can take a while.
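+
+When the desired behavior in a good commit is for the operation to *raise*, a plain
+``assert`` does not work; as the note below says, you wrap the code in a ``try/except``
+so that the exit status of ``t.py`` still distinguishes good from bad commits. A
+minimal sketch of such a ``t.py`` (the failing operation here is only a hypothetical
+stand-in for your actual regression)::
+
+   import sys
+
+   import pandas as pd
+
+   try:
+       # hypothetical regression: this used to raise TypeError and stopped doing so
+       pd.Series([1, 2]).astype("int64[unknown]")
+   except TypeError:
+       sys.exit(0)  # good commit: the error is still raised
+   sys.exit(1)  # bad commit: the exception disappeared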
+
+Exit bisect and rebuild the current version::
+
+   git bisect reset
+   python -m pip install -ve . --no-build-isolation -Ceditable-verbose=true
+
+Report your findings under the corresponding issue and ping the commit author to get
+their input.
+
+.. note::
+   In the ``bisect run`` command above, commits are considered good if ``t.py`` exits
+   with ``0`` and bad otherwise. When raising an exception is the desired behavior,
+   wrap the code in an appropriate ``try/except`` statement (as sketched above). See
+   :issue:`35685` for more examples.
 
 .. _maintaining.closing:
 
@@ -131,6 +186,8 @@ conversation is over. It's typically best to give the reporter some time to
 respond or self-close their issue if it's determined that the behavior is not a bug,
 or the feature is out of scope. Sometimes reporters just go away though, and
 we'll close the issue after the conversation has died.
+If you think an issue should be closed but are not completely sure, please apply
+the "closing candidate" label and wait for other maintainers to take a look.
 
 .. _maintaining.reviewing:
 
@@ -167,15 +224,15 @@ pandas supports point releases (e.g. ``1.4.3``) that aim to:
 
   * e.g. If a feature worked in ``1.2`` and stopped working since ``1.3``, a fix can be applied in ``1.4.3``.
 
-Since pandas minor releases are based on Github branches (e.g. point release of ``1.4`` are based off the ``1.4.x`` branch),
+Since pandas minor releases are based on GitHub branches (e.g. point releases of ``1.4`` are based off the ``1.4.x`` branch),
 "backporting" means merging a pull request fix to the ``main`` branch and correct minor branch associated with the next point release.
 
-By default, if a pull request is assigned to the next point release milestone within the Github interface,
+By default, if a pull request is assigned to the next point release milestone within the GitHub interface,
 the backporting process should happen automatically by the ``@meeseeksdev`` bot once the pull request is merged.
 A new pull request will be made backporting the pull request to the correct version branch.
 Sometimes due to merge conflicts, a manual pull request will need to be made addressing the code conflict.
 
-If the bot does not automatically start the backporting process, you can also write a Github comment in the merged pull request
+If the bot does not automatically start the backporting process, you can also write a GitHub comment in the merged pull request
 to trigger the backport::
 
     @meeseeksdev backport version-branch
 
@@ -209,14 +266,16 @@ Cleaning up old pull requests
 
 Occasionally, contributors are unable to finish off a pull request.
 If some time has passed (two weeks, say) since the last review requesting changes,
 gently ask if they're still interested in working on this. If another two weeks or
-so passes with no response, thank them for their work and close the pull request.
-Comment on the original issue that "There's a stalled PR at #1234 that may be
-helpful.", and perhaps label the issue as "Good first issue" if the PR was relatively
-close to being accepted.
+so passes with no response, thank them for their work and then either:
 
-Additionally, core-team members can push to contributors branches. This can be
-helpful for pushing an important PR across the line, or for fixing a small
-merge conflict.
+- close the pull request;
+- push to the contributor's branch to get their work over the finish line (if
+  you're part of ``pandas-core``). This can be helpful for pushing an important PR
+  across the line, or for fixing a small merge conflict.
+
+If closing the pull request, then please comment on the original issue that
+"There's a stalled PR at #1234 that may be helpful.", and perhaps label the issue
+as "Good first issue" if the PR was relatively close to being accepted.
 
 Becoming a pandas maintainer
 ----------------------------
@@ -228,17 +287,18 @@ being helpful on the issue tracker.
 
 The required steps for adding a maintainer are:
 
 1. Contact the contributor and ask their interest to join.
-2. Add the contributor to the appropriate `Github Team `_ if accepted the invitation.
+2. Add the contributor to the appropriate `GitHub Team `_ if they accepted the invitation.
 
   * ``pandas-core`` is for core team members
   * ``pandas-triage`` is for pandas triage members
 
+If adding to ``pandas-core``, there are two additional steps:
+
 3. Add the contributor to the pandas Google group.
-4. Create a pull request to add the contributor's Github handle to ``pandas-dev/pandas/web/pandas/config.yml``.
-5. Create a pull request to add the contributor's name/Github handle to the `governance document `_.
+4. Create a pull request to add the contributor's GitHub handle to ``pandas-dev/pandas/web/pandas/config.yml``.
 
 The current list of core-team members is at
-https://github.com/pandas-dev/pandas-governance/blob/master/people.md
+https://github.com/pandas-dev/pandas/blob/main/web/pandas/config.yml
 
 .. _maintaining.merging:
 
 Merging pull requests
 ---------------------
 
 Only core team members can merge pull requests. We have a few guidelines.
 
-1. You should typically not self-merge your own pull requests. Exceptions include
-   things like small changes to fix CI (e.g. pinning a package version).
+1. You should typically not self-merge your own pull requests without approval.
+   Exceptions include things like small changes to fix CI
+   (e.g. pinning a package version). Self-merging with approval from other
+   core team members is fine if the change is something you're very confident
+   about.
 2. You should not merge pull requests that have an active discussion, or pull
   requests that has any ``-1`` votes from a core maintainer. pandas operates
  by consensus.
@@ -269,40 +332,166 @@ a milestone before tagging, you can request the bot to backport it with:
 
     @Meeseeksdev backport
 
-.. _maintaining.asv-machine:
+.. _maintaining.release:
 
-Benchmark machine
------------------
+Release process
+---------------
 
-The team currently owns dedicated hardware for hosting a website for pandas' ASV performance benchmark. The results
-are published to http://pandas.pydata.org/speed/pandas/
+The release process makes a snapshot of pandas (a git commit) available to users with
+a particular version number. After the release, the new pandas version will be available
+in the following places:
 
-Configuration
+- Git repo with a `new tag `_
+- Source distribution in a `GitHub release `_
+- Pip packages in the `PyPI `_
+- Conda packages in `conda-forge `_
+
+The process for releasing a new version of pandas is detailed in the next section.
+
+The instructions contain ``<version>`` which needs to be replaced with the version
+to be released (e.g. ``1.5.2``), and ``<branch>``, the branch to be released, which
+depends on whether the version being released is the release candidate of a new version,
+or any other version. Release candidates are released from ``main``, while other
+versions are released from their branch (e.g. ``1.5.x``).
+
+
+Prerequisites
 `````````````
 
-The machine can be configured with the `Ansible `_ playbook in https://github.com/tomaugspurger/asv-runner.
+In order to be able to release a new pandas version, the following permissions are needed:
 
-Publishing
-``````````
+- Merge rights to the `pandas `_ and
+  `pandas-feedstock `_ repositories.
+  For the latter, open a PR adding your GitHub username to the conda-forge recipe.
+- Permissions to push to ``main`` in the pandas repository, to push the new tags.
+- `Write permissions to PyPI `_.
+- Access to our website / documentation server. Share your public key with the
+  infrastructure committee to be added to the ``authorized_keys`` file of the main
+  server user.
+- Access to the social media accounts, to publish the announcements.
 
-The results are published to another Github repository, https://github.com/tomaugspurger/asv-collection.
-Finally, we have a cron job on our docs server to pull from https://github.com/tomaugspurger/asv-collection, to serve them from ``/speed``.
-Ask Tom or Joris for access to the webserver.
+Pre-release
+```````````
 
-Debugging
-`````````
+1. Agree with the core team on the following topics:
 
+   - Release date (major/minor releases usually happen every 6 months, and patch releases
+     monthly until x.x.5, just before the next major/minor)
+   - Blockers (issues and PRs that must be part of the release)
+   - Next version after the one being released
 
-The benchmarks are scheduled by Airflow. It has a dashboard for viewing and debugging the results. You'll need to setup an SSH tunnel to view them
+2. Update and clean release notes for the version to be released, including:
 
+   - Set the final date of the release
+   - Remove any unused bullet point
+   - Make sure there are no formatting issues, typos, etc.
 
-    ssh -L 8080:localhost:8080 pandas@panda.likescandy.com
+3. Make sure the CI is green for the last commit of the branch being released.
 
-.. _maintaining.release:
+4. If not a release candidate, make sure all backporting pull requests to the branch
+   being released are merged.
 
-Release process
----------------
+5. Create a new issue and milestone for the version after the one being released.
+   If the release was a release candidate, we would usually want to create issues and
+   milestones for both the next major/minor and the next patch release. In the
+   milestone of a patch release, we add the description ``on-merge: backport to <branch>``,
+   so tagged PRs are automatically backported to the release branch by our bot.
+
+6. Change the milestone of all issues and PRs in the milestone being released to the
+   next milestone.
+
+Release
+```````
+
+1. Create an empty commit and a tag on the last commit of the branch to be released::
+
+      git checkout <branch>
+      git pull --ff-only upstream <branch>
+      git clean -xdf
+      git commit --allow-empty --author="pandas Development Team <pandas-dev@python.org>" -m "RLS: <version>"
+      git tag -a v<version> -m "Version <version>"  # NOTE that the tag is v1.5.2 with "v" not 1.5.2
+      git push upstream --follow-tags
+
+The docs for the new version will be built and published automatically with the docs job in the CI,
+which will be triggered when the tag is pushed.
+
+2. Only if the release is a release candidate, we want to create a new branch for it, immediately
+   after creating the tag. For example, if we are releasing pandas 1.4.0rc0, we would like to
+   create the branch 1.4.x to backport commits to the 1.4 versions, as well as create a tag to
+   mark the start of the development of 1.5.0 (assuming it is the next version)::
+
+      git checkout -b 1.4.x
+      git push upstream 1.4.x
+      git checkout main
+      git commit --allow-empty -m "Start 1.5.0"
+      git tag -a v1.5.0.dev0 -m "DEV: Start 1.5.0"
+      git push upstream main --follow-tags
+3. Download the source distribution and wheels from the `wheel staging area `_.
+   Be careful to make sure that no wheels are missing (e.g. due to failed builds).
+
+   Running ``scripts/download_wheels.sh`` with the version that you want to download wheels/the sdist for should do the trick.
+   This script will make a ``dist`` folder inside your clone of pandas and put the downloaded wheels and sdist there::
+
+      scripts/download_wheels.sh <version>
+
+4. Create a `new GitHub release `_:
+
+   - Tag: ``v<version>``
+   - Title: ``pandas <version>``
+   - Description: Copy the description of the last release of the same kind (release candidate, major/minor or patch release)
+   - Files: ``pandas-<version>.tar.gz`` source distribution just generated
+   - Set as a pre-release: Only check for a release candidate
+   - Set as the latest release: Leave checked, unless releasing a patch release for an older version
+     (e.g. releasing 1.4.5 after 1.5 has been released)
+
+5. Upload wheels to PyPI::
+
+      twine upload pandas/dist/pandas-*.{whl,tar.gz} --skip-existing
+
+6. The GitHub release will, after some hours, trigger an
+   `automated conda-forge PR `_.
+   (If you don't want to wait, you can open an issue titled ``@conda-forge-admin, please update version`` to trigger the bot.)
+   Merge it once the CI is green, and it will generate the conda-forge packages.
+
+   In case a manual PR needs to be done, the version, sha256 and build fields are the
+   ones that usually need to be changed. If anything else in the recipe has changed since
+   the last release, those changes should be available in ``ci/meta.yaml``.
+
+Post-Release
+````````````
+
+1. Update symlinks to stable documentation by logging in to our web server, and
+   editing ``/var/www/html/pandas-docs/stable`` to point to ``version/<minor>``
+   for major and minor releases, or ``version/<minor>`` to ``version/<patch>`` for
+   patch releases. The exact instructions are (replace the example version numbers by
+   the appropriate ones for the version you are releasing):
+
+   - Log in to the server and use the correct user.
+   - ``cd /var/www/html/pandas-docs/``
+   - ``ln -sfn version/2.1 stable`` (for a major or minor release)
+   - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release)
+
+2. If releasing a major or minor release, open a PR in our source code to update
+   ``web/pandas/versions.json``, to have the desired versions in the documentation
+   dropdown menu.
+
+3. Close the milestone and the issue for the released version.
+
+4. Create a new issue for the next release, with the estimated date of release.
+
+5. Open a PR with the placeholder for the release notes of the next version. See
+   for example `the PR for 1.5.3 `_.
+   Note that the template to use depends on whether it is a major, minor or patch release.
+
+6. Announce the new release in the official channels (use previous announcements
+   for reference):
+
+   - The pandas-dev and pydata mailing lists
+   - X, Mastodon, Telegram and LinkedIn
+
+7. Update these release instructions to fix anything incorrect and to reflect any
+   changes since the last release.
 
-The process for releasing a new version of pandas can be found at https://github.com/pandas-dev/pandas-release
 
-.. _governance documents: https://github.com/pandas-dev/pandas-governance
+.. _governance documents: https://github.com/pandas-dev/pandas/blob/main/web/pandas/about/governance.md
..
_list of permissions: https://docs.github.com/en/organizations/managing-access-to-your-organizations-repositories/repository-roles-for-an-organization diff --git a/doc/source/development/meeting.rst b/doc/source/development/meeting.rst deleted file mode 100644 index 35826af5912c2..0000000000000 --- a/doc/source/development/meeting.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. _meeting: - -================== -Developer meetings -================== - -We hold regular developer meetings on the second Wednesday -of each month at 18:00 UTC. These meetings and their minutes are open to -the public. All are welcome to join. - -Minutes -------- - -The minutes of past meetings are available in `this Google Document `__. - -Calendar --------- - -This calendar shows all the developer meetings. - -.. raw:: html - - - -You can subscribe to this calendar with the following links: - -* `iCal `__ -* `Google calendar `__ - -Additionally, we'll sometimes have one-off meetings on specific topics. -These will be published on the same calendar. diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index d75262c08dfd6..a3665c5bb4d1f 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -9,8 +9,6 @@ Policies Version policy ~~~~~~~~~~~~~~ -.. versionchanged:: 1.0.0 - pandas uses a loose variant of semantic versioning (`SemVer`_) to govern deprecations, API compatibility, and version numbering. @@ -48,10 +46,16 @@ deprecation removed in the next major release (2.0.0). These policies do not apply to features marked as **experimental** in the documentation. pandas may change the behavior of experimental features at any time. +.. _policies.python_support: + Python support ~~~~~~~~~~~~~~ -pandas mirrors the `NumPy guidelines for Python support `__. +pandas mirrors the `SPEC 0 guideline for Python support `__. + +Security policy +~~~~~~~~~~~~~~~ +To report a security vulnerability to pandas, please go to https://github.com/pandas-dev/pandas/security/policy and see the instructions there. .. _SemVer: https://semver.org diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst deleted file mode 100644 index f935c27d9917d..0000000000000 --- a/doc/source/development/roadmap.rst +++ /dev/null @@ -1,250 +0,0 @@ -.. _roadmap: - -======= -Roadmap -======= - -This page provides an overview of the major themes in pandas' development. Each of -these items requires a relatively large amount of effort to implement. These may -be achieved more quickly with dedicated funding or interest from contributors. - -An item being on the roadmap does not mean that it will *necessarily* happen, even -with unlimited funding. During the implementation period we may discover issues -preventing the adoption of the feature. - -Additionally, an item *not* being on the roadmap does not exclude it from inclusion -in pandas. The roadmap is intended for larger, fundamental changes to the project that -are likely to take months or years of developer time. Smaller-scoped items will continue -to be tracked on our `issue tracker `__. - -See :ref:`roadmap.evolution` for proposing changes to this document. - -Extensibility -------------- - -pandas :ref:`extending.extension-types` allow for extending NumPy types with custom -data types and array storage. pandas uses extension types internally, and provides -an interface for 3rd-party libraries to define their own custom data types. - -Many parts of pandas still unintentionally convert data to a NumPy array. 
-These problems are especially pronounced for nested data. - -We'd like to improve the handling of extension arrays throughout the library, -making their behavior more consistent with the handling of NumPy arrays. We'll do this -by cleaning up pandas' internals and adding new methods to the extension array interface. - -String data type ----------------- - -Currently, pandas stores text data in an ``object`` -dtype NumPy array. -The current implementation has two primary drawbacks: First, ``object`` -dtype -is not specific to strings: any Python object can be stored in an ``object`` -dtype -array, not just strings. Second: this is not efficient. The NumPy memory model -isn't especially well-suited to variable width text data. - -To solve the first issue, we propose a new extension type for string data. This -will initially be opt-in, with users explicitly requesting ``dtype="string"``. -The array backing this string dtype may initially be the current implementation: -an ``object`` -dtype NumPy array of Python strings. - -To solve the second issue (performance), we'll explore alternative in-memory -array libraries (for example, Apache Arrow). As part of the work, we may -need to implement certain operations expected by pandas users (for example -the algorithm used in, ``Series.str.upper``). That work may be done outside of -pandas. - -Consistent missing value handling ---------------------------------- - -Currently, pandas handles missing data differently for different data types. We -use different types to indicate that a value is missing (``np.nan`` for -floating-point data, ``np.nan`` or ``None`` for object-dtype data -- typically -strings or booleans -- with missing values, and ``pd.NaT`` for datetimelike -data). Integer data cannot store missing data or are cast to float. In addition, -pandas 1.0 introduced a new missing value sentinel, ``pd.NA``, which is being -used for the experimental nullable integer, boolean, and string data types. - -These different missing values have different behaviors in user-facing -operations. Specifically, we introduced different semantics for the nullable -data types for certain operations (e.g. propagating in comparison operations -instead of comparing as False). - -Long term, we want to introduce consistent missing data handling for all data -types. This includes consistent behavior in all operations (indexing, arithmetic -operations, comparisons, etc.). There has been discussion of eventually making -the new semantics the default. - -This has been discussed at :issue:`28095` (and -linked issues), and described in more detail in this -`design doc `__. - -Apache Arrow interoperability ------------------------------ - -`Apache Arrow `__ is a cross-language development -platform for in-memory data. The Arrow logical types are closely aligned with -typical pandas use cases. - -We'd like to provide better-integrated support for Arrow memory and data types -within pandas. This will let us take advantage of its I/O capabilities and -provide for better interoperability with other languages and libraries -using Arrow. - -Block manager rewrite ---------------------- - -We'd like to replace pandas current internal data structures (a collection of -1 or 2-D arrays) with a simpler collection of 1-D arrays. - -pandas internal data model is quite complex. A DataFrame is made up of -one or more 2-dimensional "blocks", with one or more blocks per dtype. This -collection of 2-D arrays is managed by the BlockManager. 
- -The primary benefit of the BlockManager is improved performance on certain -operations (construction from a 2D array, binary operations, reductions across the columns), -especially for wide DataFrames. However, the BlockManager substantially increases the -complexity and maintenance burden of pandas. - -By replacing the BlockManager we hope to achieve - -* Substantially simpler code -* Easier extensibility with new logical types -* Better user control over memory use and layout -* Improved micro-performance -* Option to provide a C / Cython API to pandas' internals - -See `these design documents `__ -for more. - -Decoupling of indexing and internals ------------------------------------- - -The code for getting and setting values in pandas' data structures needs refactoring. -In particular, we must clearly separate code that converts keys (e.g., the argument -to ``DataFrame.loc``) to positions from code that uses these positions to get -or set values. This is related to the proposed BlockManager rewrite. Currently, the -BlockManager sometimes uses label-based, rather than position-based, indexing. -We propose that it should only work with positional indexing, and the translation of keys -to positions should be entirely done at a higher level. - -Indexing is a complicated API with many subtleties. This refactor will require care -and attention. The following principles should inspire refactoring of indexing code and -should result on cleaner, simpler, and more performant code. - -1. **Label indexing must never involve looking in an axis twice for the same label(s).** -This implies that any validation step must either: - - * limit validation to general features (e.g. dtype/structure of the key/index), or - * reuse the result for the actual indexing. - -2. **Indexers must never rely on an explicit call to other indexers.** -For instance, it is OK to have some internal method of ``.loc`` call some -internal method of ``__getitem__`` (or of their common base class), -but never in the code flow of ``.loc`` should ``the_obj[something]`` appear. - -3. **Execution of positional indexing must never involve labels** (as currently, sadly, happens). -That is, the code flow of a getter call (or a setter call in which the right hand side is non-indexed) -to ``.iloc`` should never involve the axes of the object in any way. - -4. **Indexing must never involve accessing/modifying values** (i.e., act on ``._data`` or ``.values``) **more than once.** -The following steps must hence be clearly decoupled: - - * find positions we need to access/modify on each axis - * (if we are accessing) derive the type of object we need to return (dimensionality) - * actually access/modify the values - * (if we are accessing) construct the return object - -5. As a corollary to the decoupling between 4.i and 4.iii, **any code which deals on how data is stored** -(including any combination of handling multiple dtypes, and sparse storage, categoricals, third-party types) -**must be independent from code that deals with identifying affected rows/columns**, -and take place only once step 4.i is completed. - - * In particular, such code should most probably not live in ``pandas/core/indexing.py`` - * ... and must not depend in any way on the type(s) of axes (e.g. no ``MultiIndex`` special cases) - -6. 
-6. As a corollary to point 1.i, **``Index`` (sub)classes must provide separate
-methods for any desired validity check of label(s) which does not involve
-actual lookup**, on the one side, and for any required
-conversion/adaptation/lookup of label(s), on the other.
-
-7. **Use of trial and error should be limited**, and in any case restricted to
-catch only exceptions which are actually expected (typically ``KeyError``).
-
-  * In particular, code should never (intentionally) raise new exceptions in
-    the ``except`` portion of a ``try... except``
-
-8. **Any code portion which is not specific to setters and getters must be
-shared**, and when small differences in behavior are expected (e.g. getting
-with ``.loc`` raises for missing labels, setting still doesn't), they can be
-managed with a specific parameter.
-
-Numba-accelerated operations
-----------------------------
-
-`Numba `__ is a JIT compiler for Python code. We'd like to
-provide ways for users to apply their own Numba-jitted functions where pandas
-accepts user-defined functions (for example, :meth:`Series.apply`,
-:meth:`DataFrame.apply`, :meth:`DataFrame.applymap`, and in groupby and
-window contexts). This will improve the performance of user-defined functions
-in these operations by staying within compiled code.
-
-Performance monitoring
-----------------------
-
-pandas uses `airspeed velocity `__ to
-monitor for performance regressions. ASV itself is a fabulous tool, but
-requires some additional work to be integrated into an open source project's
-workflow.
-
-The `asv-runner `__ organization, currently made up of
-pandas maintainers, provides tools built on top of ASV. We have a physical
-machine for running a number of projects' benchmarks, and tools for managing
-the benchmark runs and reporting on results.
-
-We'd like to fund improvements and maintenance of these tools to
-
-* Be more stable. Currently, they're maintained on the nights and weekends
-  when a maintainer has free time.
-* Tune the system for benchmarks to improve stability, following
-  https://pyperf.readthedocs.io/en/latest/system.html
-* Build a GitHub bot to request ASV runs *before* a PR is merged. Currently,
-  the benchmarks are only run nightly.
-
-.. _roadmap.evolution:
-
-Roadmap evolution
------------------
-
-pandas continues to evolve. The direction is primarily determined by community
-interest. Everyone is welcome to review existing items on the roadmap and
-to propose a new item.
-
-Each item on the roadmap should be a short summary of a larger design
-proposal. The proposal should include
-
-1. Short summary of the changes, which would be appropriate for inclusion in
-   the roadmap if accepted.
-2. Motivation for the changes.
-3. An explanation of why the change is in scope for pandas.
-4. Detailed design: preferably with example usage (even if not implemented
-   yet) and API documentation.
-5. API change: any API changes that may result from the proposal.
-
-That proposal may then be submitted as a GitHub issue, where the pandas
-maintainers can review and comment on the design. The
-`pandas mailing list `__ should be notified of the proposal.
-
-When there's agreement that an implementation would be welcome, the roadmap
-should be updated to include the summary and a link to the discussion issue.
-
-Completed items
----------------
-
-This section records now-completed items from the pandas roadmap.
-
-Documentation improvements
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-We improved the pandas documentation
-
-* The pandas community worked with others to build the `pydata-sphinx-theme`_,
-  which is now used for https://pandas.pydata.org/docs/ (:issue:`15556`).
-* :ref:`getting_started` contains a number of resources intended for new
-  pandas users coming from a variety of backgrounds (:issue:`26831`).
-
-.. _pydata-sphinx-theme: https://github.com/pydata/pydata-sphinx-theme
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
deleted file mode 100644
index 166162a4763bf..0000000000000
--- a/doc/source/ecosystem.rst
+++ /dev/null
@@ -1,602 +0,0 @@
-:orphan:
-
-.. _ecosystem:
-
-{{ header }}
-
-****************
-pandas ecosystem
-****************
-
-Increasingly, packages are being built on top of pandas to address specific
-needs in data preparation, analysis and visualization. This is encouraging
-because it means pandas is not only helping users to handle their data tasks
-but also that it provides a better starting point for developers to build
-powerful and more focused data tools. The creation of libraries that
-complement pandas' functionality also allows pandas development to remain
-focused around its original requirements.
-
-This is an inexhaustive list of projects that build on pandas in order to
-provide tools in the PyData space. For a list of projects that depend on
-pandas, see the
-`GitHub network dependents for pandas `_
-or `search PyPI for pandas `_.
-
-We'd like to make it easier for users to find these projects. If you know of
-other substantial projects that you feel should be on this list, please let
-us know.
-
-.. _ecosystem.data_cleaning_and_validation:
-
-Data cleaning and validation
-----------------------------
-
-`Pyjanitor `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Pyjanitor provides a clean API for cleaning data, using method chaining.
-
-`Pandera `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Pandera provides a flexible and expressive API for performing data validation
-on dataframes to make data processing pipelines more readable and robust.
-Dataframes contain information that pandera explicitly validates at runtime.
-This is useful in production-critical data pipelines or reproducible research
-settings.
-
-`pandas-path `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Since Python 3.4, `pathlib `_ has been included in the Python
-standard library. Path objects provide a simple and delightful way to
-interact with the file system. The pandas-path package enables the Path API
-for pandas through a custom accessor ``.path``. Getting just the filenames
-from a series of full file paths is as simple as ``my_files.path.name``.
-Other convenient operations like joining paths, replacing file extensions,
-and checking if files exist are also available.
-
-.. _ecosystem.stats:
-
-Statistics and machine learning
--------------------------------
-
-`pandas-tfrecords `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Easily save pandas DataFrames to the TensorFlow TFRecords format and read
-TFRecords back into pandas.
-
-`Statsmodels `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Statsmodels is the prominent Python "statistics and econometrics library" and
-it has a long-standing special relationship with pandas. Statsmodels provides
-powerful statistics, econometrics, analysis and modeling functionality that
-is out of pandas' scope.
-Statsmodels leverages pandas objects as the underlying data container for computation. - -`sklearn-pandas `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use pandas DataFrames in your `scikit-learn `__ -ML pipeline. - -`Featuretools `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. - -`Compose `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. - -`STUMPY `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -STUMPY is a powerful and scalable Python library for modern time series analysis. -At its core, STUMPY efficiently computes something called a -`matrix profile `__, -which can be used for a wide variety of time series data mining tasks. - -.. _ecosystem.visualization: - -Visualization -------------- - -`Pandas has its own Styler class for table visualization `_, and while -:ref:`pandas also has built-in support for data visualization through charts with matplotlib `, -there are a number of other pandas-compatible libraries. - -`Altair `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Altair is a declarative statistical visualization library for Python. -With Altair, you can spend more time understanding your data and its -meaning. Altair's API is simple, friendly and consistent and built on -top of the powerful Vega-Lite JSON specification. This elegant -simplicity produces beautiful and effective visualizations with a -minimal amount of code. Altair works with pandas DataFrames. - - -`Bokeh `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Bokeh is a Python interactive visualization library for large datasets that natively uses -the latest web technologies. Its goal is to provide elegant, concise construction of novel -graphics in the style of Protovis/D3, while delivering high-performance interactivity over -large data to thin clients. - -`Pandas-Bokeh `__ provides a high level API -for Bokeh that can be loaded as a native pandas plotting backend via - -.. code:: python - - pd.set_option("plotting.backend", "pandas_bokeh") - -It is very similar to the matplotlib plotting backend, but provides interactive -web-based charts and maps. - - -`Seaborn `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Seaborn is a Python visualization library based on -`matplotlib `__. It provides a high-level, dataset-oriented -interface for creating attractive statistical graphics. The plotting functions -in seaborn understand pandas objects and leverage pandas grouping operations -internally to support concise specification of complex visualizations. Seaborn -also goes beyond matplotlib and pandas with the option to perform statistical -estimation while plotting, aggregating across observations and visualizing the -fit of statistical models to emphasize patterns in a dataset. 
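-As a minimal sketch of this integration (assuming seaborn is installed; the
-data and column names are invented for illustration):
-
-.. code:: python
-
-    import pandas as pd
-    import seaborn as sns
-
-    df = pd.DataFrame(
-        {
-            "total": [10.3, 21.0, 23.7, 24.6],
-            "tip": [1.2, 3.0, 3.3, 3.6],
-            "day": ["Thu", "Thu", "Fri", "Fri"],
-        }
-    )
-    # seaborn accepts the DataFrame directly and colors points by a column
-    sns.scatterplot(data=df, x="total", y="tip", hue="day")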
-
-`plotnine `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Hadley Wickham's `ggplot2 `__ is a foundational exploratory
-visualization package for the R language. Based on
-`"The Grammar of Graphics" `__, it provides a powerful,
-declarative and extremely general way to generate bespoke plots of any kind
-of data. Various implementations in other languages are available. A good
-implementation for Python users is `has2k1/plotnine `__.
-
-`IPython vega `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`IPython Vega `__ leverages `Vega `__ to create plots
-within Jupyter Notebook.
-
-`Plotly `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`Plotly’s `__ `Python API `__ enables interactive figures
-and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered
-with WebGL and `D3.js `__. The library supports plotting directly
-from a pandas DataFrame and cloud-based collaboration. Users of
-`matplotlib, ggplot for Python, and Seaborn `__ can convert
-figures into interactive web-based plots. Plots can be drawn in
-`IPython Notebooks `__, edited with R or MATLAB, modified in a
-GUI, or embedded in apps and dashboards. Plotly is free for unlimited
-sharing, and has `offline `__, or `on-premise `__
-accounts for private use.
-
-`Lux `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`Lux `__ is a Python library that facilitates fast and easy
-experimentation with data by automating the visual data exploration process.
-To use Lux, simply add an extra import alongside pandas:
-
-.. code:: python
-
-    import lux
-    import pandas as pd
-
-    df = pd.read_csv("data.csv")
-    df  # discover interesting insights!
-
-By printing out a dataframe, Lux automatically
-`recommends a set of visualizations `__ that highlights
-interesting trends and patterns in the dataframe. Users can leverage any
-existing pandas commands without modifying their code, while being able to
-visualize their pandas data structures (e.g., DataFrame, Series, Index) at
-the same time. Lux also offers a
-`powerful, intuitive language `__ that allows users to create
-`Altair `__, `matplotlib `__, or `Vega-Lite `__
-visualizations without having to think at the level of code.
-
-`Qtpandas `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Spun off from the main pandas library, the `qtpandas `__
-library enables DataFrame visualization and manipulation in PyQt4 and PySide
-applications.
-
-`D-Tale `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-D-Tale is a lightweight web client for visualizing pandas data structures. It
-provides a rich spreadsheet-style grid which acts as a wrapper for a lot of
-pandas functionality (query, sort, describe, corr...) so users can quickly
-manipulate their data. There is also an interactive chart-builder using
-Plotly Dash allowing users to build nice portable visualizations. D-Tale can
-be invoked with the following command
-
-.. code:: python
-
-    import dtale
-
-    dtale.show(df)
-
-D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle
-& Google Colab. Here are some demos of the `grid `__.
-
-`hvplot `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-hvPlot is a high-level plotting API for the PyData ecosystem built on
-`HoloViews `__. It can be loaded as a native pandas plotting
-backend via
-
-.. code:: python
-
-    pd.set_option("plotting.backend", "hvplot")
-
-.. _ecosystem.ide:
-
-IDE
----
-
-`IPython `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-IPython is an interactive command shell and distributed computing
-environment.
-IPython tab completion works with pandas methods and also attributes like
-DataFrame columns.
-
-`Jupyter Notebook / Jupyter Lab `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Jupyter Notebook is a web application for creating Jupyter notebooks. A
-Jupyter notebook is a JSON document containing an ordered list of
-input/output cells which can contain code, text, mathematics, plots and rich
-media. Jupyter notebooks can be converted to a number of open standard output
-formats (HTML, HTML presentation slides, LaTeX, PDF, ReStructuredText,
-Markdown, Python) through 'Download As' in the web interface and
-``jupyter nbconvert`` in a shell.
-
-pandas DataFrames implement ``_repr_html_`` and ``_repr_latex_`` methods
-which are utilized by Jupyter Notebook for displaying (abbreviated) HTML or
-LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may or
-may not be compatible with non-HTML Jupyter output formats.)
-
-See :ref:`Options and Settings ` and :ref:`Available Options `
-for pandas ``display.`` settings.
-
-`Quantopian/qgrid `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-qgrid is "an interactive grid for sorting and filtering
-DataFrames in IPython Notebook" built with SlickGrid.
-
-`Spyder `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Spyder is a cross-platform PyQt-based IDE combining the editing, analysis,
-debugging and profiling functionality of a software development tool with the
-data exploration, interactive execution, deep inspection and rich
-visualization capabilities of a scientific environment like MATLAB or
-RStudio.
-
-Its `Variable Explorer `__ allows users to view, manipulate
-and edit pandas ``Index``, ``Series``, and ``DataFrame`` objects like a
-"spreadsheet", including copying and modifying values, sorting, displaying a
-"heatmap", converting data types and more. pandas objects can also be
-renamed, duplicated, have new columns added, be copied/pasted to/from the
-clipboard (as TSV), and saved/loaded to/from a file. Spyder can also import
-data from a variety of plain text and binary files or the clipboard into a
-new pandas DataFrame via a sophisticated import wizard.
-
-Most pandas classes, methods and data attributes can be autocompleted in
-Spyder's `Editor `__ and `IPython Console `__, and
-Spyder's `Help pane `__ can retrieve and render Numpydoc
-documentation on pandas objects in rich text with Sphinx both automatically
-and on-demand.
-
-.. _ecosystem.api:
-
-API
----
-
-`pandas-datareader `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-``pandas-datareader`` is a remote data access library for pandas
-(PyPI: ``pandas-datareader``). It is based on functionality that was located
-in ``pandas.io.data`` and ``pandas.io.wb`` but was split off in v0.19. See
-more in the `pandas-datareader docs `_.
-
-The following data feeds are available:
-
-  * Google Finance
-  * Tiingo
-  * Morningstar
-  * IEX
-  * Robinhood
-  * Enigma
-  * Quandl
-  * FRED
-  * Fama/French
-  * World Bank
-  * OECD
-  * Eurostat
-  * TSP Fund Data
-  * Nasdaq Trader Symbol Definitions
-  * Stooq Index Data
-  * MOEX Data
-
-`Quandl/Python `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Quandl API for Python wraps the Quandl REST API to return
-pandas DataFrames with timeseries indexes.
-
-`Pydatastream `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PyDatastream is a Python interface to the
-`Refinitiv Datastream (DWS) `__ REST API to return indexed
-pandas DataFrames with financial data.
-This package requires valid credentials for this API (not free).
-
-`pandaSDMX `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-pandaSDMX is a library to retrieve statistical data and metadata disseminated
-in `SDMX `_ 2.1, an ISO standard widely used by institutions
-such as statistics offices, central banks, and international organisations.
-pandaSDMX can expose datasets and related structural metadata including data
-flows, code-lists, and data structure definitions as pandas Series or
-MultiIndexed DataFrames.
-
-`fredapi `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-fredapi is a Python interface to the
-`Federal Reserve Economic Data (FRED) `__ provided by the
-Federal Reserve Bank of St. Louis. It works with both the FRED database and
-the ALFRED database, which contains point-in-time data (i.e. historic data
-revisions). fredapi provides a wrapper in Python to the FRED HTTP API, and
-also provides several convenient methods for parsing and analyzing
-point-in-time data from ALFRED. fredapi makes use of pandas and returns data
-in a Series or DataFrame. This module requires a FRED API key that you can
-obtain for free on the FRED website.
-
-`dataframe_sql `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-``dataframe_sql`` is a Python package that translates SQL syntax directly
-into operations on pandas DataFrames. This is useful when migrating from a
-database to using pandas, or for users more comfortable with SQL looking for
-a way to interface with pandas.
-
-.. _ecosystem.domain:
-
-Domain specific
----------------
-
-`Geopandas `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Geopandas extends pandas data objects to include geographic information which
-supports geometric operations. If your work entails maps and geographical
-coordinates, and you love pandas, you should take a close look at Geopandas.
-
-`staircase `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-staircase is a data analysis package, built upon pandas and numpy, for
-modelling and manipulation of mathematical step functions. It provides a rich
-variety of arithmetic operations, relational operations, logical operations,
-statistical operations and aggregations for step functions defined over real
-numbers, datetime and timedelta domains.
-
-`xarray `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-xarray brings the labeled data power of pandas to the physical sciences by
-providing N-dimensional variants of the core pandas data structures. It aims
-to provide a pandas-like and pandas-compatible toolkit for analytics on
-multi-dimensional arrays, rather than the tabular data for which pandas
-excels.
-
-.. _ecosystem.io:
-
-IO
---
-
-`BCPandas `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-BCPandas provides high performance writes from pandas to Microsoft SQL
-Server, far exceeding the performance of the native ``df.to_sql`` method.
-Internally, it uses Microsoft's BCP utility, but the complexity is fully
-abstracted away from the end user. Rigorously tested, it is a complete
-replacement for ``df.to_sql``.
-
-`Deltalake `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Deltalake Python package lets you access tables stored in
-`Delta Lake `__ natively in Python without the need to use
-Spark or the JVM. It provides the
-``delta_table.to_pyarrow_table().to_pandas()`` method to convert any Delta
-table into a pandas DataFrame.
-
-
-.. _ecosystem.out-of-core:
-
-Out-of-core
------------
-
-`Blaze `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Blaze provides a standard API for doing computations with various
-in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables,
-PySpark.
-
-`Cylon `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Cylon is a fast, scalable, distributed memory parallel runtime with a
-pandas-like Python DataFrame API. "Core Cylon" is implemented in C++ using the
-Apache Arrow format to represent the data in-memory. The Cylon DataFrame API
-implements most of the core operators of pandas such as merge, filter, join,
-concat, group-by, drop_duplicates, etc. These operators are designed to work
-across thousands of cores to scale applications. It can interoperate with
-pandas DataFrames by reading data from pandas or converting data to pandas so
-users can selectively scale parts of their pandas DataFrame applications.
-
-.. code:: python
-
-    from pycylon import read_csv, DataFrame, CylonEnv
-    from pycylon.net import MPIConfig
-
-    # Initialize Cylon distributed environment
-    config: MPIConfig = MPIConfig()
-    env: CylonEnv = CylonEnv(config=config, distributed=True)
-
-    df1: DataFrame = read_csv('/tmp/csv1.csv')
-    df2: DataFrame = read_csv('/tmp/csv2.csv')
-
-    # Using 1000s of cores across the cluster to compute the join
-    df3: DataFrame = df1.join(other=df2, on=[0], algorithm="hash", env=env)
-
-    print(df3)
-
-`Dask `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Dask is a flexible parallel computing library for analytics. Dask
-provides a familiar ``DataFrame`` interface for out-of-core, parallel and
-distributed computing.
-
-`Dask-ML `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Dask-ML enables parallel and distributed machine learning using Dask
-alongside existing machine learning libraries like Scikit-Learn, XGBoost,
-and TensorFlow.
-
-`Ibis `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Ibis offers a standard way to write analytics code that can be run in
-multiple engines. It helps bridge the gap between local Python environments
-(like pandas) and remote storage and execution systems like Hadoop components
-(HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.).
-
-`Koalas `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Koalas provides a familiar pandas DataFrame interface on top of Apache Spark.
-It enables users to leverage multiple cores on one machine or a cluster of
-machines to speed up or scale their DataFrame code.
-
-`Modin `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The ``modin.pandas`` DataFrame is a parallel and distributed drop-in
-replacement for pandas. This means that you can use Modin with existing
-pandas code or write new code with the existing pandas API. Modin can
-leverage your entire machine or cluster to speed up and scale your pandas
-workloads, including traditionally time-consuming tasks like ingesting data
-(``read_csv``, ``read_excel``, ``read_parquet``, etc.).
-
-.. code:: python
-
-    # import pandas as pd
-    import modin.pandas as pd
-
-    df = pd.read_csv("big.csv")  # use all your cores!
-
-`Odo `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Odo provides a uniform API for moving data between different formats. It uses
-pandas' own ``read_csv`` for CSV IO and leverages many existing packages such
-as PyTables, h5py, and pymongo to move data between non-pandas formats. Its
-graph-based approach is also extensible by end users for custom formats that
-may be too specific for the core of odo.
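-As a brief illustration of the out-of-core pattern shared by several of the
-libraries above, here is a minimal Dask sketch (assuming dask is installed;
-the file and column names are invented for illustration):
-
-.. code:: python
-
-    import dask.dataframe as dd
-
-    # The CSV may be larger than memory; Dask splits it into partitions
-    ddf = dd.read_csv("large.csv")
-    # Operations build a lazy task graph; .compute() triggers the work
-    result = ddf.groupby("key")["value"].mean().compute()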
-
-`Pandarallel `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Pandarallel provides a simple way to parallelize your pandas operations on
-all your CPUs by changing only one line of code. It also displays progress
-bars.
-
-.. code:: python
-
-    from pandarallel import pandarallel
-
-    pandarallel.initialize(progress_bar=True)
-
-    # df.apply(func)
-    df.parallel_apply(func)
-
-`Vaex `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Vaex is a Python library for out-of-core DataFrames (similar to pandas), to
-visualize and explore big tabular datasets. It can calculate statistics such
-as mean, sum, count, standard deviation etc., on an N-dimensional grid up to
-a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using
-histograms, density plots and 3d volume rendering, allowing interactive
-exploration of big data. Vaex uses memory mapping, a zero memory copy policy,
-and lazy computations for best performance (no memory wasted).
-
-  * vaex.from_pandas
-  * vaex.to_pandas_df
-
-.. _ecosystem.extensions:
-
-Extension data types
---------------------
-
-pandas provides an interface for defining
-:ref:`extension types ` to extend NumPy's type system. The
-following libraries implement that interface to provide types not found in
-NumPy or pandas, which work well with pandas' data containers.
-
-`Cyberpandas`_
-~~~~~~~~~~~~~~
-
-Cyberpandas provides an extension type for storing arrays of IP Addresses.
-These arrays can be stored inside pandas' Series and DataFrame.
-
-`Pandas-Genomics`_
-~~~~~~~~~~~~~~~~~~
-
-Pandas-Genomics provides extension types, extension arrays, and extension
-accessors for working with genomics data.
-
-`Pint-Pandas`_
-~~~~~~~~~~~~~~
-
-`Pint-Pandas `_ provides an extension type for storing numeric
-arrays with units. These arrays can be stored inside pandas' Series and
-DataFrame. Operations between Series and DataFrame columns which use pint's
-extension array are then units aware.
-
-`Text Extensions for Pandas`_
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`Text Extensions for Pandas `_ provides extension types to
-cover common data structures for representing natural language data, plus
-library integrations that convert the outputs of popular natural language
-processing libraries into Pandas DataFrames.
-
-.. _ecosystem.accessors:
-
-Accessors
----------
-
-A directory of projects providing
-:ref:`extension accessors `. This is for users to discover new
-accessors and for library authors to coordinate on the namespace.
-
-================== ============ ==================================== ===============================================================================
-Library            Accessor     Classes                              Description
-================== ============ ==================================== ===============================================================================
-`cyberpandas`_     ``ip``       ``Series``                           Provides common operations for working with IP addresses.
-`pdvega`_          ``vgplot``   ``Series``, ``DataFrame``            Provides plotting functions from the Altair_ library.
-`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame``            Provides common operations for quality control and analysis of genomics data.
-`pandas_path`_     ``path``     ``Index``, ``Series``                Provides `pathlib.Path`_ functions for Series.
-`pint-pandas`_     ``pint``     ``Series``, ``DataFrame``            Provides units support for numeric Series and DataFrames.
-`composeml`_       ``slice``    ``DataFrame``                        Provides a generator for enhanced data slicing.
-`datatest`_        ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers.
-`woodwork`_        ``ww``       ``Series``, ``DataFrame``            Provides physical, logical, and semantic data typing information for Series and DataFrames.
-`staircase`_       ``sc``       ``Series``                           Provides methods for querying, aggregating and plotting step functions.
-================== ============ ==================================== ===============================================================================
-
-.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
-.. _pdvega: https://altair-viz.github.io/pdvega/
-.. _Altair: https://altair-viz.github.io/
-.. _pandas-genomics: https://pandas-genomics.readthedocs.io/en/latest/
-.. _pandas_path: https://github.com/drivendataorg/pandas-path/
-.. _pathlib.Path: https://docs.python.org/3/library/pathlib.html
-.. _pint-pandas: https://github.com/hgrecco/pint-pandas
-.. _composeml: https://github.com/alteryx/compose
-.. _datatest: https://datatest.readthedocs.io/en/stable/
-.. _woodwork: https://github.com/alteryx/woodwork
-.. _staircase: https://www.staircase.dev/
-
-Development tools
------------------
-
-`pandas-stubs `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-While the pandas repository is partially typed, the package itself doesn't
-expose this information for external use. Install pandas-stubs to enable
-basic type coverage of the pandas API.
-
-Learn more by reading through :issue:`14468`, :issue:`26766`, :issue:`28142`.
-
-See installation and usage instructions on the `GitHub page `__.
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst
index f91f4218c3429..d9d7d916b0238 100644
--- a/doc/source/getting_started/comparison/comparison_with_r.rst
+++ b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -21,10 +21,6 @@ libraries, we care about the following things:
 This page is also here to offer a bit of a translation guide for users of
 these R packages.
 
-For transfer of ``DataFrame`` objects from pandas to R, one option is to
-use HDF5 files, see :ref:`io.external_compatibility` for an
-example.
-
 Quick reference
 ---------------
 
@@ -250,7 +246,7 @@ In pandas we may use :meth:`~pandas.pivot_table` method to handle this:
         }
     )
 
-    baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max)
+    baseball.pivot_table(values="batting avg", columns="team", aggfunc="max")
 
 For more details and examples see :ref:`the reshaping documentation
 `.
 
@@ -363,7 +359,7 @@ In pandas the equivalent expression, using the
     )
 
     grouped = df.groupby(["month", "week"])
-    grouped["x"].agg([np.mean, np.std])
+    grouped["x"].agg(["mean", "std"])
 
 For more details and examples see :ref:`the groupby documentation
 `.
 
@@ -409,7 +405,7 @@ In Python, this list would be a list of tuples, so
 
     a = list(enumerate(list(range(1, 5)) + [np.NAN]))
     pd.DataFrame(a)
 
-For more details and examples see :ref:`the Into to Data Structures
+For more details and examples see :ref:`the Intro to Data Structures
 documentation `.
 
meltdf @@ -486,7 +482,7 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: values="value", index=["variable", "week"], columns=["month"], - aggfunc=np.mean, + aggfunc="mean", ) Similarly for ``dcast`` which uses a data.frame called ``df`` in R to diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 5a624c9c55782..595f3c85a9dc2 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -112,7 +112,7 @@ The pandas method is :func:`read_csv`, which works similarly. .. ipython:: python url = ( - "https://raw.github.com/pandas-dev/" + "https://raw.githubusercontent.com/pandas-dev/" "pandas/main/pandas/tests/io/data/csv/tips.csv" ) tips = pd.read_csv(url) diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst index a7148405ba8a0..d55b669d94a87 100644 --- a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst +++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst @@ -100,7 +100,7 @@ In pandas, you pass the URL or local path of the CSV file to :func:`~pandas.read .. ipython:: python url = ( - "https://raw.github.com/pandas-dev" + "https://raw.githubusercontent.com/pandas-dev" "/pandas/main/pandas/tests/io/data/csv/tips.csv" ) tips = pd.read_csv(url) diff --git a/doc/source/getting_started/comparison/comparison_with_spss.rst b/doc/source/getting_started/comparison/comparison_with_spss.rst new file mode 100644 index 0000000000000..12c64bfd180a3 --- /dev/null +++ b/doc/source/getting_started/comparison/comparison_with_spss.rst @@ -0,0 +1,229 @@ +.. _compare_with_spss: + +{{ header }} + +Comparison with SPSS +******************** +For potential users coming from `SPSS `__, this page is meant to demonstrate +how various SPSS operations would be performed using pandas. + +.. include:: includes/introduction.rst + +Data structures +--------------- + +General terminology translation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. csv-table:: + :header: "pandas", "SPSS" + :widths: 20, 20 + + :class:`DataFrame`, data file + column, variable + row, case + groupby, split file + :class:`NaN`, system-missing + +:class:`DataFrame` +~~~~~~~~~~~~~~~~~~ + +A :class:`DataFrame` in pandas is analogous to an SPSS data file - a two-dimensional +data source with labeled columns that can be of different types. As will be shown in this +document, almost any operation that can be performed in SPSS can also be accomplished in pandas. + +:class:`Series` +~~~~~~~~~~~~~~~ + +A :class:`Series` is the data structure that represents one column of a :class:`DataFrame`. SPSS doesn't have a +separate data structure for a single variable, but in general, working with a :class:`Series` is analogous +to working with a variable in SPSS. + +:class:`Index` +~~~~~~~~~~~~~~ + +Every :class:`DataFrame` and :class:`Series` has an :class:`Index` -- labels on the *rows* of the data. SPSS does not +have an exact analogue, as cases are simply numbered sequentially from 1. In pandas, if no index is +specified, a :class:`RangeIndex` is used by default (first row = 0, second row = 1, and so on). 
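+For example, here is a quick sketch of the default index (the values shown
+are invented for illustration):
+
+.. code-block:: python
+
+    import pandas as pd
+
+    df = pd.DataFrame({"total_bill": [16.99, 10.34], "tip": [1.01, 1.66]})
+    # With no index specified, pandas assigns a RangeIndex automatically
+    df.index  # RangeIndex(start=0, stop=2, step=1)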
+ +While using a labeled :class:`Index` or :class:`MultiIndex` can enable sophisticated analyses and is ultimately an +important part of pandas to understand, for this comparison we will essentially ignore the :class:`Index` and +just treat the :class:`DataFrame` as a collection of columns. Please see the :ref:`indexing documentation` +for much more on how to use an :class:`Index` effectively. + + +Copies vs. in place operations +------------------------------ + +.. include:: includes/copies.rst + + +Data input / output +------------------- + +Reading external data +~~~~~~~~~~~~~~~~~~~~~ + +Like SPSS, pandas provides utilities for reading in data from many formats. The ``tips`` dataset, found within +the pandas tests (`csv `_) +will be used in many of the following examples. + +In SPSS, you would use File > Open > Data to import a CSV file: + +.. code-block:: text + + FILE > OPEN > DATA + /TYPE=CSV + /FILE='tips.csv' + /DELIMITERS="," + /FIRSTCASE=2 + /VARIABLES=col1 col2 col3. + +The pandas equivalent would use :func:`read_csv`: + +.. code-block:: python + + url = ( + "https://raw.githubusercontent.com/pandas-dev" + "/pandas/main/pandas/tests/io/data/csv/tips.csv" + ) + tips = pd.read_csv(url) + tips + +Like SPSS's data import wizard, ``read_csv`` can take a number of parameters to specify how the data should be parsed. +For example, if the data was instead tab delimited, and did not have column names, the pandas command would be: + +.. code-block:: python + + tips = pd.read_csv("tips.csv", sep="\t", header=None) + + # alternatively, read_table is an alias to read_csv with tab delimiter + tips = pd.read_table("tips.csv", header=None) + + +Data operations +--------------- + +Filtering +~~~~~~~~~ + +In SPSS, filtering is done through Data > Select Cases: + +.. code-block:: text + + SELECT IF (total_bill > 10). + EXECUTE. + +In pandas, boolean indexing can be used: + +.. code-block:: python + + tips[tips["total_bill"] > 10] + + +Sorting +~~~~~~~ + +In SPSS, sorting is done through Data > Sort Cases: + +.. code-block:: text + + SORT CASES BY sex total_bill. + EXECUTE. + +In pandas, this would be written as: + +.. code-block:: python + + tips.sort_values(["sex", "total_bill"]) + + +String processing +----------------- + +Finding length of string +~~~~~~~~~~~~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE length = LENGTH(time). + EXECUTE. + +.. include:: includes/length.rst + + +Changing case +~~~~~~~~~~~~~ + +In SPSS: + +.. code-block:: text + + COMPUTE upper = UPCASE(time). + COMPUTE lower = LOWER(time). + EXECUTE. + +.. include:: includes/case.rst + + +Merging +------- + +In SPSS, merging data files is done through Data > Merge Files. + +.. include:: includes/merge_setup.rst +.. include:: includes/merge.rst + + +GroupBy operations +------------------ + +Split-file processing +~~~~~~~~~~~~~~~~~~~~~ + +In SPSS, split-file analysis is done through Data > Split File: + +.. code-block:: text + + SORT CASES BY sex. + SPLIT FILE BY sex. + DESCRIPTIVES VARIABLES=total_bill tip + /STATISTICS=MEAN STDDEV MIN MAX. + +The pandas equivalent would be: + +.. code-block:: python + + tips.groupby("sex")[["total_bill", "tip"]].agg(["mean", "std", "min", "max"]) + + +Missing data +------------ + +SPSS uses the period (``.``) for numeric missing values and blank spaces for string missing values. +pandas uses ``NaN`` (Not a Number) for numeric missing values and ``None`` or ``NaN`` for string +missing values. + +.. 
include:: includes/missing.rst
+
+
+Other considerations
+--------------------
+
+Output management
+~~~~~~~~~~~~~~~~~
+
+While pandas does not have a direct equivalent to SPSS's Output Management
+System (OMS), you can capture and export results in various ways:
+
+.. code-block:: python
+
+    # Save summary statistics to CSV
+    tips.groupby('sex')[['total_bill', 'tip']].mean().to_csv('summary.csv')
+
+    # Save multiple results to Excel sheets
+    with pd.ExcelWriter('results.xlsx') as writer:
+        tips.describe().to_excel(writer, sheet_name='Descriptives')
+        tips.groupby('sex')[['total_bill', 'tip']].mean().to_excel(writer, sheet_name='Means by Gender')
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
index 0a891a4c6d2d7..dc0590f18751a 100644
--- a/doc/source/getting_started/comparison/comparison_with_sql.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -17,7 +17,7 @@ structure.
 .. ipython:: python
 
     url = (
-        "https://raw.github.com/pandas-dev"
+        "https://raw.githubusercontent.com/pandas-dev"
         "/pandas/main/pandas/tests/io/data/csv/tips.csv"
     )
     tips = pd.read_csv(url)
@@ -107,7 +107,7 @@ methods.
 .. ipython:: python
 
     frame = pd.DataFrame(
-        {"col1": ["A", "B", np.NaN, "C", "D"], "col2": ["F", np.NaN, "G", "H", "I"]}
+        {"col1": ["A", "B", np.nan, "C", "D"], "col2": ["F", np.nan, "G", "H", "I"]}
     )
     frame
 
@@ -164,16 +164,16 @@ The pandas equivalent would be:
 
     tips.groupby("sex").size()
 
-Notice that in the pandas code we used :meth:`~pandas.core.groupby.DataFrameGroupBy.size` and not
-:meth:`~pandas.core.groupby.DataFrameGroupBy.count`. This is because
-:meth:`~pandas.core.groupby.DataFrameGroupBy.count` applies the function to each column, returning
+Notice that in the pandas code we used :meth:`.DataFrameGroupBy.size` and not
+:meth:`.DataFrameGroupBy.count`. This is because
+:meth:`.DataFrameGroupBy.count` applies the function to each column, returning
 the number of ``NOT NULL`` records within each.
 
 .. ipython:: python
 
     tips.groupby("sex").count()
 
-Alternatively, we could have applied the :meth:`~pandas.core.groupby.DataFrameGroupBy.count` method
+Alternatively, we could have applied the :meth:`.DataFrameGroupBy.count` method
 to an individual column:
 
@@ -181,7 +181,7 @@ to an individual column:
 
     tips.groupby("sex")["total_bill"].count()
 
 Multiple functions can also be applied at once. For instance, say we'd like to see how tip amount
-differs by day of the week - :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` allows you to pass a dictionary
+differs by day of the week - :meth:`.DataFrameGroupBy.agg` allows you to pass a dictionary
 to your grouped DataFrame, indicating which functions to apply to specific columns.
 
 .. code-block:: sql
@@ -198,7 +198,7 @@ to your grouped DataFrame, indicating which functions to apply to specific columns.
 
 .. ipython:: python
 
-    tips.groupby("day").agg({"tip": np.mean, "day": np.size})
+    tips.groupby("day").agg({"tip": "mean", "day": "size"})
 
 Grouping by more than one column is done by passing a list of columns to the
 :meth:`~pandas.DataFrame.groupby` method.
@@ -222,7 +222,7 @@ Grouping by more than one column is done by passing a list of columns to the
 
 .. ipython:: python
 
-    tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]})
+    tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})
 
..
_compare_with_sql.join: @@ -505,7 +505,7 @@ DELETE DELETE FROM tips WHERE tip > 9; -In pandas we select the rows that should remain instead of deleting them: +In pandas we select the rows that should remain instead of deleting the rows that should be removed: .. ipython:: python diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index 636778a2ca32e..b4b0c42d1db1d 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -108,7 +108,7 @@ the data set if presented with a url. .. ipython:: python url = ( - "https://raw.github.com/pandas-dev" + "https://raw.githubusercontent.com/pandas-dev" "/pandas/main/pandas/tests/io/data/csv/tips.csv" ) tips = pd.read_csv(url) diff --git a/doc/source/getting_started/comparison/includes/copies.rst b/doc/source/getting_started/comparison/includes/copies.rst index 08ccd47624932..4f49c3a1a762e 100644 --- a/doc/source/getting_started/comparison/includes/copies.rst +++ b/doc/source/getting_started/comparison/includes/copies.rst @@ -14,10 +14,15 @@ or overwrite the original one: .. note:: - You will see an ``inplace=True`` keyword argument available for some methods: + You will see an ``inplace=True`` or ``copy=False`` keyword argument available for + some methods: .. code-block:: python - df.sort_values("col1", inplace=True) + df.replace(5, inplace=True) - Its use is discouraged. :ref:`More information. ` + There is an active discussion about deprecating and removing ``inplace`` and ``copy`` for + most methods (e.g. ``dropna``) except for a very small subset of methods + (including ``replace``). Both keywords won't be + necessary anymore in the context of Copy-on-Write. The proposal can be found + `here `_. diff --git a/doc/source/getting_started/comparison/includes/missing.rst b/doc/source/getting_started/comparison/includes/missing.rst index 341c7d5498d82..ab5d90166e7b0 100644 --- a/doc/source/getting_started/comparison/includes/missing.rst +++ b/doc/source/getting_started/comparison/includes/missing.rst @@ -19,7 +19,7 @@ Forward fill from previous rows .. ipython:: python - outer_join.fillna(method="ffill") + outer_join.ffill() Replace missing values with a specified value ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/getting_started/comparison/index.rst b/doc/source/getting_started/comparison/index.rst index c3f58ce1f3d6d..3133d74afa3db 100644 --- a/doc/source/getting_started/comparison/index.rst +++ b/doc/source/getting_started/comparison/index.rst @@ -14,3 +14,4 @@ Comparison with other tools comparison_with_spreadsheets comparison_with_sas comparison_with_stata + comparison_with_spss diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index 4792d26d021d6..a17699a71fbd3 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -9,48 +9,52 @@ Getting started Installation ------------ -.. panels:: - :card: + install-card - :column: col-lg-6 col-md-6 col-sm-12 col-xs-12 p-3 +.. grid:: 1 2 2 2 + :gutter: 4 - Working with conda? - ^^^^^^^^^^^^^^^^^^^ + .. grid-item-card:: Working with conda? + :class-card: install-card + :columns: 12 12 6 6 + :padding: 3 - pandas is part of the `Anaconda `__ - distribution and can be installed with Anaconda or Miniconda: + pandas can be installed via conda from `conda-forge `__. - ++++++++++++++++++++++ + ++++++++++++++++++++++ - .. 
code-block:: bash + .. code-block:: bash - conda install pandas + conda install -c conda-forge pandas - --- + .. grid-item-card:: Prefer pip? + :class-card: install-card + :columns: 12 12 6 6 + :padding: 3 - Prefer pip? - ^^^^^^^^^^^ + pandas can be installed via pip from `PyPI `__. - pandas can be installed via pip from `PyPI `__. + ++++ - ++++ + .. code-block:: bash - .. code-block:: bash + pip install pandas - pip install pandas + .. grid-item-card:: In-depth instructions? + :class-card: install-card + :columns: 12 + :padding: 3 - --- - :column: col-12 p-3 + Installing a specific version? Installing from source? Check the advanced + installation page. - In-depth instructions? - ^^^^^^^^^^^^^^^^^^^^^^ + +++ - Installing a specific version? Installing from source? Check the advanced - installation page. + .. button-ref:: install + :ref-type: ref + :click-parent: + :color: secondary + :expand: - .. link-button:: ./install.html - :type: url - :text: Learn more - :classes: btn-secondary stretched-link + Learn more .. _gentle_intro: @@ -64,7 +68,7 @@ Intro to pandas
-