diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index df787f80659d..000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,459 +0,0 @@ -version: 2.1 - -parameters: - pytorch_stable_image: - type: string - # https://hub.docker.com/r/pytorch/pytorch/tags - default: "pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime" - pytorch_stable_image_devel: - type: string - # https://hub.docker.com/r/pytorch/pytorch/tags - default: "pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel" - workingdir: - type: string - default: "/tmp/ignite" - should_build_docker_images: - type: boolean - default: false - should_publish_docker_images: - type: boolean - default: false - -# ------------------------------------------------------------------------------------- -# Environments to run the jobs in -# ------------------------------------------------------------------------------------- - -one_gpu: &one_gpu - machine: - # https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images - image: ubuntu-2004-cuda-11.4:202110-01 # CUDA v11.4.2, Docker v20.10.7, nvidia-container-toolkit v1.5.1-1 - docker_layer_caching: true - # https://circleci.com/product/features/resource-classes/#linux-vm - resource_class: gpu.nvidia.small - -one_gpu_windows: &one_gpu_windows - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -two_gpus: &two_gpus - machine: - # https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images - image: ubuntu-2004-cuda-11.4:202110-01 # CUDA v11.4.2, Docker v20.10.7, nvidia-container-toolkit v1.5.1-1 - docker_layer_caching: true - # https://circleci.com/product/features/resource-classes/#linux-vm - resource_class: gpu.nvidia.medium - -# ------------------------------------------------------------------------------------- -# Re-usable commands -# ------------------------------------------------------------------------------------- - 
-install_latest_nvidia: &install_latest_nvidia - - run: - name: Install latest NVidia-driver and CUDA - command: | - sudo apt-get purge nvidia* && sudo apt-get autoremove - sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-driver-470 - # Install nvidia-container-runtime - sudo apt-get install -y nvidia-container-runtime - # Reload driver : https://stackoverflow.com/a/45319156/6309199 - # lsof | grep nvidia -> kill Xvfb - sudo lsof | grep "/usr/bin/Xvfb" | head -1 | awk '{print $2}' | xargs -I {} sudo kill -9 {} || echo "Command 'sudo lsof ...' is failed" - # lsmod | grep nvidia - sudo rmmod nvidia_uvm && sudo rmmod nvidia_drm && sudo rmmod nvidia_modeset && sudo rmmod nvidia - # reload driver - nvidia-smi - -pull_pytorch_stable_image: &pull_pytorch_stable_image - - run: - name: Pull PyTorch Stable Image - command: | - docker pull << pipeline.parameters.pytorch_stable_image >> - -pull_pytorch_stable_devel_image: &pull_pytorch_stable_devel_image - - run: - name: Pull PyTorch Stable Develop Image - command: | - docker pull << pipeline.parameters.pytorch_stable_image_devel >> - -run_pytorch_container: &run_pytorch_container - - run: - name: Start Pytorch container - environment: - wd: << pipeline.parameters.workingdir >> - command: | - docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >> - docker exec -it pthd nvidia-smi - docker exec -it pthd ls - -run_pytorch_devel_container: &run_pytorch_devel_container - - run: - name: Start Pytorch dev container - environment: - wd: << pipeline.parameters.workingdir >> - command: | - docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image_devel >> - docker exec -it pthd nvidia-smi - docker exec -it pthd ls - -install_dependencies: &install_dependencies - - run: - name: Install dependencies - command: | - docker exec -it pthd pip install -r 
requirements-dev.txt - # Commented APEX installation, https://github.com/pytorch/ignite/issues/2299 - # export install_apex_cmd='pip install -v --disable-pip-version-check --no-cache-dir git+https://github.com/NVIDIA/apex' - # export install_git_apex_cmd="apt-get update && apt-get install -y --no-install-recommends git && ${install_apex_cmd}" - # docker exec -it pthd /bin/bash -c "$install_git_apex_cmd" - export install_ignite_cmd='python setup.py install' - docker exec -it pthd /bin/bash -c "$install_ignite_cmd" - -# https://github.com/pytorch/ignite/issues/1737 -download_mnist: &download_mnist - - run: - name: Download MNIST - command: | - export install_git_cmd="apt-get update && apt-get install -y --no-install-recommends git" - docker exec -it pthd /bin/bash -c "$install_git_cmd" - - export tmp_mnist_dir='/tmp/mnist' - export tests_mnist_dir='/tmp' - export examples_mnist_dir='.' - export download_mnist_cmd="git clone https://github.com/pytorch-ignite/download-mnist-github-action.git $tmp_mnist_dir" - docker exec -it pthd /bin/bash -c "$download_mnist_cmd" - export get_mnist_cmd_tests="python $tmp_mnist_dir/cp.py $tmp_mnist_dir $tests_mnist_dir/MNIST/raw" - docker exec -it pthd /bin/bash -c "$get_mnist_cmd_tests" - export get_mnist_cmd_examples="python $tmp_mnist_dir/cp.py $tmp_mnist_dir $examples_mnist_dir/MNIST/raw" - docker exec -it pthd /bin/bash -c "$get_mnist_cmd_examples" - -# ------------------------------------------------------------------------------------- -# Jobs to run -# ------------------------------------------------------------------------------------- -jobs: - one_gpu_tests: - <<: *one_gpu - - working_directory: << pipeline.parameters.workingdir >> - - steps: - - checkout - - run: - name: Trigger job if modified - command: | - bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*" - - <<: *pull_pytorch_stable_image - - <<: *run_pytorch_container - - <<: *install_dependencies - - <<: *download_mnist - - run: - name: Run 
GPU Unit Tests and Examples - command: | - - # pytest on cuda - export test_cmd='bash tests/run_gpu_tests.sh' - docker exec -it pthd /bin/bash -c "${test_cmd}" - - # MNIST tests - - # 1) mnist.py - export minst1_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist.py --epochs=1' - docker exec -it pthd /bin/bash -c "$minst1_cmd" - - # 2) mnist_with_visdom.py - export visdom_script_cmd='python -c "from visdom.server.build import download_scripts; download_scripts()"' - export visdom_cmd='python -m visdom.server' - docker exec -d pthd /bin/bash -c "$visdom_script_cmd && $visdom_cmd" - export sleep_cmd='sleep 10' - export mnist2_cmd='python examples/mnist/mnist_with_visdom.py --epochs=1' - docker exec -it pthd /bin/bash -c "$sleep_cmd && $mnist2_cmd" - - # 3.1) mnist_with_tensorboard.py with tbX - export mnist3_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_with_tensorboard.py --epochs=1' - docker exec -it pthd /bin/bash -c "$mnist3_cmd" - - # uninstall tensorboardX - export pip_cmd='pip uninstall -y tensorboardX' - docker exec -it pthd /bin/bash -c "$pip_cmd" - - # 3.2) mnist_with_tensorboard.py with native torch tb - docker exec -it pthd /bin/bash -c "$mnist3_cmd" - - # 4) mnist_save_resume_engine.py - # save - export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --crash_iteration 1100' - docker exec -it pthd /bin/bash -c "$mnist4_cmd" - # resume - export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --resume_from=/tmp/mnist_save_resume/checkpoint_1.pt' - docker exec -it pthd /bin/bash -c "$mnist4_cmd" - - one_gpu_windows_tests: - <<: *one_gpu_windows - - working_directory: << pipeline.parameters.workingdir >> - - steps: - - checkout - - run: - name: Trigger job if modified - command: | - bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*" - - # - run: - # name: Update CUDA Driver for Windows - # command: | - # curl -O 
https://raw.githubusercontent.com/pytorch/pytorch/master/.circleci/scripts/windows_cuda_install.sh - # mkdir -p "C:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/MSBuild/Microsoft/VC/v160/BuildCustomizations/" - # JOB_EXECUTOR="windows-with-nvidia-gpu" CUDA_VERSION="11.3" VC_PRODUCT="BuildTools" VC_YEAR="2019" bash ./windows_cuda_install.sh - # bash -c "'/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe'" - - - run: - name: Install dependencies - command: | - conda --version - # We have to use cuda 10.2 on Windows: - # https://github.com/pytorch/ignite/issues/1843 - conda install -y pytorch==1.9.1 torchvision cudatoolkit=10.2 -c pytorch - pip install -r requirements-dev.txt - pip install . - python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())" - python -c "import torch; torch.cuda.is_available()" - - - run: - # https://github.com/pytorch/ignite/issues/1737 - name: Download MNIST - command: | - git clone https://github.com/pytorch-ignite/download-mnist-github-action.git /tmp/mnist - python /tmp/mnist/cp.py /tmp/mnist /tmp/MNIST/raw - - - run: - name: Run GPU Unit Tests - command: | - # pytest on cuda - SKIP_DISTRIB_TESTS=1 bash tests/run_gpu_tests.sh - - two_gpus_tests: - <<: *two_gpus - - working_directory: << pipeline.parameters.workingdir >> - - steps: - - checkout - - run: - name: Trigger job if modified - command: | - bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*" - - <<: *pull_pytorch_stable_image - - <<: *run_pytorch_container - - <<: *install_dependencies - - <<: *download_mnist - - run: - name: Run 1 Node 2 GPUs Unit Tests - command: | - export test_cmd='bash tests/run_gpu_tests.sh 2' - docker exec -it pthd /bin/bash -c "${test_cmd}" - - two_gpus_check_dist_cifar10_example: - <<: *two_gpus - - working_directory: << pipeline.parameters.workingdir >> - - steps: - - checkout - - run: - name: Trigger job if modified - command: | - bash 
.circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*" - - <<: *pull_pytorch_stable_image - - <<: *run_pytorch_container - - <<: *install_dependencies - - run: - name: "Install additional example dependencies" - command: | - docker exec -it pthd pip install fire - - run: - name: "Run without backend" - command: | - export example_path="examples/contrib/cifar10" - # initial run - export stop_cmd="--stop_iteration=500" - export test_cmd="CI=1 python ${example_path}/main.py run --checkpoint_every=200" - docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" - # resume - export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt" - docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}" - - - run: - name: "Run with NCCL backend using torchrun" - command: | - export example_path="examples/contrib/cifar10" - # initial run - export stop_cmd="--stop_iteration=500" - export test_cmd="CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200" - docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" - # resume - export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt" - docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}" - - - run: - name: "Run with NCCL backend using spawn" - command: | - export example_path="examples/contrib/cifar10" - # initial run - export stop_cmd="--stop_iteration=500" - export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200" - docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" - # resume - export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt" - docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}" - - two_gpus_hvd_tests: - <<: *two_gpus - - working_directory: << 
pipeline.parameters.workingdir >> - - steps: - - checkout - - run: - name: Trigger job if modified - command: | - bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*" - - <<: *pull_pytorch_stable_devel_image - - <<: *run_pytorch_devel_container - - <<: *install_dependencies - - <<: *download_mnist - - run: - name: "Install Horovod with NCCL GPU ops" - command: | - - # Following https://github.com/horovod/horovod/blob/master/Dockerfile.test.gpu - # and https://github.com/horovod/horovod/issues/1944#issuecomment-628192778 - docker exec -it pthd /bin/bash -c "apt-get update && apt-get install -y git" - docker exec -it pthd /bin/bash -c "git clone --recursive https://github.com/horovod/horovod.git -b v0.23.0 /horovod && cd /horovod && python setup.py sdist" - docker exec -it pthd /bin/bash -c "conda install -y cmake nccl=2.11 -c conda-forge" - docker exec -it pthd /bin/bash -c 'cd /horovod && HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_PYTORCH=1 pip install -v $(ls /horovod/dist/horovod-*.tar.gz) && ldconfig' - docker exec -it pthd horovodrun --check-build - - - run: - name: Run 1 Node 2 GPUs Unit Tests - command: | - export test_cmd='bash tests/run_gpu_tests.sh 2 hvd' - docker exec -it pthd /bin/bash -c "${test_cmd}" - # no CUDA devices Horovod tests - export test_cmd='CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd' - docker exec -it pthd /bin/bash -c "${test_cmd}" - - - run: - name: "Check CIFAR10 using horovodrun" - command: | - docker exec -it pthd pip install fire - export example_path="examples/contrib/cifar10" - # initial run - export stop_cmd="--stop_iteration=500" - export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod --checkpoint_every=200" - docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" - # resume - export 
resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt" - docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}" - - - run: - name: "Check CIFAR10 using spawn" - command: | - export example_path="examples/contrib/cifar10" - # initial run - export stop_cmd="--stop_iteration=500" - export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200" - docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}" - # resume - export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt" - docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}" - - build_publish_docker_images: - # https://circleci.com/docs/2.0/building-docker-images/ - docker: - - image: cimg/python:3.8.8 - - # https://circleci.com/docs/2.0/executor-types/#available-docker-resource-classes - resource_class: 2xlarge - - working_directory: << pipeline.parameters.workingdir >> - steps: - - checkout - - setup_remote_docker: - version: 19.03.14 - docker_layer_caching: true - - run: - name: Install deps - command: | - pip --version - pip install docker - - run: - name: Build all Horovod flavoured PyTorch-Ignite images - command: | - cd docker - export PTH_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"` - export HVD_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_hvd_version'))"` - bash build.sh hvd hvd-base - bash build.sh hvd hvd-vision - bash build.sh hvd hvd-nlp - bash build.sh hvd hvd-apex - bash build.sh hvd hvd-apex-vision - bash build.sh hvd hvd-apex-nlp - - - run: - name: Build all PyTorch-Ignite images - command: | - cd docker - export PTH_VERSION=`python -c "import configparser; 
cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"` - bash build.sh main base - bash build.sh main vision - bash build.sh main nlp - bash build.sh main apex - bash build.sh main apex-vision - bash build.sh main apex-nlp - - - run: - name: Build all MS DeepSpeed flavoured PyTorch-Ignite images - command: | - cd docker - export PTH_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"` - export MSDP_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_msdp_version'))"` - bash build.sh msdp msdp-apex - bash build.sh msdp msdp-apex-vision - bash build.sh msdp msdp-apex-nlp - - - run: - name: List built images - command: docker images | grep pytorchignite - - - when: - condition: << pipeline.parameters.should_publish_docker_images >> - steps: - - run: - name: Push all PyTorch-Ignite Docker images - command: | - cd docker - sh ./push_all.sh - -# ------------------------------------------------------------------------------------- -# Workflows -# ------------------------------------------------------------------------------------- -workflows: - version: 2 - gpu_tests: - unless: << pipeline.parameters.should_build_docker_images >> - jobs: - - one_gpu_tests - # Disabled windows tests as NVidia driver is too old - # > c:\tools\miniconda3\lib\site-packages\torch\cuda\__init__.py:52: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ..\c10\cuda\CUDAFunctions.cpp:115.) 
- # > return torch._C._cuda_getDeviceCount() > 0 - # - one_gpu_windows_tests - # Can not run tests on 2 GPUs on Circle-CI - # Now, they are running on GHA self-hosted - # - two_gpus_tests - # - two_gpus_check_dist_cifar10_example - # - two_gpus_hvd_tests - docker_images: - when: << pipeline.parameters.should_build_docker_images >> - jobs: - - build_publish_docker_images diff --git a/.circleci/trigger_if_modified.sh b/.circleci/trigger_if_modified.sh deleted file mode 100644 index 311e00a5991e..000000000000 --- a/.circleci/trigger_if_modified.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -# Script is taken from https://circleci.com/developer/orbs/orb/roopakv/swissknife#commands-run_if_modified -# Usage: sh trigger_if_modified.sh [base-branch] -# - for example: sh trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*" - -if [ -z "$1" ]; then - echo "Pattern should be provided. Usage: sh trigger_if_modified.sh " - exit 1 -fi - -pattern=$1 - -if [ -z "$2" ]; then - base_branch=master -else - base_branch=$2 -fi - -echo "- Pattern: ${pattern}" -echo "- Base branch: ${base_branch}" - -if [ -z "$BASH" ]; then - echo Bash not installed. - exit 1 -fi - -git status >/dev/null 2>&1 || { echo >&2 "Not in a git directory or no git"; exit 1; } - -circleci-agent >/dev/null 2>&1 || { echo >&2 "No Circle CI agent. These are in all Circle CI containers"; exit 1; } - - -if [ "$CIRCLE_BRANCH" == "master" ]; then - echo "Skip checking modified files if on master" - exit 0 -fi - -FILES_MODIFIED="" - -setcommit () { - FILES_MODIFIED=$(git diff --name-only origin/${base_branch}..HEAD | grep -i -E ${pattern}) -} - -setcommit || true - -if [ -z "$FILES_MODIFIED" ]; then - echo "Files not modified. 
Halting job" - circleci-agent step halt -else - echo "Files modified: ${FILES_MODIFIED}, continuing steps" -fi \ No newline at end of file diff --git a/.github/pr-labeler-config.yml b/.github/pr-labeler-config.yml index a02ec1183346..393adb66f756 100644 --- a/.github/pr-labeler-config.yml +++ b/.github/pr-labeler-config.yml @@ -6,9 +6,8 @@ docker: docs: - docs/**/* -# Add 'ci' to any changes within 'circleci' and '.github' folder +# Add 'ci' to any changes in '.github' folder ci: - - .circleci/**/* - .github/**/* # Add 'examples' to any changes within 'examples' folder diff --git a/.github/workflows/binaries-nightly-release.yml b/.github/workflows/binaries-nightly-release.yml index bd19857c345e..0c1bce941709 100644 --- a/.github/workflows/binaries-nightly-release.yml +++ b/.github/workflows/binaries-nightly-release.yml @@ -10,13 +10,13 @@ jobs: build-publish: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2 with: miniconda-version: "latest" - python-version: 3.8 + python-version: "3.10" - name: Setup nightly version run: | diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index a8dce7e63e24..8627c0ece64f 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -10,7 +10,6 @@ on: - "tests/run_code_style.sh" - ".github/workflows/code-style.yml" - "!assets/**" - - "!.circleci/**" - "!docker/**" - "!docs/**" - "!conda.recipe" @@ -21,7 +20,7 @@ jobs: runs-on: ubuntu-latest steps: - if: github.event_name == 'push' - uses: actions/checkout@v3 + uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.8" diff --git a/.github/workflows/discord_issues.yml b/.github/workflows/discord_issues.yml new file mode 100644 index 000000000000..db79dd44acb7 --- /dev/null +++ b/.github/workflows/discord_issues.yml @@ -0,0 +1,30 @@ +name: Discuss "help-wanted" issue on Discord + +on: + issues: + 
types: + - labeled + workflow_dispatch: + inputs: + issue_number: + description: 'Issue number' + required: true + +permissions: + issues: write + +jobs: + discord: + runs-on: ubuntu-latest + steps: + - name: "Discuss on Discord-Issues" + if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'help wanted' }} + uses: EndBug/discuss-on-discord@v1.1.0 + with: + discord_bot_token: ${{ secrets.DISCORD_BOT_TOKEN }} + destination: ${{ secrets.DISCORD_BOT_DESTINATION }} + issue_number: ${{ github.event.inputs.issue_number || github.event.issue.number }} + issue_comment: Hey 👋, I've just created a [thread]($THREAD_LINK$) for this issue on [PyTorch-Ignite Discord](https://pytorch-ignite.ai/chat) where you can quickly talk to the community on the topic. + discord_message: New issue created in `${{ github.repository }}`: + + diff --git a/.github/workflows/discord_pull_requests.yaml b/.github/workflows/discord_pull_requests.yaml new file mode 100644 index 000000000000..121aa581aa48 --- /dev/null +++ b/.github/workflows/discord_pull_requests.yaml @@ -0,0 +1,29 @@ +name: Discuss "help-wanted" PR on Discord + +on: + pull_request: + types: + - labeled + workflow_dispatch: + inputs: + pull_request_number: + description: 'Pull request number' + required: true + +permissions: + pull-requests: write + +jobs: + discord: + runs-on: ubuntu-latest + steps: + - name: "Discuss on Discord-PR (Non-maintainer only)" + if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'help wanted' }} + uses: EndBug/discuss-on-discord@v1.1.0 + with: + discord_bot_token: ${{ secrets.DISCORD_BOT_TOKEN }} + destination: ${{ secrets.DISCORD_BOT_DESTINATION }} + issue_number: ${{ github.event.inputs.pull_request_number || github.event.pull_request.number }} + issue_comment: Hey 👋, I've just created a [thread]($THREAD_LINK$) for this pull request on [PyTorch-Ignite Discord](https://pytorch-ignite.ai/chat) where you can quickly talk to the community on the topic.
+ discord_message: New PR created in `${{ github.repository }}`: + diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 6344d49a751a..37bb2333c81c 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -16,7 +16,7 @@ jobs: hvd_version: ${{ steps.set-versions.outputs.hvd_version }} msdp_version: ${{ steps.set-versions.outputs.msdp_version }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Changed Files Exporter id: files uses: futuratrepadeira/changed-files@v3.3.0 @@ -41,14 +41,14 @@ jobs: if: contains(needs.setup.outputs.modified, 'hvd/') || contains(needs.setup.outputs.modified, 'docker.cfg') runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Remove cache run: | sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - name: Build hvd hvd-base working-directory: docker run: | @@ -79,14 +79,14 @@ jobs: if: contains(needs.setup.outputs.modified, 'hvd/') || contains(needs.setup.outputs.modified, 'docker.cfg') runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Remove cache run: | sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - name: Build hvd hvd-apex working-directory: docker run: | @@ -117,14 +117,14 @@ jobs: if: contains(needs.setup.outputs.modified, 'main/') || contains(needs.setup.outputs.modified, 'docker.cfg') runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Remove cache run: | sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - name: Build main base working-directory: docker run: | @@ -152,14 +152,14 @@ 
jobs: if: contains(needs.setup.outputs.modified, 'main/') || contains(needs.setup.outputs.modified, 'docker.cfg') runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Remove cache run: | sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - name: Build main apex working-directory: docker run: | @@ -187,14 +187,14 @@ jobs: if: contains(needs.setup.outputs.modified, 'msdp/') || contains(needs.setup.outputs.modified, 'docker.cfg') runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Remove cache run: | sudo rm -rf "/usr/local/share/boost" sudo rm -rf "$AGENT_TOOLSDIRECTORY" - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - name: Build msdp msdp-apex working-directory: docker run: | diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index e472dbbed1cf..fa5375c0ae29 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -14,17 +14,8 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: 3.8 - - name: Trigger Circle-CI pipeline - env: - CIRCLE_TOKEN: ${{ secrets.CIRCLE_TOKEN }} - run: | - pip install requests - - if [ $GITHUB_EVENT_NAME == 'pull_request' ]; then should_publish_docker_images=false; else should_publish_docker_images=true; fi - branch=$GITHUB_REF - - python -u .github/workflows/trigger_circle_ci.py $should_publish_docker_images $branch + python-version: "3.10" + # TODO diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index a12c4f72eef0..9fd907b5c158 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -17,10 +17,10 @@ jobs: if: (github.ref == 'refs/heads/master' && github.event_name == 'push') || 
github.event_name == 'release' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - run: sudo npm install katex -g - uses: actions/cache@v3 @@ -48,10 +48,10 @@ jobs: runs-on: ubuntu-latest timeout-minutes: 10 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - uses: actions/cache@v3 with: @@ -69,10 +69,10 @@ jobs: if: github.event_name == 'pull_request' || github.event_name == 'push' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - run: sudo npm install katex -g - uses: actions/cache@v3 diff --git a/.github/workflows/gpu-hvd-tests.yml b/.github/workflows/gpu-hvd-tests.yml new file mode 100644 index 000000000000..6661f46b501b --- /dev/null +++ b/.github/workflows/gpu-hvd-tests.yml @@ -0,0 +1,198 @@ +name: Run HVD-specific unit tests on GPUs +on: + push: + paths: + - "ignite/**" + - "tests/ignite/**" + - "tests/run_gpu_tests.sh" + - "tests/run_code_style.sh" + - "examples/**.py" + - "requirements-dev.txt" + - ".github/workflows/gpu-hvd-tests.yml" + workflow_dispatch: + +concurrency: + # -- + group: gpu-hvd-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }} + cancel-in-progress: true + +# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml + +jobs: + gpu-hvd-tests: + strategy: + matrix: + pytorch-channel: [pytorch, ] + fail-fast: false + env: + DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1" + REPOSITORY: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + runs-on: linux.8xlarge.nvidia.gpu + timeout-minutes: 60 + + steps: + - name: Clean workspace + run: | + echo "::group::Cleanup debug output" + sudo rm -rfv 
"${GITHUB_WORKSPACE}" + mkdir -p "${GITHUB_WORKSPACE}" + echo "::endgroup::" + + - name: Checkout repository (pytorch/test-infra) + uses: actions/checkout@v3 + with: + # Support the use case where we need to checkout someone's fork + repository: pytorch/test-infra + path: test-infra + + - name: Setup Linux + uses: ./test-infra/.github/actions/setup-linux + + - name: Pull docker image + uses: ./test-infra/.github/actions/pull-docker-image + with: + docker-image: ${{ env.DOCKER_IMAGE }} + + - name: Checkout repository (${{ github.repository }}) + uses: actions/checkout@v3 + with: + # Support the use case where we need to checkout someone's fork + repository: ${{ github.repository }} + ref: ${{ github.ref }} + path: ${{ github.repository }} + fetch-depth: 1 + + - name: Start Pytorch container + working-directory: ${{ github.repository }} + run: | + docker run --name pthd --gpus=all --rm \ + --cap-add=SYS_PTRACE \ + --detach \ + --ipc=host \ + --security-opt seccomp=unconfined \ + --shm-size=2g \ + --tty \ + --ulimit stack=10485760:83886080 \ + -v $PWD:/work \ + -w /work \ + ${DOCKER_IMAGE} + + script=$(cat << EOF + + set -xe + + nvidia-smi + ls -alh + + conda --version + python --version + + EOF + ) + docker exec -t pthd /bin/bash -c "${script}" + + - name: Install PyTorch and dependencies + continue-on-error: false + run: | + + script=$(cat << EOF + + set -xe + + # Install PyTorch + if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then + pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu121 + else + pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121 + fi + + python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())" + pip list + + # Install dependencies + pip install -r requirements-dev.txt + pip install -e . 
+ + EOF + ) + + docker exec -t pthd /bin/bash -c "${script}" + + - name: Install Horovod with NCCL GPU ops + run: | + script=$(cat << EOF + + set -xe + + # Can't build Horovod with recent pytorch due to pytorch required C++17 standard + # and horovod is still using C++14 + # HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch] + # Using a similar hack as described here: + # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345 + git clone --recursive https://github.com/horovod/horovod.git /horovod + cd /horovod + sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt + sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" horovod/torch/CMakeLists.txt + HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 python setup.py install + + horovodrun --check-build + pip list + + EOF + ) + + docker exec -t pthd /bin/bash -c "${script}" + + - name: Run GPU and CPU Unit HVD Tests + run: | + + script=$(cat << EOF + + set -xe + + bash tests/run_gpu_tests.sh 2 hvd + CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd + + EOF + ) + + docker exec -t pthd /bin/bash -c "${script}" + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + file: ${{ github.repository }}/coverage.xml + flags: gpu-2 + fail_ci_if_error: false + + - name: Run examples in container + continue-on-error: false + run: | + script=$(cat << EOF + + set -xe + + # Install additional example dependencies + pip install fire + + # Check training on CIFAR10, run with horovod backend using horovodrun + # initial run + CI=1 horovodrun -np 2 python -u examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --stop_iteration=500 + # resume + CI=1 horovodrun -np 2 python examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt + + #
Check training on CIFAR10 using spawn + # initial run + CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500 + # resume + CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt + + EOF + ) + + docker exec -t pthd /bin/bash -c "${script}" + + - name: Teardown Linux + if: ${{ always() }} + uses: ./test-infra/.github/actions/teardown-linux diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index d5a6e8b0bd57..92345b3baed3 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -16,95 +16,164 @@ concurrency: group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }} cancel-in-progress: true +# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml + jobs: gpu-tests: - runs-on: [self-hosted, 2-gpus] - timeout-minutes: 45 - defaults: - run: - shell: bash strategy: - max-parallel: 1 - fail-fast: true matrix: pytorch-channel: [pytorch, pytorch-nightly] + fail-fast: false env: - AGENT_TOOLSDIRECTORY: /tmp/python + DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1" + REPOSITORY: ${{ github.repository }} + PR_NUMBER: ${{ github.event.pull_request.number }} + runs-on: linux.8xlarge.nvidia.gpu + timeout-minutes: 45 steps: - - uses: actions/checkout@v3 - - - name: Clean python tool path + - name: Clean workspace run: | - rm -rf ${AGENT_TOOLSDIRECTORY} + echo "::group::Cleanup debug output" + sudo rm -rfv "${GITHUB_WORKSPACE}" + mkdir -p "${GITHUB_WORKSPACE}" + echo "::endgroup::" + + - name: Checkout repository (pytorch/test-infra) + uses: actions/checkout@v3 + with: + # Support the use case where we need to checkout someone's fork + repository: pytorch/test-infra + path: test-infra + + - name: Setup Linux + uses: 
./test-infra/.github/actions/setup-linux - - uses: actions/setup-python@v4 + - name: Pull docker image + uses: ./test-infra/.github/actions/pull-docker-image with: - python-version: 3.9 + docker-image: ${{ env.DOCKER_IMAGE }} - - name: Install PyTorch - # https://pytorch.org/get-started/locally/ - if: ${{ matrix.pytorch-channel == 'pytorch' }} + - name: Checkout repository (${{ github.repository }}) + uses: actions/checkout@v3 + with: + # Support the use case where we need to checkout someone's fork + repository: ${{ github.repository }} + ref: ${{ github.ref }} + path: ${{ github.repository }} + fetch-depth: 1 + + - name: Start Pytorch container + working-directory: ${{ github.repository }} run: | - pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117 - nvidia-smi - python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())" - pip list + docker run --name pthd --gpus=all --rm \ + --cap-add=SYS_PTRACE \ + --detach \ + --ipc=host \ + --security-opt seccomp=unconfined \ + --shm-size=2g \ + --tty \ + --ulimit stack=10485760:83886080 \ + -v $PWD:/work \ + -w /work \ + ${DOCKER_IMAGE} + + script=$(cat << EOF + + set -xe + + nvidia-smi + ls -alh + + conda --version + python --version - - name: Install PyTorch (nightly) - # https://pytorch.org/get-started/locally/ - if: ${{ matrix.pytorch-channel == 'pytorch-nightly' }} + EOF + ) + docker exec -t pthd /bin/bash -c "${script}" + + - name: Install PyTorch and dependencies + continue-on-error: false run: | - pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117 - nvidia-smi + + script=$(cat << EOF + + set -xe + + # Install PyTorch + if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then + pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu121 + else + pip install --upgrade --pre torch torchvision --index-url 
https://download.pytorch.org/whl/nightly/cu121 + fi + python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())" pip list - - name: Install dependencies - run: | + # Install dependencies pip install -r requirements-dev.txt pip install -e . - - name: Run 1 Node 2 GPUs Unit Tests + EOF + ) + + docker exec -t pthd /bin/bash -c "${script}" + + - name: Run GPU Unit Tests + continue-on-error: false run: | + + script=$(cat << EOF + + set -xe + bash tests/run_gpu_tests.sh 2 + EOF + ) + + docker exec -t pthd /bin/bash -c "${script}" + - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: - file: ./coverage.xml + file: ${{ github.repository }}/coverage.xml flags: gpu-2 fail_ci_if_error: false - - name: Install additional example dependencies - run: pip install fire - - - name: Check training on cifar10, run without backend - run: | - export example_path="examples/contrib/cifar10" - # initial run - export stop_cmd="--stop_iteration=500" - CI=1 python ${example_path}/main.py run --checkpoint_every=200 ${stop_cmd} - # resume - export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt" - CI=1 python ${example_path}/main.py run --checkpoint_every=200 --num_epochs=7 ${resume_opt} - - - name: Check training on cifar10, run with NCCL backend using torchrun + - name: Run examples in container + continue-on-error: false run: | - export example_path="examples/contrib/cifar10" - # initial run - export stop_cmd="--stop_iteration=500" - CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 ${stop_cmd} - # resume - export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt" - CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 ${resume_opt} - - - name: Check training on 
cifar10, run with NCCL backend using spawn - run: | - export example_path="examples/contrib/cifar10" - # initial run - export stop_cmd="--stop_iteration=500" - CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 ${stop_cmd} - # resume - export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt" - CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 ${resume_opt} + script=$(cat << EOF + + set -xe + + # Install additional example dependencies + pip install fire + + # Check training on cifar10, run without backend + ## initial run + CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500 + ## resume + CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt + + # Check training on cifar10, run with NCCL backend using torchrun + ## initial run + CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500 + ## resume + CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt + + # Check training on cifar10, run with NCCL backend using spawn + ## initial run + CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500 + ## resume + CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt + + EOF + ) + + docker exec -t pthd /bin/bash -c "${script}" + + - name: Teardown Linux + if: ${{ always() }} + uses: 
./test-infra/.github/actions/teardown-linux diff --git a/.github/workflows/hvd-tests.yml b/.github/workflows/hvd-tests.yml index 628ccfce3230..f483d21f38ee 100644 --- a/.github/workflows/hvd-tests.yml +++ b/.github/workflows/hvd-tests.yml @@ -32,7 +32,7 @@ jobs: pytorch-channel: [pytorch] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Get year & week number id: get-date diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml index 1dbf6df7786f..1b8672c0a47e 100644 --- a/.github/workflows/pytorch-version-tests.yml +++ b/.github/workflows/pytorch-version-tests.yml @@ -10,52 +10,48 @@ on: jobs: build: runs-on: ubuntu-latest + timeout-minutes: 45 strategy: max-parallel: 10 fail-fast: false matrix: python-version: [3.8, 3.9, "3.10"] pytorch-version: - [1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.7.1, 1.6.0, 1.5.1, 1.4.0] - exclude: - - pytorch-version: 1.4.0 - python-version: 3.9 - - pytorch-version: 1.4.0 - python-version: 3.10 - + [2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.7.1, 1.6.0, 1.5.1] + exclude: - pytorch-version: 1.5.1 python-version: 3.9 - pytorch-version: 1.5.1 - python-version: 3.10 + python-version: "3.10" - pytorch-version: 1.6.0 python-version: 3.9 - pytorch-version: 1.6.0 - python-version: 3.10 + python-version: "3.10" # disabling python 3.9 support with PyTorch 1.7.1 and 1.8.1, to stop repeated pytorch-version test fail. 
# https://github.com/pytorch/ignite/issues/2383 - pytorch-version: 1.7.1 python-version: 3.9 - pytorch-version: 1.7.1 - python-version: 3.10 + python-version: "3.10" - pytorch-version: 1.8.1 python-version: 3.9 - pytorch-version: 1.8.1 - python-version: 3.10 + python-version: "3.10" - pytorch-version: 1.9.1 - python-version: 3.10 + python-version: "3.10" - pytorch-version: 1.10.0 - python-version: 3.10 + python-version: "3.10" - pytorch-version: 1.11.0 - python-version: 3.10 + python-version: "3.10" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Get year & week number id: get-date @@ -87,22 +83,18 @@ jobs: - name: Install dependencies shell: bash -l {0} - if: ${{ matrix.pytorch-version != '1.4.0' }} run: | conda install pytorch=${{ matrix.pytorch-version }} torchvision cpuonly python=${{ matrix.python-version }} -c pytorch pip install -r requirements-dev.txt python setup.py install - # There is no more torchvision 0.5.0 binaries in anaconda pytorch channel: - # https://anaconda.org/pytorch/torchvision/files - - name: Install appropriate dependencies for PyTorch 1.4.0 - shell: bash -l {0} - if: ${{ matrix.pytorch-version == '1.4.0' }} - run: | - conda install pytorch=${{ matrix.pytorch-version }} cpuonly python=${{ matrix.python-version }} -c pytorch - pip install torchvision==0.5.0 - pip install -r requirements-dev.txt - python setup.py install + # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern + # which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59 + bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])") + if [ "${bad_pth_version}" == "True" ]; then + pip install --upgrade "setuptools<59" + python -c "from setuptools import distutils; distutils.version.LooseVersion" + fi - name: Download MNIST uses: pytorch-ignite/download-mnist-github-action@master @@ -120,7 
+112,7 @@ jobs: needs: build if: always() && needs.build.result == 'failure' steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: JasonEtco/create-an-issue@v2 name: Create issue if pytorch version tests failed with: diff --git a/.github/workflows/stable-release-anaconda.yml b/.github/workflows/stable-release-anaconda.yml index a3299acf9ca4..817c7f59cca1 100644 --- a/.github/workflows/stable-release-anaconda.yml +++ b/.github/workflows/stable-release-anaconda.yml @@ -8,13 +8,13 @@ jobs: conda-build-publish: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2 with: miniconda-version: "latest" - python-version: 3.8 + python-version: "3.10" - name: Install dependencies shell: bash -l {0} diff --git a/.github/workflows/stable-release-pypi.yml b/.github/workflows/stable-release-pypi.yml index 974d4fac6cc8..6f37eca6bde5 100644 --- a/.github/workflows/stable-release-pypi.yml +++ b/.github/workflows/stable-release-pypi.yml @@ -8,13 +8,13 @@ jobs: build-publish: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Miniconda uses: conda-incubator/setup-miniconda@v2 with: miniconda-version: "latest" - python-version: 3.8 + python-version: "3.10" - name: Install dependencies shell: bash -l {0} @@ -31,23 +31,13 @@ jobs: twine check dist/* TWINE_USERNAME="${{ secrets.PYPI_USER }}" TWINE_PASSWORD="${{ secrets.PYPI_TOKEN }}" twine upload --verbose dist/* - docker-build-publish: - name: Trigger Build and Push Docker images to Docker Hub - needs: build-publish - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - name: Trigger Circle-CI pipeline - env: - CIRCLE_TOKEN: ${{ secrets.CIRCLE_TOKEN }} - run: | - pip install requests - - if [ $GITHUB_EVENT_NAME == 'pull_request' ]; then should_publish_docker_images=false; else 
should_publish_docker_images=true; fi - branch=$GITHUB_REF + # docker-build-publish: + # name: Trigger Build and Push Docker images to Docker Hub + # needs: build-publish + # runs-on: ubuntu-latest - python -u .github/workflows/trigger_circle_ci.py $should_publish_docker_images $branch + # steps: + # - uses: actions/checkout@v4 + # - uses: actions/setup-python@v4 + # with: + # python-version: "3.10" diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 221f7f191cf0..6eb9397a772d 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -31,7 +31,7 @@ jobs: xla-version: [nightly] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python 3.8 uses: actions/setup-python@v4 with: diff --git a/.github/workflows/trigger_circle_ci.py b/.github/workflows/trigger_circle_ci.py deleted file mode 100644 index ff8ce3dddd86..000000000000 --- a/.github/workflows/trigger_circle_ci.py +++ /dev/null @@ -1,122 +0,0 @@ -import json -import os -import sys -import time - -import requests - - -def assert_result(result, expected_code): - if result.status_code != expected_code: - raise RuntimeError(f"{result.url}, {result.status_code}: {result.text}") - - -def get_output(result_text, required_keys): - output = json.loads(result_text) - - if not all([v in output for v in required_keys]): - raise RuntimeError(f"Output does not contain required fields: {required_keys}\n" f"Output is: {output}") - return output - - -def trigger_new_pipeline(data, headers): - result = requests.post( - "https://circleci.com/api/v2/project/gh/pytorch/ignite/pipeline", data=json.dumps(data), headers=headers - ) - assert_result(result, 201) - output = get_output(result.text, ["id"]) - return output["id"] - - -def assert_pipeline_created(pipeline_id, headers): - while True: - result = requests.get(f"https://circleci.com/api/v2/pipeline/{pipeline_id}", headers=headers) - assert_result(result, 200) - output = 
get_output(result.text, ["state", "errors"]) - - if output["state"] == "errored": - raise RuntimeError(f"Pipeline is errored: {output['errors']}") - if output["state"] == "created": - break - time.sleep(2) - - -def get_workflow_id(pipeline_id, headers): - - while True: - result = requests.get(f"https://circleci.com/api/v2/pipeline/{pipeline_id}/workflow", headers=headers) - assert_result(result, 200) - output = get_output(result.text, ["items"]) - items = output["items"] - if len(items) > 1: - raise RuntimeError(f"Incorrect number of workflow ids: {len(items)} != 1\n" f"items: {items}") - if len(items) < 1: - continue - item_0 = items[0] - if "id" not in item_0: - raise RuntimeError("Workflow info does not contain 'id'\n" f"Info: {item_0}") - return item_0["id"] - - -def assert_workflows_successful(pipeline_id, headers): - - workflow_id = get_workflow_id(pipeline_id, headers) - - base_url = "https://app.circleci.com/pipelines/github/pytorch/ignite" - url = None - - while True: - result = requests.get(f"https://circleci.com/api/v2/workflow/{workflow_id}", headers=headers) - assert_result(result, 200) - output = get_output(result.text, ["name", "status", "pipeline_number"]) - - if url is None: - url = f"{base_url}/{output['pipeline_number']}/workflows/{workflow_id}" - print(f"Circle CI workflow: {url}") - - if output["status"] in ["error", "failing", "canceled", "not_run", "failed"]: - raise RuntimeError(f"Workflow failed: {output['status']}\n" f"See {url}") - if output["status"] == "success": - print("\nWorkflow successful") - break - time.sleep(30) - print(".", end=" ") - - -if __name__ == "__main__": - - print("Trigger new pipeline on Circle-CI") - - if "CIRCLE_TOKEN" not in os.environ: - raise RuntimeError( - "Can not find CIRCLE_TOKEN env variable.\nPlease, export CIRCLE_TOKEN= before calling this script." - "This token should be a user token and not the project token." 
- ) - # https://discuss.circleci.com/t/triggering-pipeline-via-v2-api-fails-with-404-project-not-found/39342/2 - - argv = sys.argv - if len(argv) != 3: - raise RuntimeError("Usage: python trigger_circle_ci.py ") - - should_publish_docker_images = json.loads(argv[1]) - branch = argv[2] - - print(f"- should_publish_docker_images: {should_publish_docker_images}") - print(f"- Branch: {branch}") - if branch.startswith("refs/pull") and branch.endswith("/merge"): - branch = branch.replace("/merge", "/head") - print(f"Replaced /merge -> /head : {branch}") - - headers = {"authorization": "Basic", "content-type": "application/json", "Circle-Token": os.environ["CIRCLE_TOKEN"]} - - data = { - "branch": branch, - "parameters": { - "should_build_docker_images": True, - "should_publish_docker_images": should_publish_docker_images, - }, - } - - unique_pipeline_id = trigger_new_pipeline(data, headers) - assert_pipeline_created(unique_pipeline_id, headers) - assert_workflows_successful(unique_pipeline_id, headers) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 2c409f7227a4..23ac6b42c9c8 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -55,7 +55,7 @@ jobs: skip-distrib-tests: 1 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 06a6073ad3d2..36bed32e9326 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,15 +15,15 @@ repos: exclude_types: ["python", "jupyter", "shell", "gitignore"] - repo: https://github.com/omnilib/ufmt - rev: v1.3.1 + rev: v2.2.0 hooks: - id: ufmt additional_dependencies: - - black == 21.12b0 - - usort == 1.0.1 + - black == 23.9.1 + - usort == 1.0.7 - repo: https://github.com/pycqa/flake8 - rev: 6.0.0 + rev: 6.1.0 hooks: - id: flake8 args: ["--config", "setup.cfg"] diff --git a/CONTRIBUTING.md 
b/CONTRIBUTING.md index 6d4bc78a5770..fd41e6abf86d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -57,7 +57,7 @@ into the following categories:
-- Install [miniconda](https://docs.conda.io/projects/continuumio-conda/en/latest/user-guide/install/index.html) for your system. +- Install [miniconda](https://docs.conda.io/en/latest/miniconda.html) for your system. - Create an isolated conda environment for pytorch-ignite: ```bash @@ -244,13 +244,12 @@ If you are not familiar with creating a Pull Request, here are some guides: **NOTE : When sending a PR, please kindly check if the changes are required to run in the CI.** For example, typo changes in `CONTRIBUTING.md`, `README.md` are not required to run in the CI. -So, please add `[skip ci]` in the PR title to save the resources. Ignite has setup 3 CIs. +So, please add `[skip ci]` in the PR title to save the resources. Ignite has setup several CIs. - GitHub Actions -- CircleCI - Netlify -CircleCI is disabled on forked PR. So, please add +So, please add - `[skip actions]` for the changes which are not required to run on GitHub Actions, - `[skip netlify]` for the changes which are not required to run on Netlify PR Preview build, or diff --git a/README.md b/README.md index ea791b225e7c..58c95cc8d092 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,11 @@ -| ![image](https://img.shields.io/badge/-Tests:-black?style=flat-square) [![image](https://github.com/pytorch/ignite/workflows/Run%20unit%20tests/badge.svg)](https://github.com/pytorch/ignite/actions) [![image](https://img.shields.io/badge/-GPU%20tests-black?style=flat-square)](https://app.circleci.com/pipelines/github/pytorch/ignite?branch=master)[![image](https://circleci.com/gh/pytorch/ignite.svg?style=svg)](https://app.circleci.com/pipelines/github/pytorch/ignite?branch=master) [![image](https://codecov.io/gh/pytorch/ignite/branch/master/graph/badge.svg)](https://codecov.io/gh/pytorch/ignite) [![image](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/ignite/index.html) | 
+| ![image](https://img.shields.io/badge/-Tests:-black?style=flat-square) [![image](https://github.com/pytorch/ignite/actions/workflows/unit-tests.yml/badge.svg?branch=master)](https://github.com/pytorch/ignite/actions/workflows/unit-tests.yml) [![image](https://github.com/pytorch/ignite/actions/workflows/gpu-tests.yml/badge.svg)](https://github.com/pytorch/ignite/actions/workflows/gpu-tests.yml) [![image](https://codecov.io/gh/pytorch/ignite/branch/master/graph/badge.svg)](https://codecov.io/gh/pytorch/ignite) [![image](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/ignite/index.html) | |:--- -| ![image](https://img.shields.io/badge/-Stable%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch/ignite/badges/version.svg)](https://anaconda.org/pytorch/ignite) [![image](https://anaconda.org/pytorch/ignite/badges/downloads.svg)](https://anaconda.org/pytorch/ignite) [![image](https://img.shields.io/badge/dynamic/json.svg?label=PyPI&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pypi.org/project/pytorch-ignite/) [![image](https://pepy.tech/badge/pytorch-ignite)](https://pepy.tech/project/pytorch-ignite) | +| ![image](https://img.shields.io/badge/-Stable%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch/ignite/badges/version.svg)](https://anaconda.org/pytorch/ignite) ・ [![image](https://img.shields.io/badge/dynamic/json.svg?label=PyPI&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pypi.org/project/pytorch-ignite/) [![image](https://static.pepy.tech/badge/pytorch-ignite)](https://pepy.tech/project/pytorch-ignite) ・ [![image](https://img.shields.io/badge/docker-hub-blue)](https://hub.docker.com/u/pytorchignite) | | 
![image](https://img.shields.io/badge/-Nightly%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch-nightly/ignite/badges/version.svg)](https://anaconda.org/pytorch-nightly/ignite) [![image](https://img.shields.io/badge/PyPI-pre%20releases-brightgreen)](https://pypi.org/project/pytorch-ignite/#history)| -| ![image](https://img.shields.io/badge/-Features:-black?style=flat-square) [![image](https://img.shields.io/badge/docker-hub-blue)](https://hub.docker.com/u/pytorchignite) [![image](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org) [![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) | -| ![image](https://img.shields.io/badge/-Community:-black?style=flat-square) [![Twitter](https://img.shields.io/badge/news-twitter-blue)](https://twitter.com/pytorch_ignite) [![facebook](https://img.shields.io/badge/news-facebook-blue)](https://www.facebook.com/PyTorch-Ignite-Community-105837321694508) [![numfocus](https://img.shields.io/badge/NumFOCUS-affiliated%20project-green)](https://numfocus.org/sponsored-projects/affiliated-projects) [![discord](https://img.shields.io/badge/chat-discord-blue?logo=discord)](https://discord.gg/djZtm3EmKj) | +| ![image](https://img.shields.io/badge/-Community:-black?style=flat-square) [![Twitter](https://img.shields.io/badge/news-twitter-blue)](https://twitter.com/pytorch_ignite) [![discord](https://img.shields.io/badge/chat-discord-blue?logo=discord)](https://discord.gg/djZtm3EmKj) [![numfocus](https://img.shields.io/badge/NumFOCUS-affiliated%20project-green)](https://numfocus.org/sponsored-projects/affiliated-projects) | | ![image](https://img.shields.io/badge/-Supported_PyTorch/Python_versions:-black?style=flat-square) [![link](https://img.shields.io/badge/-check_here-blue)](https://github.com/pytorch/ignite/actions?query=workflow%3A%22PyTorch+version+tests%22)| @@ -346,8 +345,8 @@ For more details, see [here](docker). 
Few pointers to get you started: -- [Quick Start Guide: Essentials of getting a project up and running](https://pytorch.org/ignite/quickstart.html) -- [Concepts of the library: Engine, Events & Handlers, State, Metrics](https://pytorch.org/ignite/concepts.html) +- [Quick Start Guide: Essentials of getting a project up and running](https://pytorch-ignite.ai/tutorials/beginner/01-getting-started/) +- [Concepts of the library: Engine, Events & Handlers, State, Metrics](https://pytorch-ignite.ai/concepts/) - Full-featured template examples (coming soon) @@ -398,7 +397,7 @@ Few pointers to get you started: torch.cuda.amp vs nvidia/apex](https://github.com/pytorch/ignite/blob/master/examples/notebooks/Cifar100_bench_amp.ipynb) - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/ignite/blob/master/examples/notebooks/MNIST_on_TPU.ipynb) [MNIST training on a single TPU](https://github.com/pytorch/ignite/blob/master/examples/notebooks/MNIST_on_TPU.ipynb) -- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1E9zJrptnLJ_PKhmaP5Vhb6DTVRvyrKHx) [CIFAR10 Training on multiple TPUs](https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10) +- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1E9zJrptnLJ_PKhmaP5Vhb6DTVRvyrKHx) [CIFAR10 Training on multiple TPUs](https://github.com/pytorch/ignite/tree/master/examples/cifar10) - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/ignite/blob/master/examples/notebooks/HandlersTimeProfiler_MNIST.ipynb) [Basic example of handlers time profiling on MNIST training example](https://github.com/pytorch/ignite/blob/master/examples/notebooks/HandlersTimeProfiler_MNIST.ipynb) diff --git a/assets/tldr/teaser.ipynb b/assets/tldr/teaser.ipynb index 
f96ba8311b41..a4ac1bfed6db 100644 --- a/assets/tldr/teaser.ipynb +++ b/assets/tldr/teaser.ipynb @@ -454,7 +454,7 @@ "source": [ "### Other links\n", "\n", - "- Full featured CIFAR10 example: https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10\n" + "- Full featured CIFAR10 example: https://github.com/pytorch/ignite/tree/master/examples/cifar10\n" ] }, { diff --git a/assets/tldr/teaser.py b/assets/tldr/teaser.py index 8d8b59d45ee2..671f22c81af3 100644 --- a/assets/tldr/teaser.py +++ b/assets/tldr/teaser.py @@ -220,4 +220,4 @@ def evaluate_model(): # Full featured CIFAR10 example: -# https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10 +# https://github.com/pytorch/ignite/tree/master/examples/cifar10 diff --git a/docker/docker.cfg b/docker/docker.cfg index a5fcf2f5127a..1c98e0b6aee6 100644 --- a/docker/docker.cfg +++ b/docker/docker.cfg @@ -1,4 +1,4 @@ [DEFAULT] -build_docker_image_pytorch_version = 1.13.1-cuda11.6-cudnn8 +build_docker_image_pytorch_version = 2.0.0-cuda11.7-cudnn8 build_docker_image_hvd_version = v0.27.0 build_docker_image_msdp_version = v0.8.1 diff --git a/docker/test_image.py b/docker/test_image.py index 2be554079794..88033742f37a 100644 --- a/docker/test_image.py +++ b/docker/test_image.py @@ -21,7 +21,6 @@ def run_python_cmd(cmd): - try_except_cmd = f""" import warnings warnings.filterwarnings("ignore") @@ -65,7 +64,6 @@ def main(): if __name__ == "__main__": - parser = argparse.ArgumentParser("Check docker image script") parser.add_argument("image", type=str, help="Docker image to check") args = parser.parse_args() diff --git a/docs/Makefile b/docs/Makefile index eedf03332d70..3d1f9ada6a8b 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -13,7 +13,7 @@ help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) docset: html - doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url http://pytorch.org/ignite/ --force 
$(BUILDDIR)/html/ + doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url https://pytorch.org/ignite/ --force $(BUILDDIR)/html/ # Manually fix because Zeal doesn't deal well with `icon.png`-only at 2x resolution. cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png diff --git a/docs/requirements.txt b/docs/requirements.txt index 874e5773ba39..9a88587a0ee2 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx==5.0.0 +sphinx<6 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinxcontrib-katex sphinx-copybutton==0.4.0 diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html index 3378ba793509..750930d09fdc 100644 --- a/docs/source/_templates/layout.html +++ b/docs/source/_templates/layout.html @@ -516,7 +516,7 @@

Resources

}); }) - + diff --git a/docs/source/conf.py b/docs/source/conf.py index 74ae5fba71e8..2256d425becf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -144,7 +144,7 @@ # 'https://fonts.googleapis.com/css?family=Lato', # '_static/css/pytorch_theme.css' "_static/css/ignite_theme.css", - "https://cdn.jsdelivr.net/npm/@docsearch/css@3.3.0/dist/style.min.css", + "https://cdn.jsdelivr.net/npm/@docsearch/css@3", ], } @@ -346,6 +346,12 @@ def run(self): ("py:class", "torch.utils.data.dataloader.DataLoader"), ] +linkcheck_ignore = [ + "https://github.com/fossasia/visdom#visdom-arguments-python-only", + "https://github.com/pytorch/ignite/tree/master/examples/cifar10#check-resume-training", + "https://github.com/pytorch/ignite/tree/master/examples/mnist#training-save--resume", +] + def setup(app): app.add_directive("autosummary", AutolistAutosummary, override=True) diff --git a/docs/source/contrib/handlers.rst b/docs/source/contrib/handlers.rst index 275b5f271093..1635b7d5bb99 100644 --- a/docs/source/contrib/handlers.rst +++ b/docs/source/contrib/handlers.rst @@ -50,15 +50,15 @@ Loggers Below are a comprehensive list of examples of various loggers. - * See `tensorboardX mnist example `_ + * See `tensorboardX mnist example `_ and `CycleGAN and EfficientNet notebooks `_ for detailed usage. - * See `visdom mnist example `_ for detailed usage. + * See `visdom mnist example `_ for detailed usage. - * See `neptune mnist example `_ for detailed usage. + * See `neptune mnist example `_ for detailed usage. - * See `tqdm mnist example `_ for detailed usage. + * See `tqdm mnist example `_ for detailed usage. - * See `wandb mnist example `_ for detailed usage. + * See `wandb mnist example `_ for detailed usage. - * See `clearml mnist example `_ for detailed usage. + * See `clearml mnist example `_ for detailed usage. 
diff --git a/docs/source/contrib/metrics.rst b/docs/source/contrib/metrics.rst index a4264131b6bd..eccaf9e78084 100644 --- a/docs/source/contrib/metrics.rst +++ b/docs/source/contrib/metrics.rst @@ -29,7 +29,7 @@ Module :mod:`ignite.contrib.metrics.regression` provides implementations of metrics useful for regression tasks. Definitions of metrics are based on `Botchkarev 2018`_, page 30 "Appendix 2. Metrics mathematical definitions". .. _`Botchkarev 2018`: - https://arxiv.org/ftp/arxiv/papers/1809/1809.03006.pdf + https://arxiv.org/abs/1809.03006 Complete list of metrics: diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index a72a33caa373..43089924318c 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -56,7 +56,7 @@ in the code. For more details, please, see :class:`~ignite.distributed.launcher. :meth:`~ignite.distributed.auto.auto_dataloader`. Complete example of CIFAR10 training can be found -`here `_. +`here `_. .. _torch.distributed.launch: https://pytorch.org/docs/stable/distributed.html#launch-utility diff --git a/docs/source/engine.rst b/docs/source/engine.rst index 48fa9cc576c4..6b865d1849c4 100644 --- a/docs/source/engine.rst +++ b/docs/source/engine.rst @@ -69,7 +69,7 @@ Resuming the training It is possible to resume the training from a checkpoint and approximately reproduce original run's behaviour. Using Ignite, this can be easily done using :class:`~ignite.handlers.checkpoint.Checkpoint` handler. Engine provides two methods to serialize and deserialize its internal state :meth:`~ignite.engine.engine.Engine.state_dict` and -:meth:`~ignite.engine.engine.Engine.load_state_dict`. In addition to serializing model, optimizer, lr scheduler etc user can +:meth:`~ignite.engine.engine.Engine.load_state_dict`. In addition to serializing model, optimizer, lr scheduler, metrics, etc., user can store the trainer and then resume the training. For example: .. 
code-block:: python @@ -82,8 +82,9 @@ store the trainer and then resume the training. For example: optimizer = ... lr_scheduler = ... data_loader = ... + metric = ... - to_save = {'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler} + to_save = {'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler, 'metric': metric} handler = Checkpoint(to_save, DiskSaver('/tmp/training', create_dir=True)) trainer.add_event_handler(Events.EPOCH_COMPLETED, handler) trainer.run(data_loader, max_epochs=100) @@ -104,8 +105,9 @@ We can then restore the training from the last checkpoint. optimizer = ... lr_scheduler = ... data_loader = ... + metric = ... - to_load = {'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler} + to_load = {'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler, 'metric': metric} checkpoint = torch.load(checkpoint_file) Checkpoint.load_objects(to_load=to_load, checkpoint=checkpoint) @@ -117,8 +119,8 @@ from iteration. Complete examples that resumes the training from a checkpoint can be found here: -- `save/resume MNIST `_ -- `save/resume Distributed CIFAR10 `_ +- `save/resume MNIST `_ +- `save/resume Distributed CIFAR10 `_ Deterministic training ---------------------- @@ -213,8 +215,8 @@ We can see that the data samples are exactly the same between original and resum Complete examples that simulates a crash on a defined iteration and resumes the training from a checkpoint can be found here: -- `save/resume MNIST `_ -- `save/resume Distributed CIFAR10 `_ +- `save/resume MNIST `_ +- `save/resume Distributed CIFAR10 `_ .. 
Note :: diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index afc477f457e1..bd5038f08140 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -290,7 +290,10 @@ Complete list of usages - :class:`~ignite.metrics.metric.MetricUsage` - :class:`~ignite.metrics.metric.EpochWise` + - :class:`~ignite.metrics.metric.RunningEpochWise` - :class:`~ignite.metrics.metric.BatchWise` + - :class:`~ignite.metrics.metric.RunningBatchWise` + - :class:`~ignite.metrics.metric.SingleEpochRunningBatchWise` - :class:`~ignite.metrics.metric.BatchFiltered` Metrics and distributed computations @@ -359,10 +362,22 @@ EpochWise ~~~~~~~~~ .. autoclass:: ignite.metrics.metric.EpochWise +RunningEpochWise +~~~~~~~~~~~~~~~~ +.. autoclass:: ignite.metrics.metric.RunningEpochWise + BatchWise ~~~~~~~~~ .. autoclass:: ignite.metrics.metric.BatchWise +RunningBatchWise +~~~~~~~~~~~~~~~~ +.. autoclass:: ignite.metrics.metric.RunningBatchWise + +SingleEpochRunningBatchWise +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autoclass:: ignite.metrics.metric.SingleEpochRunningBatchWise + BatchFiltered ~~~~~~~~~~~~~ .. 
autoclass:: ignite.metrics.metric.BatchFiltered diff --git a/examples/contrib/cifar10/.gitignore b/examples/cifar10/.gitignore similarity index 100% rename from examples/contrib/cifar10/.gitignore rename to examples/cifar10/.gitignore diff --git a/examples/contrib/cifar10/README.md b/examples/cifar10/README.md similarity index 100% rename from examples/contrib/cifar10/README.md rename to examples/cifar10/README.md diff --git a/examples/contrib/cifar10/main.py b/examples/cifar10/main.py similarity index 95% rename from examples/contrib/cifar10/main.py rename to examples/cifar10/main.py index e62d1b41d018..5696f37aa7cf 100644 --- a/examples/contrib/cifar10/main.py +++ b/examples/cifar10/main.py @@ -20,7 +20,6 @@ def training(local_rank, config): - rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() @@ -47,11 +46,7 @@ def training(local_rank, config): config["cuda device name"] = torch.cuda.get_device_name(local_rank) if config["with_clearml"]: - try: - from clearml import Task - except ImportError: - # Backwards-compatibility for legacy Trains SDK - from trains import Task + from clearml import Task task = Task.init("CIFAR10-Training", task_name=output_path.stem) task.connect_configuration(config) @@ -205,22 +200,13 @@ def run( raise RuntimeError("The value of with_amp should be False if backend is xla") with idist.Parallel(backend=backend, **spawn_kwargs) as parallel: - parallel.run(training, config) def get_dataflow(config): # - Get train/test datasets - if idist.get_local_rank() > 0: - # Ensure that only local rank 0 download the dataset - # Thus each node will download a copy of the dataset - idist.barrier() - - train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"]) - - if idist.get_local_rank() == 0: - # Ensure that only local rank 0 download the dataset - idist.barrier() + with idist.one_rank_first(local=True): + train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"]) # Setup data 
loader also adapted to distributed config: nccl, gloo, xla-tpu train_loader = idist.auto_dataloader( @@ -291,7 +277,6 @@ def log_basic_info(logger, config): def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, config, logger): - device = idist.device() # Setup Ignite trainer: @@ -307,7 +292,6 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con scaler = GradScaler(enabled=with_amp) def train_step(engine, batch): - x, y = batch[0], batch[1] if x.device != device: diff --git a/examples/contrib/cifar10/requirements.txt b/examples/cifar10/requirements.txt similarity index 100% rename from examples/contrib/cifar10/requirements.txt rename to examples/cifar10/requirements.txt diff --git a/examples/contrib/cifar10/utils.py b/examples/cifar10/utils.py similarity index 100% rename from examples/contrib/cifar10/utils.py rename to examples/cifar10/utils.py diff --git a/examples/contrib/cifar100_amp_benchmark/benchmark_fp32.py b/examples/cifar100_amp_benchmark/benchmark_fp32.py similarity index 100% rename from examples/contrib/cifar100_amp_benchmark/benchmark_fp32.py rename to examples/cifar100_amp_benchmark/benchmark_fp32.py diff --git a/examples/contrib/cifar100_amp_benchmark/benchmark_nvidia_apex.py b/examples/cifar100_amp_benchmark/benchmark_nvidia_apex.py similarity index 100% rename from examples/contrib/cifar100_amp_benchmark/benchmark_nvidia_apex.py rename to examples/cifar100_amp_benchmark/benchmark_nvidia_apex.py diff --git a/examples/contrib/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py similarity index 100% rename from examples/contrib/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py rename to examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py diff --git a/examples/contrib/cifar100_amp_benchmark/utils.py b/examples/cifar100_amp_benchmark/utils.py similarity index 100% rename from examples/contrib/cifar100_amp_benchmark/utils.py rename 
to examples/cifar100_amp_benchmark/utils.py diff --git a/examples/contrib/cifar10_qat/.gitignore b/examples/cifar10_qat/.gitignore similarity index 100% rename from examples/contrib/cifar10_qat/.gitignore rename to examples/cifar10_qat/.gitignore diff --git a/examples/contrib/cifar10_qat/README.md b/examples/cifar10_qat/README.md similarity index 100% rename from examples/contrib/cifar10_qat/README.md rename to examples/cifar10_qat/README.md diff --git a/examples/contrib/cifar10_qat/main.py b/examples/cifar10_qat/main.py similarity index 95% rename from examples/contrib/cifar10_qat/main.py rename to examples/cifar10_qat/main.py index 98b0bb10850a..f70567969525 100644 --- a/examples/contrib/cifar10_qat/main.py +++ b/examples/cifar10_qat/main.py @@ -19,7 +19,6 @@ def training(local_rank, config): - rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() @@ -43,11 +42,7 @@ def training(local_rank, config): config["cuda device name"] = torch.cuda.get_device_name(local_rank) if config["with_clearml"]: - try: - from clearml import Task - except ImportError: - # Backwards-compatibility for legacy Trains SDK - from trains import Task + from clearml import Task task = Task.init("CIFAR10-Training", task_name=output_path.stem) task.connect_configuration(config) @@ -189,22 +184,13 @@ def run( spawn_kwargs["nproc_per_node"] = nproc_per_node with idist.Parallel(backend=backend, **spawn_kwargs) as parallel: - parallel.run(training, config) def get_dataflow(config): # - Get train/test datasets - if idist.get_local_rank() > 0: - # Ensure that only local rank 0 download the dataset - # Thus each node will download a copy of the dataset - idist.barrier() - - train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"]) - - if idist.get_local_rank() == 0: - # Ensure that only local rank 0 download the dataset - idist.barrier() + with idist.one_rank_first(local=True): + train_dataset, test_dataset = 
utils.get_train_test_datasets(config["data_path"]) # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu train_loader = idist.auto_dataloader( @@ -275,7 +261,6 @@ def log_basic_info(logger, config): def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, config, logger): - device = idist.device() # Setup Ignite trainer: @@ -291,7 +276,6 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con scaler = GradScaler(enabled=with_amp) def train_step(engine, batch): - x, y = batch[0], batch[1] if x.device != device: diff --git a/examples/contrib/cifar10_qat/pact.py b/examples/cifar10_qat/pact.py similarity index 100% rename from examples/contrib/cifar10_qat/pact.py rename to examples/cifar10_qat/pact.py diff --git a/examples/contrib/cifar10_qat/utils.py b/examples/cifar10_qat/utils.py similarity index 100% rename from examples/contrib/cifar10_qat/utils.py rename to examples/cifar10_qat/utils.py diff --git a/examples/contrib/mnist/README.md b/examples/contrib/mnist/README.md deleted file mode 100644 index 5d5955d27a57..000000000000 --- a/examples/contrib/mnist/README.md +++ /dev/null @@ -1,67 +0,0 @@ -# Basic MNIST Example with Ignite and `ignite.contrib` module - -ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/mnist) - -Basic neural network training with Ignite and various built-in loggers from `ignite.contrib`: - -- TQDM progress bar -- Tensorboard -- Visdom - -### Usage: - -#### Requirements: - -- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision` -- [tqdm](https://github.com/tqdm/tqdm/): `pip install tqdm` - -#### Logging with TQDM progress bar - -Run the example: - -``` -python mnist_with_tqdm_logger.py -``` - -### Logging with TensorboardX or `torch.utils.tensorboard` - -Example with training and validation monitoring using Tensorboard. 
- -#### Requirements: - -- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision` -- Optionally [TensorboardX](https://github.com/lanpa/tensorboard-pytorch): `pip install tensorboardX` -- Tensorboard: `pip install tensorboard` - -Optionally, user can install `pynvml` package on Python 3 and log GPU information: used memory, utilization. - -#### Usage: - -Run the example: - -```bash -python mnist_with_tensorboard_logger.py --log_dir=/tmp/tensorboard_logs -``` - -Start tensorboard: - -```bash -tensorboard --logdir=/tmp/tensorboard_logs/ -``` - -### Logging with Visdom - -Example with training and validation monitoring using Visdom - -#### Requirements: - -- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision` -- [Visdom](https://github.com/facebookresearch/visdom): `pip install visdom` - -#### Usage: - -Run the example: - -```bash -python mnist_with_visdom_logger.py -``` diff --git a/examples/fast_neural_style/neural_style.py b/examples/fast_neural_style/neural_style.py index 5aa072171e95..9af5f1888ef5 100644 --- a/examples/fast_neural_style/neural_style.py +++ b/examples/fast_neural_style/neural_style.py @@ -78,7 +78,6 @@ def train(args): running_avgs = OrderedDict() def step(engine, batch): - x, _ = batch x = x.to(device) diff --git a/examples/fast_neural_style/utils.py b/examples/fast_neural_style/utils.py index e992fcc4fade..b3671d55f6c7 100644 --- a/examples/fast_neural_style/utils.py +++ b/examples/fast_neural_style/utils.py @@ -4,9 +4,9 @@ def load_image(filename, size=None, scale=None): img = Image.open(filename) if size is not None: - img = img.resize((size, size), Image.ANTIALIAS) + img = img.resize((size, size), Image.LANCZOS) elif scale is not None: - img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), Image.ANTIALIAS) + img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), Image.LANCZOS) return img diff --git a/examples/fast_neural_style/vgg.py 
b/examples/fast_neural_style/vgg.py index 2e9ad1f2d135..05950246f0ff 100644 --- a/examples/fast_neural_style/vgg.py +++ b/examples/fast_neural_style/vgg.py @@ -2,12 +2,13 @@ import torch from torchvision import models +from torchvision.models.vgg import VGG16_Weights class Vgg16(torch.nn.Module): def __init__(self, requires_grad=False): super(Vgg16, self).__init__() - vgg_pretrained_features = models.vgg16(pretrained=True).features + vgg_pretrained_features = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1).features self.slice1 = torch.nn.Sequential() self.slice2 = torch.nn.Sequential() self.slice3 = torch.nn.Sequential() diff --git a/examples/gan/dcgan.py b/examples/gan/dcgan.py index cbf804c7c97c..ce9d3d325160 100644 --- a/examples/gan/dcgan.py +++ b/examples/gan/dcgan.py @@ -207,7 +207,6 @@ def main( alpha, output_dir, ): - # seed check_manual_seed(seed) @@ -243,7 +242,6 @@ def get_noise(): # The main function, processing a batch of examples def step(engine, batch): - # unpack the batch. It comes from a dataset, so we have pairs. Discard labels. real, _ = batch real = real.to(device) diff --git a/examples/mnist/README.md b/examples/mnist/README.md index cac50fc14195..3523cd37615a 100644 --- a/examples/mnist/README.md +++ b/examples/mnist/README.md @@ -2,7 +2,7 @@ ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/mnist) -#### Requirements: +#### Minimal requirements: - [torchvision](https://github.com/pytorch/vision/): `pip install torchvision` - [tqdm](https://github.com/tqdm/tqdm/): `pip install tqdm` @@ -11,22 +11,25 @@ ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/m Run the example: -``` +```bash python mnist.py ``` +Same example with logging using TQDM progress bar + + +```bash +python mnist_with_tqdm_logger.py +``` + ### Logging with Tensorboard -MNIST example with training and validation monitoring using Tensorboard. 
Notice -that if PyTorch version is less than 1.2, the module TensorboardX is required. +MNIST example with training and validation monitoring using Tensorboard -#### Requirements: +#### Additional requirements: -- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision` -- [TensorboardX](https://github.com/lanpa/tensorboard-pytorch) (if and only if `PyTorch <= 1.2`): `pip install tensorboardX` - Tensorboard: `pip install tensorboard` -#### Usage: Run the example: @@ -44,9 +47,8 @@ tensorboard --logdir=/tmp/tensorboard_logs/ MNIST example with training and validation monitoring using Visdom -#### Requirements: +#### Additional requirements: -- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision` - [Visdom](https://github.com/facebookresearch/visdom): `pip install visdom` #### Usage: @@ -63,6 +65,18 @@ Run the example: python mnist_with_visdom.py ``` +### Logging with ClearML + +#### Additional requirements: + +- [ClearML python client](https://clear.ml/docs/latest/docs/): `pip install clearml` + +#### Usage: + +```bash +python mnist_with_clearml_logger.py +``` + ### Training save & resume Example shows how to save a checkpoint of the trainer, model, optimizer, lr scheduler. 
diff --git a/examples/mnist/mnist_save_resume_engine.py b/examples/mnist/mnist_save_resume_engine.py index 2fd61cb0c113..4bdacec24f72 100644 --- a/examples/mnist/mnist_save_resume_engine.py +++ b/examples/mnist/mnist_save_resume_engine.py @@ -13,7 +13,7 @@ from ignite.engine import create_supervised_evaluator, create_supervised_trainer, Events from ignite.handlers import Checkpoint, DiskSaver -from ignite.metrics import Accuracy, Loss +from ignite.metrics import Accuracy, Loss, RunningAverage from ignite.utils import manual_seed try: @@ -162,10 +162,11 @@ def run( if deterministic: tqdm.write("Setup deterministic trainer") trainer = create_supervised_trainer(model, optimizer, criterion, device=device, deterministic=deterministic) + running_loss = RunningAverage(output_transform=lambda x: x) + running_loss.attach(trainer, "rloss") - evaluator = create_supervised_evaluator( - model, metrics={"accuracy": Accuracy(), "nll": Loss(criterion)}, device=device - ) + metrics = {"accuracy": Accuracy(), "nll": Loss(criterion)} + evaluator = create_supervised_evaluator(model, metrics, device) # Apply learning rate scheduling @trainer.on(Events.EPOCH_COMPLETED) @@ -177,9 +178,10 @@ def lr_step(engine): @trainer.on(Events.ITERATION_COMPLETED(every=log_interval)) def log_training_loss(engine): lr = optimizer.param_groups[0]["lr"] - pbar.desc = f"Epoch {engine.state.epoch} - loss: {engine.state.output:.4f} - lr: {lr:.4f}" + rloss = engine.state.metrics["rloss"] + pbar.desc = f"Epoch {engine.state.epoch} - loss: {rloss:.4f} - lr: {lr:.4f}" pbar.update(log_interval) - writer.add_scalar("training/loss", engine.state.output, engine.state.iteration) + writer.add_scalar("training/running_loss", rloss, engine.state.iteration) writer.add_scalar("lr", lr, engine.state.iteration) if crash_iteration > 0: @@ -222,7 +224,14 @@ def log_validation_results(engine): writer.add_scalar("valdation/avg_accuracy", avg_accuracy, engine.state.epoch) # Setup object to checkpoint - objects_to_checkpoint = 
{"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler} + objects_to_checkpoint = { + "trainer": trainer, + "model": model, + "optimizer": optimizer, + "lr_scheduler": lr_scheduler, + "train_running_loss": running_loss, + "metrics": metrics, + } training_checkpoint = Checkpoint( to_save=objects_to_checkpoint, save_handler=DiskSaver(log_dir, require_empty=False), diff --git a/examples/contrib/mnist/mnist_with_clearml_logger.py b/examples/mnist/mnist_with_clearml_logger.py similarity index 100% rename from examples/contrib/mnist/mnist_with_clearml_logger.py rename to examples/mnist/mnist_with_clearml_logger.py diff --git a/examples/contrib/mnist/mnist_with_neptune_logger.py b/examples/mnist/mnist_with_neptune_logger.py similarity index 100% rename from examples/contrib/mnist/mnist_with_neptune_logger.py rename to examples/mnist/mnist_with_neptune_logger.py diff --git a/examples/contrib/mnist/mnist_with_tensorboard_logger.py b/examples/mnist/mnist_with_tensorboard_logger.py similarity index 100% rename from examples/contrib/mnist/mnist_with_tensorboard_logger.py rename to examples/mnist/mnist_with_tensorboard_logger.py diff --git a/examples/contrib/mnist/mnist_with_tqdm_logger.py b/examples/mnist/mnist_with_tqdm_logger.py similarity index 100% rename from examples/contrib/mnist/mnist_with_tqdm_logger.py rename to examples/mnist/mnist_with_tqdm_logger.py diff --git a/examples/contrib/mnist/mnist_with_visdom_logger.py b/examples/mnist/mnist_with_visdom_logger.py similarity index 100% rename from examples/contrib/mnist/mnist_with_visdom_logger.py rename to examples/mnist/mnist_with_visdom_logger.py diff --git a/examples/contrib/mnist/mnist_with_wandb_logger.py b/examples/mnist/mnist_with_wandb_logger.py similarity index 100% rename from examples/contrib/mnist/mnist_with_wandb_logger.py rename to examples/mnist/mnist_with_wandb_logger.py diff --git a/examples/notebooks/Cifar100_bench_amp.ipynb 
b/examples/notebooks/Cifar100_bench_amp.ipynb index d7c713b49896..dc9cfc750d93 100644 --- a/examples/notebooks/Cifar100_bench_amp.ipynb +++ b/examples/notebooks/Cifar100_bench_amp.ipynb @@ -1,29 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - }, - "colab": { - "name": "Cifar100_bench_amp.ipynb", - "provenance": [] - } - }, "cells": [ { "cell_type": "markdown", @@ -63,14 +38,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "SkRXPuNRfDHX" }, + "outputs": [], "source": [ "!pip install pytorch-ignite pynvml fire" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -83,32 +58,32 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "fGtxgbj8fDHb" }, + "outputs": [], "source": [ "# Install Apex:\n", "# If torch cuda version and nvcc version match:\n", "!pip install --upgrade --no-cache-dir --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" git+https://github.com/NVIDIA/apex/\n", "# if above command is failing, please install apex without c++/cuda extensions:\n", "# !pip install --upgrade --no-cache-dir git+https://github.com/NVIDIA/apex/" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "QnihHXQpfDHb" }, + "outputs": [], "source": [ "import torch\n", "import torchvision\n", "import ignite\n", "torch.__version__, torchvision.__version__, ignite.__version__" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -121,16 +96,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6xqqj0q1fDHh" 
}, + "outputs": [], "source": [ "!git clone https://github.com/pytorch/ignite.git /tmp/ignite\n", - "scriptspath=\"/tmp/ignite/examples/contrib/cifar100_amp_benchmark/\"\n", + "scriptspath=\"/tmp/ignite/examples/cifar100_amp_benchmark/\"\n", "setup=f\"cd {scriptspath} && export PYTHONPATH=$PWD:$PYTHONPATH\"" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -143,15 +118,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ulufk4tsfDHj" }, + "outputs": [], "source": [ "from torchvision.datasets.cifar import CIFAR100\n", "CIFAR100(root=\"/tmp/cifar100/\", train=True, download=True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -164,14 +139,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "mHwsVTB6fDHq" }, + "outputs": [], "source": [ "!{setup} && python benchmark_fp32.py /tmp/cifar100/ --batch_size=256 --max_epochs=20" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -184,14 +159,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "xkuW1EY-fDHs" }, + "outputs": [], "source": [ "!{setup} && python benchmark_torch_cuda_amp.py /tmp/cifar100/ --batch_size=256 --max_epochs=20" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -209,25 +184,50 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "A6Pe4cW6fDHu" }, + "outputs": [], "source": [ "!{setup} && python benchmark_nvidia_apex.py /tmp/cifar100/ --batch_size=256 --max_epochs=20 --opt=\"O1\"" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "1aqdlPSgfDHu" }, + "outputs": [], "source": [ "!{setup} && python benchmark_nvidia_apex.py /tmp/cifar100/ --batch_size=256 --max_epochs=20 --opt=\"O2\"" - ], - "execution_count": null, - "outputs": [] + ] } - ] + ], + "metadata": { + "colab": { + "name": 
"Cifar100_bench_amp.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/examples/references/classification/imagenet/dataflow.py b/examples/references/classification/imagenet/dataflow.py index 4d422d9e26a0..e497be3bcceb 100644 --- a/examples/references/classification/imagenet/dataflow.py +++ b/examples/references/classification/imagenet/dataflow.py @@ -19,7 +19,6 @@ def opencv_loader(path): def get_dataloader(dataset, sampler=None, shuffle=False, limit_num_samples=None, **kwargs): - if limit_num_samples is not None: g = torch.Generator().manual_seed(limit_num_samples) indices = torch.randperm(len(dataset), generator=g)[:limit_num_samples] @@ -38,7 +37,6 @@ def get_train_val_loaders( limit_train_num_samples: Optional[int] = None, limit_val_num_samples: Optional[int] = None, ) -> Tuple[DataLoader, DataLoader, DataLoader]: - train_ds = ImageFolder( Path(root_path) / "train", transform=lambda sample: train_transforms(image=sample)["image"], diff --git a/examples/references/classification/imagenet/main.py b/examples/references/classification/imagenet/main.py index 8a001260d972..85c20c08a62b 100644 --- a/examples/references/classification/imagenet/main.py +++ b/examples/references/classification/imagenet/main.py @@ -24,7 +24,6 @@ def training(local_rank, config, logger, with_clearml): - rank = idist.get_rank() manual_seed(config.seed + local_rank) @@ -305,7 +304,6 @@ def run_training(config_filepath, backend="nccl", with_clearml=True): assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found" with idist.Parallel(backend=backend) as parallel: - logger = 
setup_logger(name="ImageNet Training", distributed_rank=idist.get_rank()) config = ConfigObject(config_filepath) @@ -327,7 +325,6 @@ def run_training(config_filepath, backend="nccl", with_clearml=True): def get_model_weights(config, logger, with_clearml): - path = "" if with_clearml: from clearml import Model @@ -352,7 +349,6 @@ def get_model_weights(config, logger, with_clearml): def evaluation(local_rank, config, logger, with_clearml): - rank = idist.get_rank() device = idist.device() manual_seed(config.seed + local_rank) @@ -428,5 +424,4 @@ def run_evaluation(config_filepath, backend="nccl", with_clearml=True): if __name__ == "__main__": - fire.Fire({"training": run_training, "eval": run_evaluation}) diff --git a/examples/references/classification/imagenet/utils.py b/examples/references/classification/imagenet/utils.py index 799a6069afd4..45be888d0de5 100644 --- a/examples/references/classification/imagenet/utils.py +++ b/examples/references/classification/imagenet/utils.py @@ -6,7 +6,6 @@ def initialize(config): - device = idist.device() model = config.model.to(device) diff --git a/examples/references/segmentation/pascal_voc2012/dataflow.py b/examples/references/segmentation/pascal_voc2012/dataflow.py index b3b462f7c64a..befa25b1e4f2 100644 --- a/examples/references/segmentation/pascal_voc2012/dataflow.py +++ b/examples/references/segmentation/pascal_voc2012/dataflow.py @@ -27,7 +27,6 @@ def __getitem__(self, index): class VOCSegmentationOpencv(VOCSegmentation): - target_names = [ "background", "aeroplane", @@ -114,7 +113,6 @@ def get_train_noval_sbdataset(root_path, return_meta=False): def get_dataloader(dataset, sampler=None, shuffle=False, limit_num_samples=None, **kwargs): - if limit_num_samples is not None: g = torch.Generator().manual_seed(limit_num_samples) indices = torch.randperm(len(dataset), generator=g)[:limit_num_samples] @@ -135,7 +133,6 @@ def get_train_val_loaders( limit_train_num_samples=None, limit_val_num_samples=None, ): - train_ds = 
get_train_dataset(root_path) val_ds = get_val_dataset(root_path) diff --git a/examples/references/segmentation/pascal_voc2012/main.py b/examples/references/segmentation/pascal_voc2012/main.py index 257b14dea031..20afebbb7d36 100644 --- a/examples/references/segmentation/pascal_voc2012/main.py +++ b/examples/references/segmentation/pascal_voc2012/main.py @@ -49,7 +49,6 @@ def download_datasets(output_path): def training(local_rank, config, logger, with_clearml): - rank = idist.get_rank() manual_seed(config.seed + local_rank) @@ -342,7 +341,6 @@ def run_training(config_filepath, backend="nccl", with_clearml=True): assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found" with idist.Parallel(backend=backend) as parallel: - logger = setup_logger(name="Pascal-VOC12 Training", distributed_rank=idist.get_rank()) config = ConfigObject(config_filepath) @@ -364,7 +362,6 @@ def run_training(config_filepath, backend="nccl", with_clearml=True): def get_model_weights(config, logger, with_clearml): - path = "" if with_clearml: from clearml import Model @@ -389,7 +386,6 @@ def get_model_weights(config, logger, with_clearml): def evaluation(local_rank, config, logger, with_clearml): - rank = idist.get_rank() device = idist.device() manual_seed(config.seed + local_rank) @@ -472,5 +468,4 @@ def run_evaluation(config_filepath, backend="nccl", with_clearml=True): if __name__ == "__main__": - fire.Fire({"download": download_datasets, "training": run_training, "eval": run_evaluation}) diff --git a/examples/references/segmentation/pascal_voc2012/utils.py b/examples/references/segmentation/pascal_voc2012/utils.py index 799a6069afd4..45be888d0de5 100644 --- a/examples/references/segmentation/pascal_voc2012/utils.py +++ b/examples/references/segmentation/pascal_voc2012/utils.py @@ -6,7 +6,6 @@ def initialize(config): - device = idist.device() model = config.model.to(device) diff --git a/examples/reinforcement_learning/actor_critic.py 
b/examples/reinforcement_learning/actor_critic.py index 19c182cb2152..a62cfd72ee6f 100644 --- a/examples/reinforcement_learning/actor_critic.py +++ b/examples/reinforcement_learning/actor_critic.py @@ -122,7 +122,6 @@ def finish_episode(policy, optimizer, gamma): def main(env, args): - policy = Policy() optimizer = optim.Adam(policy.parameters(), lr=3e-2) timesteps = range(10000) @@ -185,7 +184,6 @@ def should_finish_training(): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Ignite actor-critic example") parser.add_argument("--gamma", type=float, default=0.99, metavar="G", help="discount factor (default: 0.99)") parser.add_argument("--seed", type=int, default=543, metavar="N", help="random seed (default: 1)") diff --git a/examples/reinforcement_learning/reinforce.py b/examples/reinforcement_learning/reinforce.py index 3daabfa16a45..d964b7c6315d 100644 --- a/examples/reinforcement_learning/reinforce.py +++ b/examples/reinforcement_learning/reinforce.py @@ -70,7 +70,6 @@ def finish_episode(policy, optimizer, gamma): def main(env, args): - policy = Policy() optimizer = optim.Adam(policy.parameters(), lr=1e-2) timesteps = range(10000) @@ -123,7 +122,6 @@ def should_finish_training(): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="PyTorch REINFORCE example") parser.add_argument("--gamma", type=float, default=0.99, metavar="G", help="discount factor (default: 0.99)") parser.add_argument("--seed", type=int, default=543, metavar="N", help="random seed (default: 543)") diff --git a/examples/siamese_network/siamese_network.py b/examples/siamese_network/siamese_network.py index d0a1bfb7e3de..bf4be27629a5 100644 --- a/examples/siamese_network/siamese_network.py +++ b/examples/siamese_network/siamese_network.py @@ -65,7 +65,6 @@ def forward_once(self, x): return output def forward(self, input1, input2, input3): - # pass the input through resnet output1 = self.forward_once(input1) output2 = self.forward_once(input2) @@ 
-180,7 +179,6 @@ def calculate_loss(input1, input2): def run(args, model, device, optimizer, train_loader, test_loader, lr_scheduler): - # using Triplet Margin Loss criterion = nn.TripletMarginLoss(p=2, margin=2.8) diff --git a/examples/super_resolution/main.py b/examples/super_resolution/main.py index 816d1caea7f2..08199a22e741 100644 --- a/examples/super_resolution/main.py +++ b/examples/super_resolution/main.py @@ -73,8 +73,12 @@ def __len__(self): return len(self.dataset) -trainset = torchvision.datasets.Caltech101(root="./data", download=True) -testset = torchvision.datasets.Caltech101(root="./data", download=False) +try: + trainset = torchvision.datasets.Caltech101(root="./data", download=True) + testset = torchvision.datasets.Caltech101(root="./data", download=False) +except RuntimeError: + print("Dataset download problem, exiting without error code") + exit(0) trainset_sr = SRDataset(trainset, scale_factor=opt.upscale_factor, crop_size=opt.crop_size) testset_sr = SRDataset(testset, scale_factor=opt.upscale_factor, crop_size=opt.crop_size) diff --git a/examples/super_resolution/model.py b/examples/super_resolution/model.py index 1f80c95d0643..4d2e3cab33ba 100644 --- a/examples/super_resolution/model.py +++ b/examples/super_resolution/model.py @@ -10,7 +10,7 @@ def __init__(self, upscale_factor): self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2)) self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1)) self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1)) - self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) + self.conv4 = nn.Conv2d(32, upscale_factor**2, (3, 3), (1, 1), (1, 1)) self.pixel_shuffle = nn.PixelShuffle(upscale_factor) self._initialize_weights() diff --git a/examples/contrib/transformers/README.md b/examples/transformers/README.md similarity index 100% rename from examples/contrib/transformers/README.md rename to examples/transformers/README.md diff --git a/examples/contrib/transformers/dataset.py 
b/examples/transformers/dataset.py similarity index 100% rename from examples/contrib/transformers/dataset.py rename to examples/transformers/dataset.py diff --git a/examples/contrib/transformers/main.py b/examples/transformers/main.py similarity index 98% rename from examples/contrib/transformers/main.py rename to examples/transformers/main.py index 8c60c1fd7a9c..c879812b98df 100644 --- a/examples/contrib/transformers/main.py +++ b/examples/transformers/main.py @@ -22,7 +22,6 @@ def training(local_rank, config): - rank = idist.get_rank() manual_seed(config["seed"] + rank) device = idist.device() @@ -33,7 +32,6 @@ def training(local_rank, config): output_path = config["output_dir"] if rank == 0: - now = datetime.now().strftime("%Y%m%d-%H%M%S") folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}" output_path = Path(output_path) / folder_name @@ -46,11 +44,7 @@ def training(local_rank, config): config["cuda device name"] = torch.cuda.get_device_name(local_rank) if config["with_clearml"]: - try: - from clearml import Task - except ImportError: - # Backwards-compatibility for legacy Trains SDK - from trains import Task + from clearml import Task task = Task.init("IMDB-Training", task_name=output_path.stem) task.connect_configuration(config) @@ -207,7 +201,6 @@ def run( spawn_kwargs["nproc_per_node"] = nproc_per_node with idist.Parallel(backend=backend, **spawn_kwargs) as parallel: - parallel.run(training, config) @@ -293,7 +286,6 @@ def log_basic_info(logger, config): def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, config, logger): - device = idist.device() # Setup Ignite trainer: @@ -309,7 +301,6 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con scaler = GradScaler(enabled=with_amp) def train_step(engine, batch): - input_batch = batch[0] labels = batch[1].view(-1, 1) diff --git a/examples/contrib/transformers/model.py b/examples/transformers/model.py similarity index 
100% rename from examples/contrib/transformers/model.py rename to examples/transformers/model.py diff --git a/examples/contrib/transformers/requirements.txt b/examples/transformers/requirements.txt similarity index 100% rename from examples/contrib/transformers/requirements.txt rename to examples/transformers/requirements.txt diff --git a/examples/contrib/transformers/utils.py b/examples/transformers/utils.py similarity index 100% rename from examples/contrib/transformers/utils.py rename to examples/transformers/utils.py diff --git a/ignite/__init__.py b/ignite/__init__.py index 0185adce1238..0e6f65ca8b5e 100644 --- a/ignite/__init__.py +++ b/ignite/__init__.py @@ -6,4 +6,4 @@ import ignite.metrics import ignite.utils -__version__ = "0.5.0" +__version__ = "0.4.13" diff --git a/ignite/base/mixins.py b/ignite/base/mixins.py index 563ce66066f4..3ecb2922f039 100644 --- a/ignite/base/mixins.py +++ b/ignite/base/mixins.py @@ -4,7 +4,6 @@ class Serializable: - _state_dict_all_req_keys: Tuple = () _state_dict_one_of_opt_keys: Tuple = () diff --git a/ignite/contrib/engines/common.py b/ignite/contrib/engines/common.py index 95e4e09cb3b1..a688d2fb470a 100644 --- a/ignite/contrib/engines/common.py +++ b/ignite/contrib/engines/common.py @@ -33,6 +33,7 @@ from ignite.handlers.checkpoint import BaseSaveHandler from ignite.handlers.param_scheduler import ParamScheduler from ignite.metrics import RunningAverage +from ignite.metrics.metric import RunningBatchWise from ignite.utils import deprecated @@ -48,6 +49,7 @@ def setup_common_training_handlers( with_pbars: bool = True, with_pbar_on_iters: bool = True, log_every_iters: int = 100, + device: Optional[Union[str, torch.device]] = None, stop_on_nan: bool = True, clear_cuda_cache: bool = True, save_handler: Optional[Union[Callable, BaseSaveHandler]] = None, @@ -91,7 +93,10 @@ def setup_common_training_handlers( class to use to store ``to_save``. See :class:`~ignite.handlers.checkpoint.Checkpoint` for more details. 
Argument is mutually exclusive with ``output_path``. kwargs: optional keyword args to be passed to construct :class:`~ignite.handlers.checkpoint.Checkpoint`. + device: deprecated argument, it will be removed in 0.4.14. """ + if device is not None: + warnings.warn("Argument device is unused and deprecated. It will be removed in 0.4.14") if idist.get_world_size() > 1: _setup_common_distrib_training_handlers( @@ -176,7 +181,6 @@ def _setup_common_training_handlers( trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache) if to_save is not None: - if output_path is None and save_handler is None: raise ValueError( "If to_save argument is provided then output_path or save_handler arguments should be also defined" @@ -210,8 +214,8 @@ def output_transform(x: Any, index: int, name: str) -> Any: ) for i, n in enumerate(output_names): - RunningAverage(output_transform=partial(output_transform, index=i, name=n), epoch_bound=False).attach( - trainer, n + RunningAverage(output_transform=partial(output_transform, index=i, name=n)).attach( + trainer, n, usage=RunningBatchWise() ) if with_pbars: @@ -242,7 +246,6 @@ def _setup_common_distrib_training_handlers( save_handler: Optional[Union[Callable, BaseSaveHandler]] = None, **kwargs: Any, ) -> None: - _setup_common_training_handlers( trainer, to_save=to_save, @@ -266,7 +269,7 @@ def _setup_common_distrib_training_handlers( @trainer.on(Events.EPOCH_STARTED) def distrib_set_epoch(engine: Engine) -> None: - cast(DistributedSampler, train_sampler).set_epoch(engine.state.epoch - 1) + train_sampler.set_epoch(engine.state.epoch - 1) def empty_cuda_cache(_: Engine) -> None: diff --git a/ignite/contrib/handlers/__init__.py b/ignite/contrib/handlers/__init__.py index 2db80fd2fd9a..0a6fe3edd5cd 100644 --- a/ignite/contrib/handlers/__init__.py +++ b/ignite/contrib/handlers/__init__.py @@ -1,4 +1,5 @@ from ignite.contrib.handlers.clearml_logger import ClearMLLogger +from ignite.contrib.handlers.custom_events import CustomPeriodicEvent 
from ignite.contrib.handlers.mlflow_logger import MLflowLogger from ignite.contrib.handlers.neptune_logger import NeptuneLogger from ignite.contrib.handlers.polyaxon_logger import PolyaxonLogger diff --git a/ignite/contrib/handlers/base_logger.py b/ignite/contrib/handlers/base_logger.py index 28cd8f64ef4c..1c4133e25508 100644 --- a/ignite/contrib/handlers/base_logger.py +++ b/ignite/contrib/handlers/base_logger.py @@ -32,7 +32,6 @@ def __init__( tag: Optional[str] = None, whitelist: Optional[Union[List[str], Callable[[str, nn.Parameter], bool]]] = None, ): - if not isinstance(model, torch.nn.Module): raise TypeError(f"Argument model should be of type torch.nn.Module, but given {type(model)}") @@ -41,15 +40,12 @@ def __init__( weights = {} if whitelist is None: - weights = dict(model.named_parameters()) elif callable(whitelist): - for n, p in model.named_parameters(): if whitelist(n, p): weights[n] = p else: - for n, p in model.named_parameters(): for item in whitelist: if n.startswith(item): @@ -91,7 +87,6 @@ def __init__( global_step_transform: Optional[Callable[[Engine, Union[str, Events]], int]] = None, state_attributes: Optional[List[str]] = None, ): - if metric_names is not None: if not (isinstance(metric_names, list) or (isinstance(metric_names, str) and metric_names == "all")): raise TypeError( @@ -185,7 +180,6 @@ def __init__( tag: Optional[str] = None, whitelist: Optional[Union[List[str], Callable[[str, nn.Parameter], bool]]] = None, ): - super(BaseWeightsScalarHandler, self).__init__(model, tag=tag, whitelist=whitelist) if not callable(reduction): @@ -239,7 +233,6 @@ def attach( return RemovableEventHandle(event_name, log_handler, engine) else: - if event_name not in State.event_to_attr: raise RuntimeError(f"Unknown event name '{event_name}'") diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index 1bfbb1abc00a..99d3db640bd0 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ 
b/ignite/contrib/handlers/clearml_logger.py @@ -325,7 +325,6 @@ def __init__( ) def __call__(self, engine: Engine, logger: ClearMLLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, ClearMLLogger): raise RuntimeError("Handler OutputHandler works only with ClearMLLogger") @@ -481,14 +480,12 @@ def has_bias_in_name(n, p): """ def __call__(self, engine: Engine, logger: ClearMLLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, ClearMLLogger): raise RuntimeError("Handler WeightsScalarHandler works only with ClearMLLogger") global_step = engine.state.get_event_attrib_value(event_name) tag_prefix = f"{self.tag}/" if self.tag else "" for name, p in self.weights: - title_name, _, series_name = name.partition(".") logger.clearml_logger.report_scalar( title=f"{tag_prefix}weights_{self.reduction.__name__}/{title_name}", @@ -579,7 +576,6 @@ def __call__(self, engine: Engine, logger: ClearMLLogger, event_name: Union[str, global_step = engine.state.get_event_attrib_value(event_name) tag_prefix = f"{self.tag}/" if self.tag else "" for name, p in self.weights: - title_name, _, series_name = name.partition(".") logger.grad_helper.add_histogram( @@ -826,7 +822,6 @@ def __init__( *args: Any, **kwargs: Any, ): - self._setup_check_clearml(logger, output_uri) if not dirname: diff --git a/ignite/contrib/handlers/custom_events.py b/ignite/contrib/handlers/custom_events.py new file mode 100644 index 000000000000..7eaa65863d52 --- /dev/null +++ b/ignite/contrib/handlers/custom_events.py @@ -0,0 +1,124 @@ +import warnings + +from ignite.engine import EventEnum, Events, State + + +class CustomPeriodicEvent: + """DEPRECATED. Use filtered events instead. + Handler to define a custom periodic events as a number of elapsed iterations/epochs + for an engine. 
+ + When custom periodic event is created and attached to an engine, the following events are fired: + 1) K iterations is specified: + - `Events.ITERATIONS__STARTED` + - `Events.ITERATIONS__COMPLETED` + + 1) K epochs is specified: + - `Events.EPOCHS__STARTED` + - `Events.EPOCHS__COMPLETED` + + + Examples: + + .. code-block:: python + + from ignite.engine import Engine, Events + from ignite.contrib.handlers import CustomPeriodicEvent + + # Let's define an event every 1000 iterations + cpe1 = CustomPeriodicEvent(n_iterations=1000) + cpe1.attach(trainer) + + # Let's define an event every 10 epochs + cpe2 = CustomPeriodicEvent(n_epochs=10) + cpe2.attach(trainer) + + @trainer.on(cpe1.Events.ITERATIONS_1000_COMPLETED) + def on_every_1000_iterations(engine): + # run a computation after 1000 iterations + # ... + print(engine.state.iterations_1000) + + @trainer.on(cpe2.Events.EPOCHS_10_STARTED) + def on_every_10_epochs(engine): + # run a computation every 10 epochs + # ... + print(engine.state.epochs_10) + + + Args: + n_iterations (int, optional): number iterations of the custom periodic event + n_epochs (int, optional): number iterations of the custom periodic event. Argument is optional, but only one, + either n_iterations or n_epochs should defined. + + """ + + def __init__(self, n_iterations=None, n_epochs=None): + warnings.warn( + "CustomPeriodicEvent is deprecated since 0.4.0 and will be removed in 0.4.14. 
Use filtered events instead.", + DeprecationWarning, + ) + + if n_iterations is not None: + if not isinstance(n_iterations, int): + raise TypeError("Argument n_iterations should be an integer") + if n_iterations < 1: + raise ValueError("Argument n_iterations should be positive") + + if n_epochs is not None: + if not isinstance(n_epochs, int): + raise TypeError("Argument n_epochs should be an integer") + if n_epochs < 1: + raise ValueError("Argument n_epochs should be positive") + + if (n_iterations is None and n_epochs is None) or (n_iterations and n_epochs): + raise ValueError("Either n_iterations or n_epochs should be defined") + + if n_iterations: + prefix = "iterations" + self.state_attr = "iteration" + self.period = n_iterations + + if n_epochs: + prefix = "epochs" + self.state_attr = "epoch" + self.period = n_epochs + + self.custom_state_attr = "{}_{}".format(prefix, self.period) + event_name = "{}_{}".format(prefix.upper(), self.period) + setattr( + self, + "Events", + EventEnum("Events", " ".join(["{}_STARTED".format(event_name), "{}_COMPLETED".format(event_name)])), + ) + + # Update State.event_to_attr + for e in self.Events: + State.event_to_attr[e] = self.custom_state_attr + + # Create aliases + self._periodic_event_started = getattr(self.Events, "{}_STARTED".format(event_name)) + self._periodic_event_completed = getattr(self.Events, "{}_COMPLETED".format(event_name)) + + def _on_started(self, engine): + setattr(engine.state, self.custom_state_attr, 0) + + def _on_periodic_event_started(self, engine): + if getattr(engine.state, self.state_attr) % self.period == 1: + setattr(engine.state, self.custom_state_attr, getattr(engine.state, self.custom_state_attr) + 1) + engine.fire_event(self._periodic_event_started) + + def _on_periodic_event_completed(self, engine): + if getattr(engine.state, self.state_attr) % self.period == 0: + engine.fire_event(self._periodic_event_completed) + + def attach(self, engine): + engine.register_events(*self.Events) + + 
engine.add_event_handler(Events.STARTED, self._on_started) + engine.add_event_handler( + getattr(Events, "{}_STARTED".format(self.state_attr.upper())), self._on_periodic_event_started + ) + engine.add_event_handler( + getattr(Events, "{}_COMPLETED".format(self.state_attr.upper())), self._on_periodic_event_completed + ) diff --git a/ignite/contrib/handlers/mlflow_logger.py b/ignite/contrib/handlers/mlflow_logger.py index 72cab7133e49..edd71ee7006a 100644 --- a/ignite/contrib/handlers/mlflow_logger.py +++ b/ignite/contrib/handlers/mlflow_logger.py @@ -101,7 +101,6 @@ def __init__(self, tracking_uri: Optional[str] = None): self.active_run = mlflow.start_run() def __getattr__(self, attr: Any) -> Any: - import mlflow return getattr(mlflow, attr) @@ -230,7 +229,6 @@ def __init__( ) def __call__(self, engine: Engine, logger: MLflowLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, MLflowLogger): raise TypeError("Handler 'OutputHandler' works only with MLflowLogger") diff --git a/ignite/contrib/handlers/neptune_logger.py b/ignite/contrib/handlers/neptune_logger.py index 94e0c00a238e..41e4909dfff7 100644 --- a/ignite/contrib/handlers/neptune_logger.py +++ b/ignite/contrib/handlers/neptune_logger.py @@ -327,7 +327,6 @@ def __init__( ) def __call__(self, engine: Engine, logger: NeptuneLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, NeptuneLogger): raise TypeError("Handler OutputHandler works only with NeptuneLogger") @@ -491,7 +490,6 @@ def has_bias_in_name(n, p): """ def __call__(self, engine: Engine, logger: NeptuneLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, NeptuneLogger): raise TypeError("Handler WeightsScalarHandler works only with NeptuneLogger") diff --git a/ignite/contrib/handlers/polyaxon_logger.py b/ignite/contrib/handlers/polyaxon_logger.py index 8ec45983aa34..2a358cb7da94 100644 --- a/ignite/contrib/handlers/polyaxon_logger.py +++ b/ignite/contrib/handlers/polyaxon_logger.py @@ 
-238,7 +238,6 @@ def __init__( ) def __call__(self, engine: Engine, logger: PolyaxonLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, PolyaxonLogger): raise RuntimeError("Handler 'OutputHandler' works only with PolyaxonLogger") diff --git a/ignite/contrib/handlers/tensorboard_logger.py b/ignite/contrib/handlers/tensorboard_logger.py index 1cf8a393b822..531c25cf88c3 100644 --- a/ignite/contrib/handlers/tensorboard_logger.py +++ b/ignite/contrib/handlers/tensorboard_logger.py @@ -287,7 +287,6 @@ def __init__( ) def __call__(self, engine: Engine, logger: TensorboardLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, TensorboardLogger): raise RuntimeError("Handler 'OutputHandler' works only with TensorboardLogger") @@ -422,14 +421,12 @@ def has_bias_in_name(n, p): """ def __call__(self, engine: Engine, logger: TensorboardLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, TensorboardLogger): raise RuntimeError("Handler 'WeightsScalarHandler' works only with TensorboardLogger") global_step = engine.state.get_event_attrib_value(event_name) tag_prefix = f"{self.tag}/" if self.tag else "" for name, p in self.weights: - name = name.replace(".", "/") logger.writer.add_scalar( f"{tag_prefix}weights_{self.reduction.__name__}/{name}", @@ -509,7 +506,6 @@ def __call__(self, engine: Engine, logger: TensorboardLogger, event_name: Union[ global_step = engine.state.get_event_attrib_value(event_name) tag_prefix = f"{self.tag}/" if self.tag else "" for name, p in self.weights: - name = name.replace(".", "/") logger.writer.add_histogram( tag=f"{tag_prefix}weights/{name}", values=p.data.cpu().numpy(), global_step=global_step diff --git a/ignite/contrib/handlers/tqdm_logger.py b/ignite/contrib/handlers/tqdm_logger.py index fd909745337e..37d79b7c4a0b 100644 --- a/ignite/contrib/handlers/tqdm_logger.py +++ b/ignite/contrib/handlers/tqdm_logger.py @@ -90,7 +90,7 @@ class ProgressBar(BaseLogger): Note: - When adding 
attaching the progress bar to an engine, it is recommend that you replace + When attaching the progress bar to an engine, it is recommended that you replace every print operation in the engine's handlers triggered every iteration with ``pbar.log_message`` to guarantee the correct format of the stdout. @@ -122,7 +122,6 @@ def __init__( ] = "{desc}[{n_fmt}/{total_fmt}] {percentage:3.0f}%|{bar}{postfix} [{elapsed}<{remaining}]", **tqdm_kwargs: Any, ): - try: from tqdm.autonotebook import tqdm except ImportError: @@ -281,7 +280,6 @@ def get_max_number_events(event_name: Union[str, Events, CallableEventWithFilter return 1 def __call__(self, engine: Engine, logger: ProgressBar, event_name: Union[str, Events]) -> None: - pbar_total = self.get_max_number_events(event_name, engine) if logger.pbar is None: logger._reset(pbar_total=pbar_total) diff --git a/ignite/contrib/handlers/visdom_logger.py b/ignite/contrib/handlers/visdom_logger.py index ee2408cb54fe..31a09d8f782c 100644 --- a/ignite/contrib/handlers/visdom_logger.py +++ b/ignite/contrib/handlers/visdom_logger.py @@ -43,7 +43,7 @@ class VisdomLogger(BaseLogger): visdom server. Default, `num_workers=1`. If `num_workers=0` and logger uses the main thread. If using Python 2.7 and `num_workers>0` the package `futures` should be installed: `pip install futures` kwargs: kwargs to pass into - `visdom.Visdom `_. + `visdom.Visdom `_. 
Note: We can also specify username/password using environment variables: VISDOM_USERNAME, VISDOM_PASSWORD @@ -364,7 +364,6 @@ def __init__( _BaseVisDrawer.__init__(self, show_legend=show_legend) def __call__(self, engine: Engine, logger: VisdomLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, VisdomLogger): raise RuntimeError("Handler 'OutputHandler' works only with VisdomLogger") @@ -473,7 +472,6 @@ def __init__( _BaseVisDrawer.__init__(self, show_legend=show_legend) def __call__(self, engine: Engine, logger: VisdomLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, VisdomLogger): raise RuntimeError("Handler 'WeightsScalarHandler' works only with VisdomLogger") diff --git a/ignite/contrib/handlers/wandb_logger.py b/ignite/contrib/handlers/wandb_logger.py index 3670c65e8556..49417d1180f9 100644 --- a/ignite/contrib/handlers/wandb_logger.py +++ b/ignite/contrib/handlers/wandb_logger.py @@ -278,7 +278,6 @@ def __init__( self.sync = sync def __call__(self, engine: Engine, logger: WandBLogger, event_name: Union[str, Events]) -> None: - if not isinstance(logger, WandBLogger): raise RuntimeError(f"Handler '{self.__class__.__name__}' works only with WandBLogger.") diff --git a/ignite/contrib/metrics/average_precision.py b/ignite/contrib/metrics/average_precision.py index d206034c6ba6..5aae0848ddf3 100644 --- a/ignite/contrib/metrics/average_precision.py +++ b/ignite/contrib/metrics/average_precision.py @@ -68,7 +68,6 @@ def __init__( check_compute_fn: bool = False, device: Union[str, torch.device] = torch.device("cpu"), ): - try: from sklearn.metrics import average_precision_score # noqa: F401 except ImportError: diff --git a/ignite/contrib/metrics/cohen_kappa.py b/ignite/contrib/metrics/cohen_kappa.py index 0cbb492b360b..942a394fb7e4 100644 --- a/ignite/contrib/metrics/cohen_kappa.py +++ b/ignite/contrib/metrics/cohen_kappa.py @@ -55,7 +55,6 @@ def __init__( check_compute_fn: bool = False, device: Union[str, 
torch.device] = torch.device("cpu"), ): - try: from sklearn.metrics import cohen_kappa_score # noqa: F401 except ImportError: diff --git a/ignite/contrib/metrics/regression/canberra_metric.py b/ignite/contrib/metrics/regression/canberra_metric.py index 0ec011f87696..177e278e5646 100644 --- a/ignite/contrib/metrics/regression/canberra_metric.py +++ b/ignite/contrib/metrics/regression/canberra_metric.py @@ -63,6 +63,7 @@ class CanberraMetric(_BaseRegression): - Fixed implementation: ``abs`` in denominator. - Works with DDP. """ + _state_dict_all_req_keys = ("_sum_of_errors",) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/fractional_absolute_error.py b/ignite/contrib/metrics/regression/fractional_absolute_error.py index 80d88bf11d34..17934a133395 100644 --- a/ignite/contrib/metrics/regression/fractional_absolute_error.py +++ b/ignite/contrib/metrics/regression/fractional_absolute_error.py @@ -58,6 +58,7 @@ class FractionalAbsoluteError(_BaseRegression): .. versionchanged:: 0.4.5 - Works with DDP. """ + _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples") @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/fractional_bias.py b/ignite/contrib/metrics/regression/fractional_bias.py index d6516a8e1d43..9b6354f71628 100644 --- a/ignite/contrib/metrics/regression/fractional_bias.py +++ b/ignite/contrib/metrics/regression/fractional_bias.py @@ -58,6 +58,7 @@ class FractionalBias(_BaseRegression): .. versionchanged:: 0.4.5 - Works with DDP. 
""" + _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples") @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/geometric_mean_absolute_error.py b/ignite/contrib/metrics/regression/geometric_mean_absolute_error.py index 50f9a4d70575..ee717b32df32 100644 --- a/ignite/contrib/metrics/regression/geometric_mean_absolute_error.py +++ b/ignite/contrib/metrics/regression/geometric_mean_absolute_error.py @@ -58,6 +58,7 @@ class GeometricMeanAbsoluteError(_BaseRegression): .. versionchanged:: 0.4.5 - Works with DDP. """ + _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples") @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/geometric_mean_relative_absolute_error.py b/ignite/contrib/metrics/regression/geometric_mean_relative_absolute_error.py index 80d8b21d7424..79c377f29161 100644 --- a/ignite/contrib/metrics/regression/geometric_mean_relative_absolute_error.py +++ b/ignite/contrib/metrics/regression/geometric_mean_relative_absolute_error.py @@ -69,6 +69,7 @@ class GeometricMeanRelativeAbsoluteError(_BaseRegression): 0.0... """ + _state_dict_all_req_keys = ("_predictions", "_targets") @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/manhattan_distance.py b/ignite/contrib/metrics/regression/manhattan_distance.py index 42c25944aebf..dae7a3acae11 100644 --- a/ignite/contrib/metrics/regression/manhattan_distance.py +++ b/ignite/contrib/metrics/regression/manhattan_distance.py @@ -59,6 +59,7 @@ class ManhattanDistance(_BaseRegression): - Fixed sklearn compatibility. - Workes with DDP. 
""" + _state_dict_all_req_keys = ("_sum_of_errors",) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/maximum_absolute_error.py b/ignite/contrib/metrics/regression/maximum_absolute_error.py index 686599689d2a..f9c9a33550e5 100644 --- a/ignite/contrib/metrics/regression/maximum_absolute_error.py +++ b/ignite/contrib/metrics/regression/maximum_absolute_error.py @@ -58,6 +58,7 @@ class MaximumAbsoluteError(_BaseRegression): .. versionchanged:: 0.4.5 - Works with DDP. """ + _state_dict_all_req_keys = ("_max_of_absolute_errors",) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/mean_absolute_relative_error.py b/ignite/contrib/metrics/regression/mean_absolute_relative_error.py index 0d546da146e0..de0cc98c8c69 100644 --- a/ignite/contrib/metrics/regression/mean_absolute_relative_error.py +++ b/ignite/contrib/metrics/regression/mean_absolute_relative_error.py @@ -58,6 +58,7 @@ class MeanAbsoluteRelativeError(_BaseRegression): .. versionchanged:: 0.4.5 - Works with DDP. """ + _state_dict_all_req_keys = ("_sum_of_absolute_relative_errors", "_num_samples") @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/mean_error.py b/ignite/contrib/metrics/regression/mean_error.py index 50f0b26a56c0..96ed1a058856 100644 --- a/ignite/contrib/metrics/regression/mean_error.py +++ b/ignite/contrib/metrics/regression/mean_error.py @@ -55,6 +55,7 @@ class MeanError(_BaseRegression): 0.625... """ + _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples") @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/mean_normalized_bias.py b/ignite/contrib/metrics/regression/mean_normalized_bias.py index 14cebd682968..93f7ef4b9ec8 100644 --- a/ignite/contrib/metrics/regression/mean_normalized_bias.py +++ b/ignite/contrib/metrics/regression/mean_normalized_bias.py @@ -58,6 +58,7 @@ class MeanNormalizedBias(_BaseRegression): .. 
versionchanged:: 0.4.5 - Works with DDP. """ + _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples") @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/median_absolute_error.py b/ignite/contrib/metrics/regression/median_absolute_error.py index 46988bcc9e0c..d7f376a323bb 100644 --- a/ignite/contrib/metrics/regression/median_absolute_error.py +++ b/ignite/contrib/metrics/regression/median_absolute_error.py @@ -67,7 +67,6 @@ class MedianAbsoluteError(EpochMetric): def __init__( self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu") ): - super(MedianAbsoluteError, self).__init__( median_absolute_error_compute_fn, output_transform=output_transform, device=device ) diff --git a/ignite/contrib/metrics/regression/r2_score.py b/ignite/contrib/metrics/regression/r2_score.py index 185afe54cb30..d68cc616f9a0 100644 --- a/ignite/contrib/metrics/regression/r2_score.py +++ b/ignite/contrib/metrics/regression/r2_score.py @@ -56,6 +56,7 @@ class R2Score(_BaseRegression): .. versionchanged:: 0.4.3 Works with DDP. """ + _state_dict_all_req_keys = ("_num_examples", "_sum_of_errors", "_y_sq_sum", "_y_sum") @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/regression/wave_hedges_distance.py b/ignite/contrib/metrics/regression/wave_hedges_distance.py index 175aaf80532a..1e4475a7f17c 100644 --- a/ignite/contrib/metrics/regression/wave_hedges_distance.py +++ b/ignite/contrib/metrics/regression/wave_hedges_distance.py @@ -57,6 +57,7 @@ class WaveHedgesDistance(_BaseRegression): .. versionchanged:: 0.4.5 - Works with DDP. 
""" + _state_dict_all_req_keys = ("_sum_of_errors",) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/contrib/metrics/roc_auc.py b/ignite/contrib/metrics/roc_auc.py index b7e86e18b1a8..381e27158614 100644 --- a/ignite/contrib/metrics/roc_auc.py +++ b/ignite/contrib/metrics/roc_auc.py @@ -79,7 +79,6 @@ def __init__( check_compute_fn: bool = False, device: Union[str, torch.device] = torch.device("cpu"), ): - try: from sklearn.metrics import roc_auc_score # noqa: F401 except ImportError: @@ -140,7 +139,7 @@ def sigmoid_output_transform(output): FPR [0.0, 0.333, 0.333, 1.0] TPR [0.0, 0.0, 1.0, 1.0] - Thresholds [2.0, 1.0, 0.711, 0.047] + Thresholds [inf, 1.0, 0.711, 0.047] .. versionchanged:: 0.4.11 added `device` argument @@ -152,7 +151,6 @@ def __init__( check_compute_fn: bool = False, device: Union[str, torch.device] = torch.device("cpu"), ) -> None: - try: from sklearn.metrics import roc_curve # noqa: F401 except ImportError: diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py index 9eeacaa2f6d0..70d1950c633f 100644 --- a/ignite/distributed/auto.py +++ b/ignite/distributed/auto.py @@ -1,5 +1,5 @@ import warnings -from typing import Any, Callable, Iterator, List, Optional, Union +from typing import Any, Iterator, List, Optional, Union import torch import torch.nn as nn @@ -57,7 +57,8 @@ def auto_dataloader(dataset: Dataset, **kwargs: Any) -> Union[DataLoader, "_MpDe ) .. _torch DataLoader: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader - .. _XLA MpDeviceLoader: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/parallel_loader.py#L178 + .. _XLA MpDeviceLoader: + https://pytorch.org/xla/release/2.0/index.html#running-on-multiple-xla-devices-with-multi-processing .. _torch DistributedSampler: https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler .. 
_torch IterableDataset: https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset @@ -118,7 +119,6 @@ def auto_dataloader(dataset: Dataset, **kwargs: Any) -> Union[DataLoader, "_MpDe dataloader = DataLoader(dataset, **kwargs) if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and world_size > 1: - logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA") mp_device_loader_cls = _MpDeviceLoader @@ -256,7 +256,7 @@ def auto_optim(optimizer: Optimizer, **kwargs: Any) -> Optimizer: optimizer = idist.auto_optim(optimizer) - .. _xm.optimizer_step: http://pytorch.org/xla/release/1.5/index.html#torch_xla.core.xla_model.optimizer_step + .. _xm.optimizer_step: https://pytorch.org/xla/release/1.5/index.html#torch_xla.core.xla_model.optimizer_step .. versionchanged:: 0.4.2 Added Horovod distributed optimizer. @@ -294,7 +294,6 @@ class DistributedProxySampler(DistributedSampler): """ def __init__(self, sampler: Sampler, num_replicas: Optional[int] = None, rank: Optional[int] = None) -> None: - if not isinstance(sampler, Sampler): raise TypeError(f"Argument sampler should be instance of torch Sampler, but given: {type(sampler)}") @@ -329,7 +328,6 @@ def __iter__(self) -> Iterator: if idist.has_xla_support: - import torch_xla.core.xla_model as xm from torch_xla.distributed.parallel_loader import ParallelLoader @@ -353,5 +351,5 @@ def __init__(self, optimizer: Optimizer) -> None: super(self.__class__, self).__init__(optimizer.param_groups) # type: ignore[call-arg] self.wrapped_optimizer = optimizer - def step(self, closure: Optional[Callable] = None) -> None: + def step(self, closure: Any = None) -> Any: xm.optimizer_step(self.wrapped_optimizer, barrier=True) diff --git a/ignite/distributed/comp_models/__init__.py b/ignite/distributed/comp_models/__init__.py index 8f35f0b0e228..ef0576e8cdf8 100644 --- a/ignite/distributed/comp_models/__init__.py +++ b/ignite/distributed/comp_models/__init__.py @@ -11,9 +11,9 @@ from 
ignite.distributed.comp_models.xla import _XlaDistModel -def setup_available_computation_models() -> Tuple[ - Type[Union[_SerialModel, "_NativeDistModel", "_XlaDistModel", "_HorovodDistModel"]], ... -]: +def setup_available_computation_models() -> ( + Tuple[Type[Union[_SerialModel, "_NativeDistModel", "_XlaDistModel", "_HorovodDistModel"]], ...] +): models: List[Type[Union[_SerialModel, "_NativeDistModel", "_XlaDistModel", "_HorovodDistModel"]]] = [ _SerialModel, ] diff --git a/ignite/distributed/comp_models/base.py b/ignite/distributed/comp_models/base.py index 7cd4061d5190..00d4383d1ac6 100644 --- a/ignite/distributed/comp_models/base.py +++ b/ignite/distributed/comp_models/base.py @@ -136,7 +136,6 @@ def _decode_as_placeholder(encoded_msg: List[int], device: torch.device) -> Unio def _setup_placeholder( self, x: Union[torch.Tensor, float, str, None], device: torch.device, is_src: bool ) -> Union[torch.Tensor, float, str]: - encoded_msg_per_rank = self._encode_input_data(x, is_src) encoded_msg_all_ranks = self._do_all_reduce(torch.tensor(encoded_msg_per_rank, device=device), op="MAX") @@ -182,13 +181,16 @@ def _apply_op( return tensor def _collective_op( - self, tensor: Union[torch.Tensor, float, str], fn: Callable, *args: Any, **kwargs: Any + self, tensor: Union[torch.Tensor, Number, str], fn: Callable, *args: Any, **kwargs: Any ) -> Union[torch.Tensor, float, List[float], List[str]]: tensor_to_number = tensor_to_str = False device = self.device() if isinstance(tensor, (Number, float)): tensor_to_number = True - tensor = torch.tensor(tensor, device=device, dtype=self._collective_op_dtype) + dtype = self._collective_op_dtype + if dtype is None and isinstance(tensor, float): + dtype = torch.double + tensor = torch.tensor(tensor, device=device, dtype=dtype) elif isinstance(tensor, str): tensor_to_str = True max_length = self._get_max_length(tensor, device) @@ -197,10 +199,7 @@ def _collective_op( tensor = self._apply_op(tensor, device, fn, *args, **kwargs) if 
tensor_to_number: - if tensor.numel() == 1: - return tensor.item() - else: - return tensor.tolist() + return tensor.tolist() elif tensor_to_str: return self._decode_str(tensor) return tensor @@ -214,10 +213,10 @@ def all_reduce( return cast(Union[torch.Tensor, float], self._collective_op(tensor, self._do_all_reduce, op, group=group)) def all_gather( - self, tensor: Union[torch.Tensor, float, str], group: Optional[Any] = None - ) -> Union[torch.Tensor, float, List[float], List[str]]: + self, tensor: Union[torch.Tensor, float, str, Any], group: Optional[Any] = None + ) -> Union[torch.Tensor, float, List[float], List[str], List[Any]]: if not isinstance(tensor, (torch.Tensor, Number, str)): - raise TypeError(f"Unhandled input type {type(tensor)}") + return self._do_all_gather_object(tensor, group=group) return self._collective_op(tensor, self._do_all_gather, group=group) @@ -280,6 +279,10 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[ def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor: pass + @abstractmethod + def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> List[Any]: + pass + @abstractmethod def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor: pass @@ -352,11 +355,11 @@ def all_reduce( return tensor def all_gather( - self, tensor: Union[torch.Tensor, float, str], group: Optional[Any] = None - ) -> Union[torch.Tensor, float, List[float], List[str]]: + self, tensor: Union[torch.Tensor, float, str, Any], group: Optional[Any] = None + ) -> Union[torch.Tensor, float, List[float], List[str], List[Any]]: if isinstance(tensor, torch.Tensor): return tensor - return cast(Union[List[float], List[str]], [tensor]) + return cast(Union[List[float], List[str], List[Any]], [tensor]) def broadcast( self, tensor: Union[torch.Tensor, float, str, None], src: int = 0, safe_mode: bool = False @@ -371,6 +374,9 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str 
= "SUM", group: Optional[ def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor: return tensor + def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> Any: + return tensor + def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any: return ranks diff --git a/ignite/distributed/comp_models/horovod.py b/ignite/distributed/comp_models/horovod.py index 3a6226a12bda..36f15f4428db 100644 --- a/ignite/distributed/comp_models/horovod.py +++ b/ignite/distributed/comp_models/horovod.py @@ -21,7 +21,6 @@ if has_hvd_support: - HOROVOD = "horovod" class _HorovodDistModel(ComputationModel): @@ -193,6 +192,12 @@ def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> t tensor = tensor.unsqueeze(0) return hvd.allgather(tensor) + def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> List[Any]: + if group is not None: + raise NotImplementedError("all_gather with group for horovod is not implemented") + + return hvd.allgather_object(tensor) + def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any: return hvd.ProcessSet(ranks) diff --git a/ignite/distributed/comp_models/native.py b/ignite/distributed/comp_models/native.py index e0f63cabbaa8..c71c7d423119 100644 --- a/ignite/distributed/comp_models/native.py +++ b/ignite/distributed/comp_models/native.py @@ -15,7 +15,6 @@ if has_native_dist_support: - NCCL = dist.Backend.NCCL GLOO = dist.Backend.GLOO MPI = dist.Backend.MPI @@ -196,7 +195,6 @@ def _compute_local_rank_via_hostname(self) -> int: return local_rank def _identify_local_rank(self) -> None: - if "SLURM_JOB_ID" in os.environ: os.environ["LOCAL_RANK"] = os.environ["SLURM_LOCALID"] @@ -216,7 +214,6 @@ def _identify_local_rank(self) -> None: self._local_rank = self._compute_local_rank_via_hostname() def setup_env_vars(self, rank: Optional[int] = None, world_size: Optional[int] = None) -> None: - self._env_backup = os.environ.copy() if "SLURM_JOB_ID" in 
os.environ: @@ -426,6 +423,7 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[ if group is not None and not isinstance(group, dist.ProcessGroup): raise ValueError("Argument group should be list of int or ProcessGroup") reduce_op = self._reduce_op_map[op] + # We do if/else here for compatibility with older pytorch versions if group is not None: dist.all_reduce(tensor, reduce_op, group=group) else: @@ -433,17 +431,50 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[ return tensor def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor: - if group is not None and not isinstance(group, dist.ProcessGroup): + if group == dist.GroupMember.NON_GROUP_MEMBER: + return tensor + + if group is None: + group_size = self.get_world_size() + elif isinstance(group, dist.ProcessGroup): + group_size = group.size() + else: raise ValueError("Argument group should be list of int or ProcessGroup") if tensor.ndimension() == 0: tensor = tensor.unsqueeze(0) - output = [torch.zeros_like(tensor) for _ in range(self.get_world_size())] + output = [torch.zeros_like(tensor) for _ in range(group_size)] + # We do if/else here for compatibility with older pytorch versions if group is not None: dist.all_gather(output, tensor, group=group) else: dist.all_gather(output, tensor) return torch.cat(output, dim=0) + def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> List[Any]: + if Version(torch.__version__) < Version("1.7.0"): + raise RuntimeError( + "Current torch version does not implement dist.all_gather_object. 
" + "Required version should be >=1.7.0" + ) + + if group == dist.GroupMember.NON_GROUP_MEMBER: + return tensor + + if group is None: + group_size = self.get_world_size() + elif isinstance(group, dist.ProcessGroup): + group_size = group.size() + else: + raise ValueError("Argument group should be list of int or ProcessGroup") + output = [None for _ in range(group_size)] + # We do if/else here for compatibility with older pytorch versions + if group is not None: + dist.all_gather_object(output, tensor, group=group) + else: + dist.all_gather_object(output, tensor) + + return output + def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any: return dist.new_group(ranks=ranks, **kwargs) @@ -475,7 +506,6 @@ def _expand_hostlist(nodelist: str) -> List[str]: nodelist = nodelist.replace(" ", "") for node in re.findall(nodelist_match, nodelist): - node_match = r"(.+)\[((,?[0-9]+-?,?-?){0,})\](.*)?" match = re.search(node_match, node) diff --git a/ignite/distributed/comp_models/xla.py b/ignite/distributed/comp_models/xla.py index c6f6a68a5a1c..eaaeceb02520 100644 --- a/ignite/distributed/comp_models/xla.py +++ b/ignite/distributed/comp_models/xla.py @@ -15,7 +15,6 @@ if has_xla_support: - XLA_TPU = "xla-tpu" class _XlaDistModel(ComputationModel): @@ -156,6 +155,9 @@ def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> t xm.all_reduce("sum", [output], groups=group) return output.reshape(-1, *output.shape[2:]) + def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> List[Any]: + raise NotImplementedError("all_gather on object is not implemented for xla") + def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any: return [ranks] diff --git a/ignite/distributed/utils.py b/ignite/distributed/utils.py index 0d885b467917..0249a5150d3a 100644 --- a/ignite/distributed/utils.py +++ b/ignite/distributed/utils.py @@ -1,4 +1,5 @@ import socket +from contextlib import contextmanager from functools import wraps from typing import 
Any, Callable, List, Mapping, Optional, Tuple, Union @@ -41,6 +42,7 @@ "registered_computation_models", "one_rank_only", "new_group", + "one_rank_first", ] _model = _SerialModel() @@ -303,7 +305,7 @@ def train_fn(local_rank, a, b, c, d=12): .. _dist.init_process_group: https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group .. _mp.start_processes: https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn - .. _xmp.spawn: http://pytorch.org/xla/release/1.6/index.html#torch_xla.distributed.xla_multiprocessing.spawn + .. _xmp.spawn: https://pytorch.org/xla/release/1.6/index.html#torch_xla.distributed.xla_multiprocessing.spawn .. _hvd_run: https://horovod.readthedocs.io/en/latest/api.html#module-horovod.run .. versionchanged:: 0.4.2 @@ -349,18 +351,20 @@ def all_reduce( def all_gather( - tensor: Union[torch.Tensor, float, str], group: Optional[Union[Any, List[int]]] = None -) -> Union[torch.Tensor, float, List[float], List[str]]: + tensor: Union[torch.Tensor, float, str, Any], group: Optional[Union[Any, List[int]]] = None +) -> Union[torch.Tensor, float, List[float], List[str], List[Any]]: """Helper method to perform all gather operation. Args: - tensor: tensor or number or str to collect across participating processes. + tensor: tensor or number or str to collect across participating processes. If tensor, it should have the + same shape across processes. group: list of integer or the process group for each backend. If None, the default process group will be used. Returns: - torch.Tensor of shape ``(world_size * tensor.shape[0], tensor.shape[1], ...)`` if input is a tensor or - torch.Tensor of shape ``(world_size, )`` if input is a number or - List of strings if input is a string + If input is a tensor, returns a torch.Tensor of shape ``(world_size * tensor.shape[0], tensor.shape[1], ...)``. 
+ If input is a number, a torch.Tensor of shape ``(world_size, )`` is returned, and a list of strings + is returned if input is a string. If current process does not belong to `group`, the input ``tensor`` is + returned. .. versionchanged:: 0.4.11 added ``group`` @@ -635,3 +639,44 @@ def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]: return wrapper return _one_rank_only + + +@contextmanager +def one_rank_first(rank: int = 0, local: bool = False) -> Any: + """Context manager that ensures a specific rank runs first before others in a distributed + environment. + + Args: + rank: rank of the process that should execute the code + block inside the context manager first. Default, 0. + local: flag to specify local rank or global rank. + If True ``rank`` argument will define a local rank to run first. + Default, False + + Examples: + .. code-block:: python + + def download_dataset(): + ... + + with idist.one_rank_first(): + ds = download_dataset() + + dp = ds[0] + + .. versionadded:: 0.4.13 + """ + + current_rank = get_local_rank() if local else get_rank() + size = get_nproc_per_node() if local else get_world_size() + + if rank >= size or rank < 0: + raise ValueError(f"rank should be between 0 and {size - 1}, but given {rank}") + + if current_rank != rank: + barrier() + + yield + + if current_rank == rank: + barrier() diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py index 299afadba9a2..60d6f7690b2e 100644 --- a/ignite/engine/__init__.py +++ b/ignite/engine/__init__.py @@ -44,20 +44,21 @@ def _prepare_batch( def supervised_training_step( model: torch.nn.Module, optimizer: torch.optim.Optimizer, - loss_fn: Union[Callable, torch.nn.Module], + loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module], device: Optional[Union[str, torch.device]] = None, non_blocking: bool = False, prepare_batch: Callable = _prepare_batch, model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any,
torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(), gradient_accumulation_steps: int = 1, + model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Callable: """Factory function for supervised training. Args: model: the model to train. optimizer: the optimizer to use. - loss_fn: the loss function to use. + loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor. device: device type specification (default: None). Applies to batches after starting the engine. Model *will not* be moved. Device can be CPU, GPU. @@ -71,6 +72,8 @@ def supervised_training_step( to be assigned to engine's state.output after each iteration. Default is returning `loss.item()`. gradient_accumulation_steps: Number of steps the gradients should be accumulated across. (default: 1 (means no gradient accumulation)) + model_fn: the model function that receives `model` and `x`, and returns `y_pred`. + Returns: Callable: update function. @@ -91,6 +94,8 @@ def supervised_training_step( Added Gradient Accumulation. .. versionchanged:: 0.4.11 Added `model_transform` to transform model's output + .. 
versionchanged:: 0.4.13 + Added `model_fn` to customize model's application on the sample """ if gradient_accumulation_steps <= 0: @@ -104,7 +109,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to optimizer.zero_grad() model.train() x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - output = model(x) + output = model_fn(model, x) y_pred = model_transform(output) loss = loss_fn(y_pred, y) if gradient_accumulation_steps > 1: @@ -120,7 +125,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to def supervised_training_step_amp( model: torch.nn.Module, optimizer: torch.optim.Optimizer, - loss_fn: Union[Callable, torch.nn.Module], + loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module], device: Optional[Union[str, torch.device]] = None, non_blocking: bool = False, prepare_batch: Callable = _prepare_batch, @@ -128,13 +133,14 @@ def supervised_training_step_amp( output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(), scaler: Optional["torch.cuda.amp.GradScaler"] = None, gradient_accumulation_steps: int = 1, + model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Callable: """Factory function for supervised training using ``torch.cuda.amp``. Args: model: the model to train. optimizer: the optimizer to use. - loss_fn: the loss function to use. + loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor. device: device type specification (default: None). Applies to batches after starting the engine. Model *will not* be moved. Device can be CPU, GPU. @@ -149,6 +155,7 @@ def supervised_training_step_amp( scaler: GradScaler instance for gradient scaling. (default: None) gradient_accumulation_steps: Number of steps the gradients should be accumulated across. 
(default: 1 (means no gradient accumulation)) + model_fn: the model function that receives `model` and `x`, and returns `y_pred`. Returns: Callable: update function @@ -171,6 +178,8 @@ def supervised_training_step_amp( Added Gradient Accumulation. .. versionchanged:: 0.4.11 Added `model_transform` to transform model's output + .. versionchanged:: 0.4.13 + Added `model_fn` to customize model's application on the sample """ try: @@ -190,7 +199,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to model.train() x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) with autocast(enabled=True): - output = model(x) + output = model_fn(model, x) y_pred = model_transform(output) loss = loss_fn(y_pred, y) if gradient_accumulation_steps > 1: @@ -212,20 +221,21 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to def supervised_training_step_apex( model: torch.nn.Module, optimizer: torch.optim.Optimizer, - loss_fn: Union[Callable, torch.nn.Module], + loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module], device: Optional[Union[str, torch.device]] = None, non_blocking: bool = False, prepare_batch: Callable = _prepare_batch, model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(), gradient_accumulation_steps: int = 1, + model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Callable: """Factory function for supervised training using apex. Args: model: the model to train. optimizer: the optimizer to use. - loss_fn: the loss function to use. + loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor. device: device type specification (default: None). Applies to batches after starting the engine. Model *will not* be moved. Device can be CPU, GPU. 
@@ -239,6 +249,7 @@ def supervised_training_step_apex( to be assigned to engine's state.output after each iteration. Default is returning `loss.item()`. gradient_accumulation_steps: Number of steps the gradients should be accumulated across. (default: 1 (means no gradient accumulation)) + model_fn: the model function that receives `model` and `x`, and returns `y_pred`. Returns: Callable: update function. @@ -260,6 +271,8 @@ def supervised_training_step_apex( Added Gradient Accumulation. .. versionchanged:: 0.4.11 Added `model_transform` to transform model's output + .. versionchanged:: 0.4.13 + Added `model_fn` to customize model's application on the sample """ try: @@ -278,7 +291,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to optimizer.zero_grad() model.train() x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - output = model(x) + output = model_fn(model, x) y_pred = model_transform(output) loss = loss_fn(y_pred, y) if gradient_accumulation_steps > 1: @@ -295,20 +308,21 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to def supervised_training_step_tpu( model: torch.nn.Module, optimizer: torch.optim.Optimizer, - loss_fn: Union[Callable, torch.nn.Module], + loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module], device: Optional[Union[str, torch.device]] = None, non_blocking: bool = False, prepare_batch: Callable = _prepare_batch, model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(), gradient_accumulation_steps: int = 1, + model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Callable: """Factory function for supervised training using ``torch_xla``. Args: model: the model to train. optimizer: the optimizer to use. - loss_fn: the loss function to use. 
+ loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor. device: device type specification (default: None). Applies to batches after starting the engine. Model *will not* be moved. Device can be CPU, TPU. @@ -322,6 +336,7 @@ def supervised_training_step_tpu( to be assigned to engine's state.output after each iteration. Default is returning `loss.item()`. gradient_accumulation_steps: Number of steps the gradients should be accumulated across. (default: 1 (means no gradient accumulation)) + model_fn: the model function that receives `model` and `x`, and returns `y_pred`. Returns: Callable: update function. @@ -343,6 +358,8 @@ def supervised_training_step_tpu( Added Gradient Accumulation argument for all supervised training methods. .. versionchanged:: 0.4.11 Added `model_transform` to transform model's output + .. versionchanged:: 0.4.13 + Added `model_fn` to customize model's application on the sample """ try: import torch_xla.core.xla_model as xm @@ -360,7 +377,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to optimizer.zero_grad() model.train() x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - output = model(x) + output = model_fn(model, x) y_pred = model_transform(output) loss = loss_fn(y_pred, y) if gradient_accumulation_steps > 1: @@ -404,7 +421,7 @@ def _check_arg( def create_supervised_trainer( model: torch.nn.Module, optimizer: torch.optim.Optimizer, - loss_fn: Union[Callable, torch.nn.Module], + loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module], device: Optional[Union[str, torch.device]] = None, non_blocking: bool = False, prepare_batch: Callable = _prepare_batch, @@ -414,13 +431,14 @@ def create_supervised_trainer( amp_mode: Optional[str] = None, scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False, gradient_accumulation_steps: int = 1, + model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Engine: """Factory 
function for creating a trainer for supervised models. Args: model: the model to train. optimizer: the optimizer to use. - loss_fn: the loss function to use. + loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor. device: device type specification (default: None). Applies to batches after starting the engine. Model *will not* be moved. Device can be CPU, GPU or TPU. @@ -444,6 +462,7 @@ def create_supervised_trainer( (default: False) gradient_accumulation_steps: Number of steps the gradients should be accumulated across. (default: 1 (means no gradient accumulation)) + model_fn: the model function that receives `model` and `x`, and returns `y_pred`. Returns: a trainer engine with supervised update function. @@ -525,6 +544,8 @@ def output_transform_fn(x, y, y_pred, loss): Added Gradient Accumulation argument for all supervised training methods. .. versionchanged:: 0.4.11 Added ``model_transform`` to transform model's output + .. versionchanged:: 0.4.13 + Added `model_fn` to customize model's application on the sample """ device_type = device.type if isinstance(device, torch.device) else device @@ -543,6 +564,7 @@ def output_transform_fn(x, y, y_pred, loss): output_transform, _scaler, gradient_accumulation_steps, + model_fn, ) elif mode == "apex": _update = supervised_training_step_apex( @@ -555,6 +577,7 @@ def output_transform_fn(x, y, y_pred, loss): model_transform, output_transform, gradient_accumulation_steps, + model_fn, ) elif mode == "tpu": _update = supervised_training_step_tpu( @@ -567,6 +590,7 @@ def output_transform_fn(x, y, y_pred, loss): model_transform, output_transform, gradient_accumulation_steps, + model_fn, ) else: _update = supervised_training_step( @@ -579,6 +603,7 @@ def output_transform_fn(x, y, y_pred, loss): model_transform, output_transform, gradient_accumulation_steps, + model_fn, ) trainer = Engine(_update) if not deterministic else DeterministicEngine(_update) @@ -595,6 +620,7 @@ def 
supervised_evaluation_step( prepare_batch: Callable = _prepare_batch, model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any], Any] = lambda x, y, y_pred: (y_pred, y), + model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Callable: """ Factory function for supervised evaluation. @@ -612,6 +638,7 @@ def supervised_evaluation_step( output_transform: function that receives 'x', 'y', 'y_pred' and returns value to be assigned to engine's state.output after each iteration. Default is returning `(y_pred, y,)` which fits output expected by metrics. If you change it you should use `output_transform` in metrics. + model_fn: the model function that receives `model` and `x`, and returns `y_pred`. Returns: Inference function. @@ -629,13 +656,15 @@ def supervised_evaluation_step( .. versionadded:: 0.4.5 .. versionchanged:: 0.4.12 Added ``model_transform`` to transform model's output + .. versionchanged:: 0.4.13 + Added `model_fn` to customize model's application on the sample """ def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]: model.eval() with torch.no_grad(): x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - output = model(x) + output = model_fn(model, x) y_pred = model_transform(output) return output_transform(x, y, y_pred) @@ -649,6 +678,7 @@ def supervised_evaluation_step_amp( prepare_batch: Callable = _prepare_batch, model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any], Any] = lambda x, y, y_pred: (y_pred, y), + model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Callable: """ Factory function for supervised evaluation using ``torch.cuda.amp``. @@ -666,6 +696,7 @@ def supervised_evaluation_step_amp( output_transform: function that receives 'x', 'y', 'y_pred' and returns value to be assigned to engine's state.output after each iteration. 
Default is returning `(y_pred, y,)` which fits output expected by metrics. If you change it you should use `output_transform` in metrics. + model_fn: the model function that receives `model` and `x`, and returns `y_pred`. Returns: Inference function. @@ -683,6 +714,8 @@ def supervised_evaluation_step_amp( .. versionadded:: 0.4.5 .. versionchanged:: 0.4.12 Added ``model_transform`` to transform model's output + .. versionchanged:: 0.4.13 + Added `model_fn` to customize model's application on the sample """ try: from torch.cuda.amp import autocast @@ -694,7 +727,7 @@ def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, T with torch.no_grad(): x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) with autocast(enabled=True): - output = model(x) + output = model_fn(model, x) y_pred = model_transform(output) return output_transform(x, y, y_pred) @@ -710,6 +743,7 @@ def create_supervised_evaluator( model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any], Any] = lambda x, y, y_pred: (y_pred, y), amp_mode: Optional[str] = None, + model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x), ) -> Engine: """ Factory function for creating an evaluator for supervised models. @@ -730,6 +764,7 @@ def create_supervised_evaluator( output expected by metrics. If you change it you should use `output_transform` in metrics. amp_mode: can be ``amp``, model will be casted to float16 using `torch.cuda.amp `_ + model_fn: the model function that receives `model` and `x`, and returns `y_pred`. Returns: an evaluator engine with supervised inference function. @@ -754,6 +789,8 @@ def create_supervised_evaluator( Added ``amp_mode`` argument for automatic mixed precision. .. versionchanged:: 0.4.12 Added ``model_transform`` to transform model's output + .. 
versionchanged:: 0.4.13 + Added `model_fn` to customize model's application on the sample """ device_type = device.type if isinstance(device, torch.device) else device on_tpu = "xla" in device_type if device_type is not None else False @@ -768,6 +805,7 @@ def create_supervised_evaluator( prepare_batch=prepare_batch, model_transform=model_transform, output_transform=output_transform, + model_fn=model_fn, ) else: evaluate_step = supervised_evaluation_step( @@ -777,6 +815,7 @@ def create_supervised_evaluator( prepare_batch=prepare_batch, model_transform=model_transform, output_transform=output_transform, + model_fn=model_fn, ) evaluator = Engine(evaluate_step) diff --git a/ignite/engine/engine.py b/ignite/engine/engine.py index 5e7e8c798217..5e4881e6d921 100644 --- a/ignite/engine/engine.py +++ b/ignite/engine/engine.py @@ -1,6 +1,5 @@ import functools import logging -import math import time import warnings import weakref @@ -40,8 +39,14 @@ class Engine(Serializable): .. code-block:: python - def update_model(engine, batch): - inputs, targets = batch + model = ... + model = model.cuda() + optimized = ... + criterion = ... + + def train_step(engine, batch): + model.train() + inputs, targets = batch[0].cuda(), batch[1].cuda() optimizer.zero_grad() outputs = model(inputs) loss = criterion(outputs, targets) @@ -643,7 +648,7 @@ def state_dict_user_keys(self) -> List: return self._state_dict_user_keys def state_dict(self) -> OrderedDict: - """Returns a dictionary containing engine's state: "epoch_length", "max_epochs" and "iteration" and + """Returns a dictionary containing engine's state: "seed", "epoch_length", "max_epochs" and "iteration" and other state values defined by `engine.state_dict_user_keys` .. code-block:: python @@ -676,11 +681,11 @@ def save_engine(_): def load_state_dict(self, state_dict: Mapping) -> None: """Setups engine from `state_dict`. - State dictionary should contain keys: `iteration` or `epoch`, `max_epochs` and `epoch_length`. 
- If `engine.state_dict_user_keys` contains keys, they should be also present in the state dictionary. + State dictionary should contain keys: `iteration` or `epoch` and `max_epochs`, `epoch_length` and + `seed`. If `engine.state_dict_user_keys` contains keys, they should be also present in the state dictionary. Iteration and epoch values are 0-based: the first iteration or epoch is zero. - This method does not remove any custom attributes added by user. + This method does not remove any custom attributes added by user. Args: state_dict: a dict with parameters @@ -725,14 +730,13 @@ def load_state_dict(self, state_dict: Mapping) -> None: @staticmethod def _is_done(state: State) -> bool: - is_done_iters = state.max_iters is not None and state.iteration >= state.max_iters is_done_count = ( state.epoch_length is not None and state.max_epochs is not None and state.iteration >= state.epoch_length * state.max_epochs ) is_done_epochs = state.max_epochs is not None and state.epoch >= state.max_epochs - return is_done_iters or is_done_count or is_done_epochs + return is_done_count or is_done_epochs def set_data(self, data: Union[Iterable, DataLoader]) -> None: """Method to set data. After calling the method the next batch passed to `processing_function` is @@ -774,14 +778,14 @@ def run( self, data: Optional[Iterable] = None, max_epochs: Optional[int] = None, - max_iters: Optional[int] = None, epoch_length: Optional[int] = None, + seed: Optional[int] = None, ) -> State: """Runs the ``process_function`` over the passed data. Engine has a state and the following logic is applied in this function: - - At the first call, new state is defined by `max_epochs`, `max_iters`, `epoch_length`, if provided. + - At the first call, new state is defined by `max_epochs`, `epoch_length`, `seed`, if provided. A timer for total and per-epoch time is initialized when Events.STARTED is handled.
- If state is already defined such that there are iterations to run until `max_epochs` and no input arguments provided, state is kept and used in the function. @@ -799,8 +803,7 @@ def run( `len(data)`. If `data` is an iterator and `epoch_length` is not set, then it will be automatically determined as the iteration on which data iterator raises `StopIteration`. This argument should not change if run is resuming from a state. - max_iters: Number of iterations to run for. - `max_iters` and `max_epochs` are mutually exclusive; only one of the two arguments should be provided. + seed: Deprecated argument. Please, use `torch.manual_seed` or :meth:`~ignite.utils.manual_seed`. Returns: State: output state. @@ -829,6 +832,12 @@ def switch_batch(engine): trainer.run(train_loader, max_epochs=2) """ + if seed is not None: + warnings.warn( + "Argument seed is deprecated. It will be removed in 0.4.14. " + "Please, use torch.manual_seed or ignite.utils.manual_seed" + ) + if data is not None and not isinstance(data, Iterable): raise TypeError("Argument data should be iterable") @@ -852,6 +861,8 @@ def switch_batch(engine): if self.state.max_epochs is None or (self._is_done(self.state) and self._internal_run_generator is None): # Create new state + if max_epochs is None: + max_epochs = 1 if epoch_length is None: if data is None: raise ValueError("epoch_length should be provided if data is None") @@ -860,22 +871,9 @@ def switch_batch(engine): if epoch_length is not None and epoch_length < 1: raise ValueError("Input data has zero size. Please provide non-empty data") - if max_iters is None: - if max_epochs is None: - max_epochs = 1 - else: - if max_epochs is not None: - raise ValueError( - "Arguments max_iters and max_epochs are mutually exclusive." - "Please provide only max_epochs or max_iters." 
- ) - if epoch_length is not None: - max_epochs = math.ceil(max_iters / epoch_length) - self.state.iteration = 0 self.state.epoch = 0 self.state.max_epochs = max_epochs - self.state.max_iters = max_iters self.state.epoch_length = epoch_length # Reset generator if previously used self._internal_run_generator = None @@ -1048,19 +1046,12 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]: if self.state.epoch_length is None: # Define epoch length and stop the epoch self.state.epoch_length = iter_counter - if self.state.max_iters is not None: - self.state.max_epochs = math.ceil(self.state.max_iters / self.state.epoch_length) break # Should exit while loop if we can not iterate if should_exit: - if not self._is_done(self.state): - total_iters = ( - self.state.epoch_length * self.state.max_epochs - if self.state.max_epochs is not None - else self.state.max_iters - ) - + if not self._is_done(self.state) and self.state.max_epochs is not None: + total_iters = self.state.epoch_length * self.state.max_epochs warnings.warn( "Data iterator can not provide data anymore but required total number of " "iterations to run is not reached. 
" @@ -1087,10 +1078,6 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]: if self.state.epoch_length is not None and iter_counter == self.state.epoch_length: break - if self.state.max_iters is not None and self.state.iteration == self.state.max_iters: - self.should_terminate = True - raise _EngineTerminateException() - except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) self.should_terminate_single_epoch = False @@ -1206,19 +1193,12 @@ def _run_once_on_dataset_legacy(self) -> float: if self.state.epoch_length is None: # Define epoch length and stop the epoch self.state.epoch_length = iter_counter - if self.state.max_iters is not None: - self.state.max_epochs = math.ceil(self.state.max_iters / self.state.epoch_length) break # Should exit while loop if we can not iterate if should_exit: - if not self._is_done(self.state): - total_iters = ( - self.state.epoch_length * self.state.max_epochs - if self.state.max_epochs is not None - else self.state.max_iters - ) - + if not self._is_done(self.state) and self.state.max_epochs is not None: + total_iters = self.state.epoch_length * self.state.max_epochs warnings.warn( "Data iterator can not provide data anymore but required total number of " "iterations to run is not reached. 
" @@ -1245,10 +1225,6 @@ def _run_once_on_dataset_legacy(self) -> float: if self.state.epoch_length is not None and iter_counter == self.state.epoch_length: break - if self.state.max_iters is not None and self.state.iteration == self.state.max_iters: - self.should_terminate = True - raise _EngineTerminateException() - except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) self.should_terminate_single_epoch = False diff --git a/ignite/engine/events.py b/ignite/engine/events.py index 9dd99348492b..c41d79468f34 100644 --- a/ignite/engine/events.py +++ b/ignite/engine/events.py @@ -203,6 +203,17 @@ def __or__(self, other: Any) -> "EventsList": return EventsList() | self | other +class CallableEvents(CallableEventWithFilter): + # For backward compatibility + def __init__(self, *args: Any, **kwargs: Any) -> None: + super(CallableEvents, self).__init__(*args, **kwargs) + warnings.warn( + "Class ignite.engine.events.CallableEvents is deprecated. It will be removed in 0.4.14. " + "Please, use ignite.engine.EventEnum instead", + DeprecationWarning, + ) + + class EventEnum(CallableEventWithFilter, Enum): """Base class for all :class:`~ignite.engine.events.Events`. User defined custom events should also inherit this class. 
@@ -443,7 +454,6 @@ class State: state.dataloader # data passed to engine state.epoch_length # optional length of an epoch state.max_epochs # number of epochs to run - state.max_iters # number of iterations to run state.batch # batch passed to `process_function` state.output # output of `process_function` after a single iteration state.metrics # dictionary with defined metrics if any @@ -470,7 +480,6 @@ def __init__(self, **kwargs: Any) -> None: self.epoch = 0 self.epoch_length: Optional[int] = None self.max_epochs: Optional[int] = None - self.max_iters: Optional[int] = None self.output: Optional[int] = None self.batch: Optional[int] = None self.metrics: Dict[str, Any] = {} diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index f508f0170220..901810516c51 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -7,7 +7,7 @@ from abc import ABCMeta, abstractmethod from collections import OrderedDict from pathlib import Path -from typing import Any, Callable, Dict, List, Mapping, NamedTuple, Optional, Tuple, Union +from typing import Any, Callable, cast, Dict, List, Mapping, NamedTuple, Optional, Union import torch import torch.nn as nn @@ -23,6 +23,7 @@ import ignite.distributed as idist from ignite.base import Serializable from ignite.engine import Engine, Events +from ignite.utils import _tree_apply2, _tree_map __all__ = ["Checkpoint", "DiskSaver", "ModelCheckpoint", "BaseSaveHandler"] @@ -82,7 +83,7 @@ class Checkpoint(Serializable): ``load_state_dict`` methods. If contains objects of type torch `DistributedDataParallel`_ or `DataParallel`_, their internal wrapped model is automatically saved (to avoid additional key ``module.`` in the state dictionary). - save_handler: String, method or callable class + save_handler: String, function or callable object. used to save engine and other provided objects. Function receives two objects: checkpoint as a dictionary and filename. 
If ``save_handler`` is callable class, it can inherit of :class:`~ignite.handlers.checkpoint.BaseSaveHandler` and optionally implement ``remove`` method @@ -102,6 +103,7 @@ class Checkpoint(Serializable): Input of the function is ``(engine, event_name)``. Output of function should be an integer. Default is None, global_step based on attached engine. If provided, uses function output as global_step. To setup global step from another engine, please use :meth:`~ignite.handlers.global_step_from_engine`. + archived: Deprecated argument as models saved by ``torch.save`` are already compressed. filename_pattern: If ``filename_pattern`` is provided, this pattern will be used to render checkpoint filenames. If the pattern is not defined, the default pattern would be used. See Note for details. @@ -277,7 +279,7 @@ class Checkpoint(Serializable): """ Item = NamedTuple("Item", [("priority", int), ("filename", str)]) - _state_dict_all_req_keys = ("saved",) + _state_dict_all_req_keys = ("_saved",) def __init__( self, @@ -288,12 +290,12 @@ def __init__( score_name: Optional[str] = None, n_saved: Union[int, None] = 1, global_step_transform: Optional[Callable] = None, + archived: bool = False, filename_pattern: Optional[str] = None, include_self: bool = False, greater_or_equal: bool = False, save_on_rank: int = 0, ): - if not isinstance(to_save, collections.Mapping): raise TypeError(f"Argument `to_save` should be a dictionary, but given {type(to_save)}") @@ -320,6 +322,8 @@ def __init__( if global_step_transform is not None and not callable(global_step_transform): raise TypeError(f"global_step_transform should be a function, got {type(global_step_transform)} instead.") + if archived: + warnings.warn("Argument archived is deprecated and will be removed in 0.4.14") self.to_save = to_save self.filename_prefix = filename_prefix @@ -401,7 +405,6 @@ def _compare_fn(self, new: Union[int, float]) -> bool: return new > self._saved[0].priority def __call__(self, engine: Engine) -> None: - 
global_step = None if self.global_step_transform is not None: global_step = self.global_step_transform(engine, engine.last_event_name) @@ -416,7 +419,6 @@ def __call__(self, engine: Engine) -> None: priority = global_step if self._check_lt_n_saved() or self._compare_fn(priority): - priority_str = f"{priority}" if isinstance(priority, numbers.Integral) else f"{priority:.4f}" checkpoint = self._setup_checkpoint() @@ -469,18 +471,20 @@ def __call__(self, engine: Engine) -> None: except TypeError: self.save_handler(checkpoint, filename) - def _setup_checkpoint(self) -> Dict[str, Dict[Any, Any]]: - checkpoint = {} + def _setup_checkpoint(self) -> Dict[str, Any]: if self.to_save is not None: - for k, obj in self.to_save.items(): + + def func(obj: Any, **kwargs: Any) -> Dict: if isinstance(obj, (nn.DataParallel, nn.parallel.DistributedDataParallel)): obj = obj.module elif HAVE_ZERO and isinstance(obj, ZeroRedundancyOptimizer): obj.consolidate_state_dict(to=self.save_on_rank) if self.save_on_rank != idist.get_rank(): - continue - checkpoint[k] = obj.state_dict() - return checkpoint + return {} + return obj.state_dict() + + return cast(Dict[str, Any], _tree_map(func, self.to_save)) + return {} @staticmethod def setup_filename_pattern( @@ -535,10 +539,12 @@ def setup_filename_pattern( @staticmethod def _check_objects(objs: Mapping, attr: str) -> None: - for k, obj in objs.items(): + def func(obj: Any, **kwargs: Any) -> None: if not hasattr(obj, attr): raise TypeError(f"Object {type(obj)} should have `{attr}` method") + _tree_map(func, objs) + @staticmethod def load_objects(to_load: Mapping, checkpoint: Union[str, Mapping, Path], **kwargs: Any) -> None: """Helper method to apply ``load_state_dict`` on the objects from ``to_load`` using states from ``checkpoint``. @@ -594,26 +600,22 @@ def load_objects(to_load: Mapping, checkpoint: Union[str, Mapping, Path], **kwar torch.nn.parallel.DistributedDataParallel.html .. 
_DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html """ + if not isinstance(checkpoint, (collections.Mapping, str, Path)): + raise TypeError(f"Argument checkpoint should be a string or a dictionary, but given {type(checkpoint)}") + + Checkpoint._check_objects(to_load, "load_state_dict") if isinstance(checkpoint, (str, Path)): checkpoint_obj = torch.load(checkpoint) else: checkpoint_obj = checkpoint - Checkpoint._check_objects(to_load, "load_state_dict") - if not isinstance(checkpoint, (collections.Mapping, str, Path)): - raise TypeError(f"Argument checkpoint should be a string or a dictionary, but given {type(checkpoint)}") - - if len(kwargs) > 1 or any(k for k in kwargs if k not in ["strict"]): - warnings.warn("kwargs contains keys other than strict and these will be ignored") - - is_state_dict_strict = kwargs.get("strict", True) - def _load_object(obj: Any, chkpt_obj: Any) -> None: if isinstance(obj, (nn.DataParallel, nn.parallel.DistributedDataParallel)): obj = obj.module + if isinstance(obj, torch.nn.Module): - obj.load_state_dict(chkpt_obj, strict=is_state_dict_strict) + obj.load_state_dict(chkpt_obj, **kwargs) else: obj.load_state_dict(chkpt_obj) @@ -624,11 +626,7 @@ def _load_object(obj: Any, chkpt_obj: Any) -> None: _load_object(obj, checkpoint_obj) return - # multiple objects to load - for k, obj in to_load.items(): - if k not in checkpoint_obj: - raise ValueError(f"Object labeled by '{k}' from `to_load` is not found in the checkpoint") - _load_object(obj, checkpoint_obj[k]) + _tree_apply2(_load_object, to_load, checkpoint_obj) def reload_objects(self, to_load: Mapping, load_kwargs: Optional[Dict] = None, **filename_components: Any) -> None: """Helper method to apply ``load_state_dict`` on the objects from ``to_load``. 
Filename components such as @@ -672,10 +670,18 @@ def reload_objects(self, to_load: Mapping, load_kwargs: Optional[Dict] = None, * If ``to_load`` contains objects of type torch `DistributedDataParallel`_ or `DataParallel`_, method ``load_state_dict`` will applied to their internal wrapped model (``obj.module``). + Note: + This method works only when the ``save_handler`` is of types string, + :class:`~pathlib.Path` or :class:`~ignite.handlers.checkpoint.DiskSaver`. + .. _DistributedDataParallel: https://pytorch.org/docs/stable/generated/ torch.nn.parallel.DistributedDataParallel.html .. _DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html """ + if not isinstance(self.save_handler, DiskSaver): + raise AttributeError( + f"Checkpoint's `save_handler` should be of type `DiskSaver`, given {type(self.save_handler)}" + ) global_step = filename_components.get("global_step", None) @@ -706,20 +712,21 @@ def reload_objects(self, to_load: Mapping, load_kwargs: Optional[Dict] = None, * Checkpoint.load_objects(to_load=to_load, checkpoint=path, **load_kwargs) - def state_dict(self) -> "OrderedDict[str, List[Tuple[int, str]]]": + def state_dict(self) -> OrderedDict: """Method returns state dict with saved items: list of ``(priority, filename)`` pairs. Can be used to save internal state of the class. """ - return OrderedDict([("saved", [(p, f) for p, f in self._saved])]) + # TODO: this method should use _state_dict_all_req_keys + return OrderedDict([("_saved", [(p, f) for p, f in self._saved])]) def load_state_dict(self, state_dict: Mapping) -> None: - """Method replace internal state of the class with provided state dict data. + """Method replaces internal state of the class with provided state dict data. Args: state_dict: a dict with "saved" key and list of ``(priority, filename)`` pairs as values. 
""" super().load_state_dict(state_dict) - self._saved = [Checkpoint.Item(p, f) for p, f in state_dict["saved"]] + self._saved = [Checkpoint.Item(p, f) for p, f in state_dict["_saved"]] @staticmethod def get_default_score_fn(metric_name: str, score_sign: float = 1.0) -> Callable: @@ -878,6 +885,11 @@ class ModelCheckpoint(Checkpoint): Behaviour of this class has been changed since v0.3.0. + Argument ``save_as_state_dict`` is deprecated and should not be used. It is considered as True. + + Argument ``save_interval`` is deprecated and should not be used. Please, use events filtering instead, e.g. + ``Events.ITERATION_STARTED(every=1000)``. + There is no more internal counter that has been used to indicate the number of save actions. User could see its value `step_number` in the filename, e.g. `{filename_prefix}_{name}_{step_number}.pt`. Actually, `step_number` is replaced by current engine's epoch if `score_function` is specified and current iteration @@ -906,6 +918,7 @@ class ModelCheckpoint(Checkpoint): Input of the function is `(engine, event_name)`. Output of function should be an integer. Default is None, global_step based on attached engine. If provided, uses function output as global_step. To setup global step from another engine, please use :meth:`~ignite.handlers.global_step_from_engine`. + archived: Deprecated argument as models saved by `torch.save` are already compressed. filename_pattern: If ``filename_pattern`` is provided, this pattern will be used to render checkpoint filenames. If the pattern is not defined, the default pattern would be used. See :class:`~ignite.handlers.checkpoint.Checkpoint` for details. 
@@ -952,19 +965,38 @@ def __init__( self, dirname: Union[str, Path], filename_prefix: str = "", + save_interval: Optional[int] = None, score_function: Optional[Callable] = None, score_name: Optional[str] = None, n_saved: Union[int, None] = 1, atomic: bool = True, require_empty: bool = True, create_dir: bool = True, + save_as_state_dict: bool = True, global_step_transform: Optional[Callable] = None, + archived: bool = False, filename_pattern: Optional[str] = None, include_self: bool = False, greater_or_equal: bool = False, save_on_rank: int = 0, **kwargs: Any, ): + if not save_as_state_dict: + raise ValueError( + "Argument save_as_state_dict is deprecated and should be True." + "This argument will be removed in 0.4.14." + ) + if save_interval is not None: + msg = ( + "Argument save_interval is deprecated and should be None. This argument will be removed in 0.4.14." + "Please, use events filtering instead, e.g. Events.ITERATION_STARTED(every=1000)" + ) + if save_interval == 1: + # Do not break for old version who used `save_interval=1` + warnings.warn(msg) + else: + # No choice + raise ValueError(msg) disk_saver = DiskSaver( dirname, @@ -984,6 +1016,7 @@ def __init__( n_saved=n_saved, global_step_transform=global_step_transform, filename_pattern=filename_pattern, + archived=archived, include_self=include_self, greater_or_equal=greater_or_equal, save_on_rank=save_on_rank, @@ -1000,7 +1033,6 @@ def last_checkpoint(self) -> Optional[Union[str, Path]]: return self.save_handler.dirname / self._saved[-1].filename def __call__(self, engine: Engine, to_save: Mapping): # type: ignore - if len(to_save) == 0: raise RuntimeError("No objects to checkpoint found.") diff --git a/ignite/handlers/early_stopping.py b/ignite/handlers/early_stopping.py index 3eaed6791c72..d308f8499e00 100644 --- a/ignite/handlers/early_stopping.py +++ b/ignite/handlers/early_stopping.py @@ -50,7 +50,6 @@ def __init__( min_delta: float = 0.0, cumulative_delta: bool = False, ): - if not 
callable(score_function): raise TypeError("Argument score_function should be a function.") diff --git a/ignite/handlers/lr_finder.py b/ignite/handlers/lr_finder.py index 69c176e93da3..1aad00938337 100644 --- a/ignite/handlers/lr_finder.py +++ b/ignite/handlers/lr_finder.py @@ -94,7 +94,6 @@ def _run( smooth_f: float, diverge_th: float, ) -> None: - self._history = {"lr": [], "loss": []} self._best_loss = None self._diverge_flag = False @@ -106,7 +105,6 @@ def _run( max_iter = trainer.state.epoch_length * trainer.state.max_epochs # type: ignore[operator] if max_iter < num_iter: max_iter = num_iter - trainer.state.max_iters = num_iter trainer.state.max_epochs = ceil(num_iter / trainer.state.epoch_length) # type: ignore[operator] if not trainer.has_event_handler(self._reached_num_iterations): diff --git a/ignite/handlers/param_scheduler.py b/ignite/handlers/param_scheduler.py index 9ece11f106d5..c554b04bce70 100644 --- a/ignite/handlers/param_scheduler.py +++ b/ignite/handlers/param_scheduler.py @@ -10,7 +10,7 @@ from typing import Any, cast, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union import torch -from torch.optim.lr_scheduler import ReduceLROnPlateau +from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau from torch.optim.optimizer import Optimizer # https://github.com/pytorch/ignite/issues/2773 @@ -193,8 +193,7 @@ def __init__( self._state_attrs += ["param_group_index"] def __call__(self, engine: Optional[Engine], name: Optional[str] = None) -> None: - - value = self.get_param() + value = self._get_param() if isinstance(value, list): if len(value) != len(self.optimizer_param_groups): @@ -262,6 +261,11 @@ def simulate_values(cls, num_events: int, **scheduler_kwargs: Any) -> List[List[ values.append([i, scheduler.optimizer_param_groups[0][scheduler.param_name]]) return values + def _get_param(self) -> Union[List[float], float]: + # `ParamScheduler` does nothing special, only returning what child class returns. 
+ # Intermediate child classes edit this method + return self.get_param() + class CyclicalScheduler(ParamScheduler): """An abstract class for updating an optimizer's parameter value over a @@ -280,6 +284,9 @@ class CyclicalScheduler(ParamScheduler): end of each cycle (default=1.0). end_value_mult: ratio by which to change the end value at the end of each cycle (default=1.0). + warmup_duration: duration of warm-up to be applied before each cycle. + Through this warm-up, the parameter starts from the last cycle's end value + and linearly goes to next cycle's start value. Default is no cyclic warm-up. save_history: whether to log the parameter values to `engine.state.param_history`, (default=False). param_group_index: optimizer's parameters group to use. @@ -289,6 +296,9 @@ class CyclicalScheduler(ParamScheduler): usually be the number of batches in an epoch. .. versionadded:: 0.4.5 + + .. versionchanged:: 0.4.13 + Added cyclic warm-up to the scheduler using ``warmup_duration``. """ def __init__( @@ -301,6 +311,7 @@ def __init__( cycle_mult: float = 1.0, start_value_mult: float = 1.0, end_value_mult: float = 1.0, + warmup_duration: int = 0, save_history: bool = False, param_group_index: Optional[int] = None, ): @@ -309,11 +320,13 @@ def __init__( ) self.start_value = start_value self.end_value = end_value - self.cycle_size = int(cycle_size) # Ensure cycle_size is integer + self.cycle_size = cycle_size self.cycle_mult = cycle_mult self.cycle = 0 self.start_value_mult = start_value_mult self.end_value_mult = end_value_mult + self.warmup_duration = warmup_duration + self.total_cycle_size = self.warmup_duration + self.cycle_size if self.cycle_size < 2: raise ValueError(f"Argument cycle_size should be positive and larger than 1, but given {cycle_size}") @@ -326,18 +339,33 @@ def __init__( "cycle", "start_value_mult", "end_value_mult", + "warmup_duration", + "total_cycle_size", ] def __call__(self, engine: Optional[Engine], name: Optional[str] = None) -> None: - if 
self.event_index != 0 and self.event_index % self.cycle_size == 0: + if self.event_index != 0 and self.event_index == self.cycle_size: + self.start_value *= self.start_value_mult + if self.event_index != 0 and self.event_index == self.total_cycle_size: self.event_index = 0 self.cycle_size = int(self.cycle_size * self.cycle_mult) + self.warmup_duration = int(self.warmup_duration * self.cycle_mult) + self.total_cycle_size = self.warmup_duration + self.cycle_size self.cycle += 1 - self.start_value *= self.start_value_mult self.end_value *= self.end_value_mult return super(CyclicalScheduler, self).__call__(engine, name) + def _get_param(self) -> Union[List[float], float]: + """Applies warm-up if the scheduler is in the warm-up phase, + otherwise returns what is returned by `self.get_param()` + """ + if self.event_index > self.cycle_size: + warmup_progress = (self.event_index - self.cycle_size) / self.warmup_duration + return self.end_value + (self.start_value - self.end_value) * warmup_progress + + return self.get_param() + class LinearCyclicalScheduler(CyclicalScheduler): """Linearly adjusts param value to 'end_value' for a half-cycle, then linearly @@ -356,6 +384,9 @@ class LinearCyclicalScheduler(CyclicalScheduler): end of each cycle (default=1.0). end_value_mult: ratio by which to change the end value at the end of each cycle (default=1.0). + warmup_duration: duration of warm-up to be applied before each cycle. + Through this warm-up, the parameter starts from the last cycle's end value + and linearly goes to next cycle's start value. Default is no cyclic warm-up. save_history: whether to log the parameter values to `engine.state.param_history`, (default=False). param_group_index: optimizer's parameters group to use. @@ -431,9 +462,13 @@ def print_lr(): ... .. versionadded:: 0.4.5 + + .. versionchanged:: 0.4.13 + Added cyclic warm-up to the scheduler using ``warmup_duration``. 
""" def get_param(self) -> float: + """Method to get current optimizer's parameter value""" cycle_progress = self.event_index / self.cycle_size return self.end_value + (self.start_value - self.end_value) * abs(cycle_progress - 0.5) * 2 @@ -457,6 +492,9 @@ class CosineAnnealingScheduler(CyclicalScheduler): end of each cycle (default=1.0). end_value_mult: ratio by which to change the end value at the end of each cycle (default=1.0). + warmup_duration: duration of warm-up to be applied before each cycle. + Through this warm-up, the parameter starts from the last cycle's end value + and linearly goes to next cycle's start value. Default is no cyclic warm-up. save_history: whether to log the parameter values to `engine.state.param_history`, (default=False). param_group_index: optimizer's parameters group to use. @@ -535,6 +573,9 @@ def print_lr(): Applications of Computer Vision (WACV), 2017 IEEE Winter Conference on. IEEE, 2017 .. versionadded:: 0.4.5 + + .. versionchanged:: 0.4.13 + Added cyclic warm-up to the scheduler using ``warmup_duration``. 
""" def get_param(self) -> float: @@ -595,7 +636,6 @@ def print_lr(): """ def __init__(self, schedulers: List[ParamScheduler], durations: List[int], save_history: bool = False): - if not isinstance(schedulers, Sequence): raise TypeError(f"Argument schedulers should be a sequence, but given {schedulers}") @@ -794,6 +834,57 @@ def simulate_values( # type: ignore[override] return output +class _CosineAnnealingWarmRestarts: + def __init__(self, lr_scheduler: CosineAnnealingWarmRestarts): + self._lr_scheduler = lr_scheduler + + @property + def last_epoch(self) -> int: + return self._lr_scheduler.last_epoch + + @last_epoch.setter + def last_epoch(self, value: int) -> None: + self._lr_scheduler.last_epoch = value + + @property + def optimizer(self) -> torch.optim.Optimizer: + return self._lr_scheduler.optimizer + + def get_lr(self, epoch: Optional[int] = None) -> List[float]: + T_mult = self._lr_scheduler.T_mult + eta_min = self._lr_scheduler.eta_min + + if epoch is None and self.last_epoch < 0: + epoch = 0 + if epoch is None: + epoch = self.last_epoch + 1 + self._lr_scheduler.T_cur = self._lr_scheduler.T_cur + 1 + if self._lr_scheduler.T_cur >= self._lr_scheduler.T_i: + self._lr_scheduler.T_cur = self._lr_scheduler.T_cur - self._lr_scheduler.T_i + self._lr_scheduler.T_i = self._lr_scheduler.T_i * T_mult + else: + if epoch < 0: + raise ValueError("Expected non-negative epoch, but got {}".format(epoch)) + if epoch >= self._lr_scheduler.T_0: + if T_mult == 1: + self._lr_scheduler.T_cur = epoch % self._lr_scheduler.T_0 + else: + n = int(math.log((epoch / self._lr_scheduler.T_0 * (T_mult - 1) + 1), T_mult)) + self._lr_scheduler.T_cur = epoch - self._lr_scheduler.T_0 * (T_mult**n - 1) / (T_mult - 1) + self._lr_scheduler.T_i = self._lr_scheduler.T_0 * T_mult**n + else: + self._lr_scheduler.T_i = self._lr_scheduler.T_0 + self._lr_scheduler.T_cur = epoch + + self.last_epoch = math.floor(epoch) + + return [ + eta_min + + (base_lr - eta_min) * (1 + math.cos(math.pi * 
self._lr_scheduler.T_cur / self._lr_scheduler.T_i)) / 2 + for base_lr in self._lr_scheduler.base_lrs + ] + + class LRScheduler(ParamScheduler): """A wrapper class to call `torch.optim.lr_scheduler` objects as `ignite` handlers. @@ -848,7 +939,6 @@ def __init__( save_history: bool = False, use_legacy: bool = False, ): - if not isinstance(lr_scheduler, PyTorchLRScheduler): raise TypeError( "Argument lr_scheduler should be a subclass of " @@ -856,7 +946,10 @@ def __init__( f"but given {type(lr_scheduler)}" ) - self.lr_scheduler = lr_scheduler + self.lr_scheduler: Union[PyTorchLRScheduler, _CosineAnnealingWarmRestarts] = lr_scheduler + if isinstance(lr_scheduler, CosineAnnealingWarmRestarts): + self.lr_scheduler = _CosineAnnealingWarmRestarts(lr_scheduler) + super(LRScheduler, self).__init__( optimizer=self.lr_scheduler.optimizer, param_name="lr", @@ -866,7 +959,7 @@ def __init__( warnings.warn( "Please make sure to attach scheduler to Events.ITERATION_COMPLETED " "instead of Events.ITERATION_STARTED to make sure to use " - "the first lr value from the optimizer, otherwise it is will be skipped" + "the first lr value from the optimizer, otherwise it will be skipped" ) self.lr_scheduler.last_epoch += 1 @@ -879,9 +972,9 @@ def __call__(self, engine: Optional[Engine], name: Optional[str] = None) -> None def get_param(self) -> Union[float, List[float]]: """Method to get current optimizer's parameter value""" # Emulate context manager for pytorch>=1.4 - self.lr_scheduler._get_lr_called_within_step = True # type: ignore[attr-defined] + self.lr_scheduler._get_lr_called_within_step = True # type: ignore[union-attr] lr_list = cast(List[float], self.lr_scheduler.get_lr()) - self.lr_scheduler._get_lr_called_within_step = False # type: ignore[attr-defined] + self.lr_scheduler._get_lr_called_within_step = False # type: ignore[union-attr] if len(lr_list) == 1: return lr_list[0] else: @@ -1019,7 +1112,6 @@ def print_lr(): warmup_schedulers: List[ParamScheduler] = [] for 
param_group_index, param_group in enumerate(lr_scheduler.optimizer.param_groups): - if warmup_end_value is None: param_group_warmup_end_value = param_group["lr"] else: @@ -1458,20 +1550,19 @@ class ReduceLROnPlateauScheduler(ParamScheduler): Default: False. param_group_index: `optimizer`'s parameters group to use. Default: None. Use all `optimizer`'s paramater groups. - **scheduler_kwargs: Keyword arguments to be passed to the wrapped - `ReduceLROnPlateau`. + scheduler_kwargs: Keyword arguments to be passed to the wrapped ``ReduceLROnPlateau``. Examples: - .. code-block python + .. code-block:: python - # Metric 'metric-name' should surpass its best value by + # Metric "accuracy" should increase the best value by # more than 1 unit after at most 2 epochs, otherwise LR # would get multiplied by 0.5 . scheduler = ReduceLROnPlateauScheduler( default_optimizer, - metric_name="metric-name", mode="max", + metric_name="accuracy", mode="max", factor=0.5, patience=1, threshold_mode='abs', threshold=1, trainer=trainer ) @@ -1488,10 +1579,10 @@ class ReduceLROnPlateauScheduler(ParamScheduler): default_trainer = get_default_trainer() - # Metric `loss` should decrease more than - # a tenth of best loss after at most + # Metric "loss" should decrease more than + # 0.1 of best loss after at most # three iterations. 
Then best loss would get - # updated, otherwise lr is multiplied by 2 + # updated, otherwise lr is multiplied by 0.5 scheduler = ReduceLROnPlateauScheduler( default_optimizer, "loss", diff --git a/ignite/handlers/state_param_scheduler.py b/ignite/handlers/state_param_scheduler.py index 03099c59739e..5922ee1f8b24 100644 --- a/ignite/handlers/state_param_scheduler.py +++ b/ignite/handlers/state_param_scheduler.py @@ -382,7 +382,7 @@ def __init__( self._state_attrs += ["initial_value", "gamma"] def get_param(self) -> Union[List[float], float]: - return self.initial_value * self.gamma ** self.event_index + return self.initial_value * self.gamma**self.event_index class StepStateScheduler(StateParamScheduler): diff --git a/ignite/handlers/terminate_on_nan.py b/ignite/handlers/terminate_on_nan.py index 32f8053c652a..bf8baa5a73e0 100644 --- a/ignite/handlers/terminate_on_nan.py +++ b/ignite/handlers/terminate_on_nan.py @@ -40,7 +40,6 @@ def __call__(self, engine: Engine) -> None: output = self._output_transform(engine.state.output) def raise_error(x: Union[float, torch.Tensor]) -> None: - if isinstance(x, numbers.Number): x = torch.tensor(x) diff --git a/ignite/handlers/time_limit.py b/ignite/handlers/time_limit.py index e5fb4ad7a25f..4ab934a2534d 100644 --- a/ignite/handlers/time_limit.py +++ b/ignite/handlers/time_limit.py @@ -29,7 +29,6 @@ class TimeLimit: """ def __init__(self, limit_sec: Optional[int] = 28800): - if not isinstance(limit_sec, int): raise TypeError("Argument limit_sec should be an integer.") if limit_sec <= 0: diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py index af73f88266c8..426b35a21abc 100644 --- a/ignite/metrics/accumulation.py +++ b/ignite/metrics/accumulation.py @@ -38,6 +38,7 @@ class VariableAccumulation(Metric): """ required_output_keys = None + _state_dict_all_req_keys = ("accumulator", "num_examples") def __init__( self, diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py index 
9548f962bf98..0bfe62b85b7b 100644 --- a/ignite/metrics/accuracy.py +++ b/ignite/metrics/accuracy.py @@ -51,10 +51,10 @@ def _check_shape(self, output: Sequence[torch.Tensor]) -> None: def _check_binary_multilabel_cases(self, output: Sequence[torch.Tensor]) -> None: y_pred, y = output - if not torch.equal(y, y ** 2): + if not torch.equal(y, y**2): raise ValueError("For binary cases, y must be comprised of 0's and 1's.") - if not torch.equal(y_pred, y_pred ** 2): + if not torch.equal(y_pred, y_pred**2): raise ValueError("For binary cases, y_pred must be comprised of 0's and 1's.") def _check_type(self, output: Sequence[torch.Tensor]) -> None: @@ -208,6 +208,8 @@ def thresholded_output_transform(output): 0.6666... """ + _state_dict_all_req_keys = ("_num_correct", "_num_examples") + def __init__( self, output_transform: Callable = lambda x: x, diff --git a/ignite/metrics/classification_report.py b/ignite/metrics/classification_report.py index 40809b0eef5c..55613dc8d8cd 100644 --- a/ignite/metrics/classification_report.py +++ b/ignite/metrics/classification_report.py @@ -131,12 +131,12 @@ def _wrapper( dict_obj[_get_label_for_class(idx)] = { "precision": p_label.item(), "recall": re[idx].item(), - "f{0}-score".format(beta): f[idx].item(), + f"f{beta}-score": f[idx].item(), } dict_obj["macro avg"] = { "precision": a_pr.item(), "recall": a_re.item(), - "f{0}-score".format(beta): a_f.item(), + f"f{beta}-score": a_f.item(), } return dict_obj if output_dict else json.dumps(dict_obj) diff --git a/ignite/metrics/confusion_matrix.py b/ignite/metrics/confusion_matrix.py index a5021631cddb..a55bbedebb8f 100644 --- a/ignite/metrics/confusion_matrix.py +++ b/ignite/metrics/confusion_matrix.py @@ -99,6 +99,8 @@ def binary_one_hot_output_transform(output): [1, 1]]) """ + _state_dict_all_req_keys = ("confusion_matrix", "_num_examples") + def __init__( self, num_classes: int, @@ -166,7 +168,7 @@ def update(self, output: Sequence[torch.Tensor]) -> None: y_pred = y_pred[target_mask] 
indices = self.num_classes * y + y_pred - m = torch.bincount(indices, minlength=self.num_classes ** 2).reshape(self.num_classes, self.num_classes) + m = torch.bincount(indices, minlength=self.num_classes**2).reshape(self.num_classes, self.num_classes) self.confusion_matrix += m.to(self.confusion_matrix) @sync_all_reduce("confusion_matrix", "_num_examples") diff --git a/ignite/metrics/epoch_metric.py b/ignite/metrics/epoch_metric.py index 21b199bfd542..116a841e49ff 100644 --- a/ignite/metrics/epoch_metric.py +++ b/ignite/metrics/epoch_metric.py @@ -67,6 +67,8 @@ def mse_fn(y_preds, y_targets): To disable the warning, set ``check_compute_fn=False``. """ + _state_dict_all_req_keys = ("_predictions", "_targets") + def __init__( self, compute_fn: Callable[[torch.Tensor, torch.Tensor], float], @@ -74,7 +76,6 @@ def __init__( check_compute_fn: bool = True, device: Union[str, torch.device] = torch.device("cpu"), ) -> None: - if not callable(compute_fn): raise TypeError("Argument compute_fn should be callable.") diff --git a/ignite/metrics/fbeta.py b/ignite/metrics/fbeta.py index 6e87ed4910e5..6522efc64231 100644 --- a/ignite/metrics/fbeta.py +++ b/ignite/metrics/fbeta.py @@ -167,7 +167,7 @@ def thresholded_output_transform(output): elif recall._average: raise ValueError("Input recall metric should have average=False") - fbeta = (1.0 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall + 1e-15) + fbeta = (1.0 + beta**2) * precision * recall / (beta**2 * precision + recall + 1e-15) if average: fbeta = fbeta.mean().item() diff --git a/ignite/metrics/gan/fid.py b/ignite/metrics/gan/fid.py index fc1065f697b8..54c5df06f57d 100644 --- a/ignite/metrics/gan/fid.py +++ b/ignite/metrics/gan/fid.py @@ -21,7 +21,6 @@ def fid_score( mu1: torch.Tensor, mu2: torch.Tensor, sigma1: torch.Tensor, sigma2: torch.Tensor, eps: float = 1e-6 ) -> float: - try: import numpy as np except ImportError: @@ -164,6 +163,8 @@ def forward(self, x): .. 
versionadded:: 0.4.6 """ + _state_dict_all_req_keys = ("_num_examples", "_train_total", "_test_total", "_train_sigma", "_test_sigma") + def __init__( self, num_features: Optional[int] = None, @@ -171,7 +172,6 @@ def __init__( output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), ) -> None: - try: import numpy as np # noqa: F401 except ImportError: @@ -197,7 +197,6 @@ def __init__( @staticmethod def _online_update(features: torch.Tensor, total: torch.Tensor, sigma: torch.Tensor) -> None: - total += features sigma += torch_outer(features, features) @@ -213,7 +212,6 @@ def _get_covariance(self, sigma: torch.Tensor, total: torch.Tensor) -> torch.Ten @reinit__is_reduced def reset(self) -> None: - self._train_sigma = torch.zeros( (self._num_features, self._num_features), dtype=torch.float64, device=self._device ) @@ -231,7 +229,6 @@ def reset(self) -> None: @reinit__is_reduced def update(self, output: Sequence[torch.Tensor]) -> None: - train, test = output train_features = self._extract_features(train) test_features = self._extract_features(test) @@ -255,7 +252,6 @@ def update(self, output: Sequence[torch.Tensor]) -> None: @sync_all_reduce("_num_examples", "_train_total", "_test_total", "_train_sigma", "_test_sigma") def compute(self) -> float: - fid = fid_score( mu1=self._train_total / self._num_examples, mu2=self._test_total / self._num_examples, diff --git a/ignite/metrics/gan/inception_score.py b/ignite/metrics/gan/inception_score.py index 9676194019d7..60b1d4785f71 100644 --- a/ignite/metrics/gan/inception_score.py +++ b/ignite/metrics/gan/inception_score.py @@ -77,6 +77,8 @@ class InceptionScore(_BaseInceptionMetric): .. 
versionadded:: 0.4.6 """ + _state_dict_all_req_keys = ("_num_examples", "_prob_total", "_total_kl_d") + def __init__( self, num_features: Optional[int] = None, @@ -84,7 +86,6 @@ def __init__( output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), ) -> None: - if num_features is None and feature_extractor is None: num_features = 1000 feature_extractor = InceptionModel(return_features=False, device=device) @@ -100,7 +101,6 @@ def __init__( @reinit__is_reduced def reset(self) -> None: - self._num_examples = 0 self._prob_total = torch.zeros(self._num_features, dtype=torch.float64, device=self._device) @@ -110,7 +110,6 @@ def reset(self) -> None: @reinit__is_reduced def update(self, output: torch.Tensor) -> None: - probabilities = self._extract_features(output) prob_sum = torch.sum(probabilities, 0, dtype=torch.float64) @@ -125,7 +124,6 @@ def update(self, output: torch.Tensor) -> None: @sync_all_reduce("_num_examples", "_prob_total", "_total_kl_d") def compute(self) -> float: - if self._num_examples == 0: raise NotComputableError("InceptionScore must have at least one example before it can be computed.") diff --git a/ignite/metrics/gan/utils.py b/ignite/metrics/gan/utils.py index dc501867353b..f8226dafd1df 100644 --- a/ignite/metrics/gan/utils.py +++ b/ignite/metrics/gan/utils.py @@ -57,7 +57,6 @@ def __init__( output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), ) -> None: - if num_features is None: raise ValueError("Argument num_features must be provided, if feature_extractor is specified.") @@ -78,7 +77,6 @@ def __init__( super(_BaseInceptionMetric, self).__init__(output_transform=output_transform, device=device) def _check_feature_shapes(self, samples: torch.Tensor) -> None: - if samples.dim() != 2: raise ValueError(f"feature_extractor output must be a tensor of dim 2, got: {samples.dim()}") @@ -91,7 +89,6 @@ def _check_feature_shapes(self, samples: torch.Tensor) -> None: ) def 
_extract_features(self, inputs: torch.Tensor) -> torch.Tensor: - inputs = inputs.detach() if inputs.device != torch.device(self._device): diff --git a/ignite/metrics/loss.py b/ignite/metrics/loss.py index 71f67db4979f..7182e7033d54 100644 --- a/ignite/metrics/loss.py +++ b/ignite/metrics/loss.py @@ -65,6 +65,7 @@ class Loss(Metric): """ required_output_keys = ("y_pred", "y", "criterion_kwargs") + _state_dict_all_req_keys = ("_sum", "_num_examples") def __init__( self, diff --git a/ignite/metrics/mean_absolute_error.py b/ignite/metrics/mean_absolute_error.py index 6066e8cda341..eb90d3aa3c24 100644 --- a/ignite/metrics/mean_absolute_error.py +++ b/ignite/metrics/mean_absolute_error.py @@ -59,6 +59,8 @@ class MeanAbsoluteError(Metric): 2.9375 """ + _state_dict_all_req_keys = ("_sum_of_absolute_errors", "_num_examples") + @reinit__is_reduced def reset(self) -> None: self._sum_of_absolute_errors = torch.tensor(0.0, device=self._device) diff --git a/ignite/metrics/mean_pairwise_distance.py b/ignite/metrics/mean_pairwise_distance.py index 4b33dc4cbab5..79676564e5fb 100644 --- a/ignite/metrics/mean_pairwise_distance.py +++ b/ignite/metrics/mean_pairwise_distance.py @@ -59,6 +59,8 @@ class MeanPairwiseDistance(Metric): 1.5955... 
""" + _state_dict_all_req_keys = ("_sum_of_distances", "_num_examples") + def __init__( self, p: int = 2, diff --git a/ignite/metrics/mean_squared_error.py b/ignite/metrics/mean_squared_error.py index 3752b728138d..3407b4adcb70 100644 --- a/ignite/metrics/mean_squared_error.py +++ b/ignite/metrics/mean_squared_error.py @@ -59,6 +59,8 @@ class MeanSquaredError(Metric): 3.828125 """ + _state_dict_all_req_keys = ("_sum_of_squared_errors", "_num_examples") + @reinit__is_reduced def reset(self) -> None: self._sum_of_squared_errors = torch.tensor(0.0, device=self._device) diff --git a/ignite/metrics/metric.py b/ignite/metrics/metric.py index 26cb3c12560d..39e5cb745222 100644 --- a/ignite/metrics/metric.py +++ b/ignite/metrics/metric.py @@ -1,18 +1,31 @@ from abc import ABCMeta, abstractmethod +from collections import OrderedDict from collections.abc import Mapping from functools import wraps from numbers import Number -from typing import Any, Callable, cast, Dict, Optional, Sequence, Tuple, TYPE_CHECKING, Union +from typing import Any, Callable, cast, Dict, List, Optional, Sequence, Tuple, TYPE_CHECKING, Union import torch import ignite.distributed as idist + +from ignite.base.mixins import Serializable from ignite.engine import CallableEventWithFilter, Engine, Events +from ignite.utils import _CollectionItem, _tree_apply2, _tree_map if TYPE_CHECKING: from ignite.metrics.metrics_lambda import MetricsLambda -__all__ = ["Metric", "MetricUsage", "EpochWise", "BatchWise", "BatchFiltered"] +__all__ = [ + "Metric", + "MetricUsage", + "EpochWise", + "BatchWise", + "BatchFiltered", + "RunningEpochWise", + "RunningBatchWise", + "SingleEpochRunningBatchWise", +] class MetricUsage: @@ -31,6 +44,8 @@ class MetricUsage: :meth:`~ignite.metrics.metric.Metric.iteration_completed`. 
""" + usage_name: str + def __init__(self, started: Events, completed: Events, iteration_completed: CallableEventWithFilter) -> None: self.__started = started self.__completed = completed @@ -74,6 +89,33 @@ def __init__(self) -> None: ) +class RunningEpochWise(EpochWise): + """ + Running epoch-wise usage of Metrics. It's the running version of the :class:`~.metrics.metric.EpochWise` metric + usage. A metric with such a usage most likely accompanies an :class:`~.metrics.metric.EpochWise` one to compute + a running measure of it e.g. running average. + + Metric's methods are triggered on the following engine events: + + - :meth:`~ignite.metrics.metric.Metric.started` on every ``STARTED`` + (See :class:`~ignite.engine.events.Events`). + - :meth:`~ignite.metrics.metric.Metric.iteration_completed` on every ``EPOCH_COMPLETED``. + - :meth:`~ignite.metrics.metric.Metric.completed` on every ``EPOCH_COMPLETED``. + + Attributes: + usage_name: usage name string + """ + + usage_name: str = "running_epoch_wise" + + def __init__(self) -> None: + super(EpochWise, self).__init__( + started=Events.STARTED, + completed=Events.EPOCH_COMPLETED, + iteration_completed=Events.EPOCH_COMPLETED, + ) + + class BatchWise(MetricUsage): """ Batch-wise usage of Metrics. @@ -99,6 +141,59 @@ def __init__(self) -> None: ) +class RunningBatchWise(BatchWise): + """ + Running batch-wise usage of Metrics. It's the running version of the :class:`~.metrics.metric.EpochWise` metric + usage. A metric with such a usage could for example accompany a :class:`~.metrics.metric.BatchWise` one to compute + a running measure of it e.g. running average. + + Metric's methods are triggered on the following engine events: + + - :meth:`~ignite.metrics.metric.Metric.started` on every ``STARTED`` + (See :class:`~ignite.engine.events.Events`). + - :meth:`~ignite.metrics.metric.Metric.iteration_completed` on every ``ITERATION_COMPLETED``. + - :meth:`~ignite.metrics.metric.Metric.completed` on every ``ITERATION_COMPLETED``. 
+ + Attributes: + usage_name: usage name string + """ + + usage_name: str = "running_batch_wise" + + def __init__(self) -> None: + super(BatchWise, self).__init__( + started=Events.STARTED, + completed=Events.ITERATION_COMPLETED, + iteration_completed=Events.ITERATION_COMPLETED, + ) + + +class SingleEpochRunningBatchWise(BatchWise): + """ + Running batch-wise usage of Metrics in a single epoch. It's like :class:`~.metrics.metric.RunningBatchWise` metric + usage with the difference that is used during a single epoch. + + Metric's methods are triggered on the following engine events: + + - :meth:`~ignite.metrics.metric.Metric.started` on every ``EPOCH_STARTED`` + (See :class:`~ignite.engine.events.Events`). + - :meth:`~ignite.metrics.metric.Metric.iteration_completed` on every ``ITERATION_COMPLETED``. + - :meth:`~ignite.metrics.metric.Metric.completed` on every ``ITERATION_COMPLETED``. + + Attributes: + usage_name: usage name string + """ + + usage_name: str = "single_epoch_running_batch_wise" + + def __init__(self) -> None: + super(BatchWise, self).__init__( + started=Events.EPOCH_STARTED, + completed=Events.ITERATION_COMPLETED, + iteration_completed=Events.ITERATION_COMPLETED, + ) + + class BatchFiltered(MetricUsage): """ Batch filtered usage of Metrics. This usage is similar to epoch-wise but update event is filtered. @@ -125,7 +220,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: ) -class Metric(metaclass=ABCMeta): +class Metric(Serializable, metaclass=ABCMeta): """ Base class for all Metrics. @@ -219,7 +314,7 @@ def __init__( @abstractmethod def reset(self) -> None: """ - Resets the metric to it's initial state. + Resets the metric to its initial state. By default, this is called at the start of each epoch. """ @@ -240,7 +335,7 @@ def update(self, output: Any) -> None: @abstractmethod def compute(self) -> Any: """ - Computes the metric based on it's accumulated state. + Computes the metric based on its accumulated state. 
By default, this is called at the end of each epoch. @@ -273,7 +368,7 @@ def iteration_completed(self, engine: Engine) -> None: Note: ``engine.state.output`` is used to compute metric values. - The majority of implemented metrics accepts the following formats for ``engine.state.output``: + The majority of implemented metrics accept the following formats for ``engine.state.output``: ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. ``y_pred`` and ``y`` can be torch tensors or list of tensors/numbers if applicable. @@ -344,12 +439,16 @@ def completed(self, engine: Engine, name: str) -> None: def _check_usage(self, usage: Union[str, MetricUsage]) -> MetricUsage: if isinstance(usage, str): - if usage == EpochWise.usage_name: - usage = EpochWise() - elif usage == BatchWise.usage_name: - usage = BatchWise() - else: - raise ValueError(f"usage should be 'EpochWise.usage_name' or 'BatchWise.usage_name', get {usage}") + usages = [EpochWise, RunningEpochWise, BatchWise, RunningBatchWise, SingleEpochRunningBatchWise] + for usage_cls in usages: + if usage == usage_cls.usage_name: + usage = usage_cls() + break + if not isinstance(usage, MetricUsage): + raise ValueError( + "Argument usage should be '(Running)EpochWise.usage_name' or " + f"'((SingleEpoch)Running)BatchWise.usage_name', got {usage}" + ) if not isinstance(usage, MetricUsage): raise TypeError(f"Unhandled usage type {type(usage)}") return usage @@ -451,6 +550,97 @@ def is_attached(self, engine: Engine, usage: Union[str, MetricUsage] = EpochWise usage = self._check_usage(usage) return engine.has_event_handler(self.completed, usage.COMPLETED) + def _state_dict_per_rank(self) -> OrderedDict: + def func( + x: Union[torch.Tensor, Metric, None, float], **kwargs: Any + ) -> Union[torch.Tensor, float, OrderedDict, None]: + if isinstance(x, Metric): + return x._state_dict_per_rank() + if x is None or isinstance(x, (int, float, torch.Tensor)): + return x + else: + raise TypeError( + "Found attribute of unsupported type. 
Currently, supported types include" + " numeric types, tensor, Metric or sequence/mapping of metrics." + ) + + state: OrderedDict[str, Union[torch.Tensor, List, Dict, None]] = OrderedDict() + for attr_name in self._state_dict_all_req_keys: + if attr_name not in self.__dict__: + raise ValueError( + f"Found a value in _state_dict_all_req_keys that is not among metric attributes: {attr_name}" + ) + attr = getattr(self, attr_name) + state[attr_name] = _tree_map(func, attr) # type: ignore[assignment] + + return state + + __state_dict_key_per_rank: str = "__metric_state_per_rank" + + def state_dict(self) -> OrderedDict: + """Method returns state dict with attributes of the metric specified in its + `_state_dict_all_req_keys` attribute. Can be used to save internal state of the class. + """ + state = self._state_dict_per_rank() + + if idist.get_world_size() > 1: + return OrderedDict([(Metric.__state_dict_key_per_rank, idist.all_gather(state))]) + return OrderedDict([(Metric.__state_dict_key_per_rank, [state])]) + + def _load_state_dict_per_rank(self, state_dict: Mapping) -> None: + super().load_state_dict(state_dict) + + def func(x: Any, y: Any) -> None: + if isinstance(x, Metric): + x._load_state_dict_per_rank(y) + elif isinstance(x, _CollectionItem): + value = x.value() + if y is None or isinstance(y, _CollectionItem.types_as_collection_item): + x.load_value(y) + elif isinstance(value, Metric): + value._load_state_dict_per_rank(y) + else: + raise ValueError(f"Unsupported type for provided state_dict data: {type(y)}") + + for attr_name in self._state_dict_all_req_keys: + attr = getattr(self, attr_name) + attr = _CollectionItem.wrap(self.__dict__, attr_name, attr) + _tree_apply2(func, attr, state_dict[attr_name]) + + def load_state_dict(self, state_dict: Mapping) -> None: + """Method replaces internal state of the class with provided state dict data. 
+ + If there's an active distributed configuration, the process uses its rank to pick the proper value from + the list of values saved under each attribute's name in the dict. + + Args: + state_dict: a dict containing attributes of the metric specified in its `_state_dict_all_req_keys` + attribute. + """ + if not isinstance(state_dict, Mapping): + raise TypeError(f"Argument state_dict should be a dictionary, but given {type(state_dict)}") + + if not (len(state_dict) == 1 and Metric.__state_dict_key_per_rank in state_dict): + raise ValueError( + "Incorrect state_dict object. Argument state_dict should be a dictionary " + "provided by Metric.state_dict(). " + f"Expected single key: {Metric.__state_dict_key_per_rank}, but given {state_dict.keys()}" + ) + + list_state_dicts_per_rank = state_dict[Metric.__state_dict_key_per_rank] + rank = idist.get_rank() + world_size = idist.get_world_size() + if len(list_state_dicts_per_rank) != world_size: + raise ValueError( + "Incorrect state_dict object. Argument state_dict should be a dictionary " + "provided by Metric.state_dict(). 
" + f"Expected a list of state_dicts of size equal world_size: {world_size}, " + f"but got {len(list_state_dicts_per_rank)}" + ) + + state_dict = list_state_dicts_per_rank[rank] + self._load_state_dict_per_rank(state_dict) + def __add__(self, other: Any) -> "MetricsLambda": from ignite.metrics.metrics_lambda import MetricsLambda @@ -484,12 +674,12 @@ def __rmul__(self, other: Any) -> "MetricsLambda": def __pow__(self, other: Any) -> "MetricsLambda": from ignite.metrics.metrics_lambda import MetricsLambda - return MetricsLambda(lambda x, y: x ** y, self, other) + return MetricsLambda(lambda x, y: x**y, self, other) def __rpow__(self, other: Any) -> "MetricsLambda": from ignite.metrics.metrics_lambda import MetricsLambda - return MetricsLambda(lambda x, y: x ** y, other, self) + return MetricsLambda(lambda x, y: x**y, other, self) def __mod__(self, other: Any) -> "MetricsLambda": from ignite.metrics.metrics_lambda import MetricsLambda diff --git a/ignite/metrics/metrics_lambda.py b/ignite/metrics/metrics_lambda.py index 36e9f1a26eb0..6308e3871380 100644 --- a/ignite/metrics/metrics_lambda.py +++ b/ignite/metrics/metrics_lambda.py @@ -90,9 +90,11 @@ def Fbeta(r, p, beta): assert not precision.is_attached(engine) """ + _state_dict_all_req_keys = ("_updated", "args", "kwargs") + def __init__(self, f: Callable, *args: Any, **kwargs: Any) -> None: self.function = f - self.args = args + self.args = list(args) # we need args to be a list instead of a tuple for state_dict/load_state_dict feature self.kwargs = kwargs self.engine: Optional[Engine] = None self._updated = False diff --git a/ignite/metrics/multilabel_confusion_matrix.py b/ignite/metrics/multilabel_confusion_matrix.py index fef9ad5ac467..2a7b25d68c67 100644 --- a/ignite/metrics/multilabel_confusion_matrix.py +++ b/ignite/metrics/multilabel_confusion_matrix.py @@ -81,6 +81,8 @@ class MultiLabelConfusionMatrix(Metric): """ + _state_dict_all_req_keys = ("confusion_matrix", "_num_examples") + def __init__( self, 
num_classes: int, @@ -164,8 +166,8 @@ def _check_input(self, output: Sequence[torch.Tensor]) -> None: if y.dtype not in valid_types: raise ValueError(f"y must be of any type: {valid_types}") - if not torch.equal(y_pred, y_pred ** 2): + if not torch.equal(y_pred, y_pred**2): raise ValueError("y_pred must be a binary tensor") - if not torch.equal(y, y ** 2): + if not torch.equal(y, y**2): raise ValueError("y must be a binary tensor") diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py index 7c84c7915d19..ed3b14b4dc52 100644 --- a/ignite/metrics/nlp/bleu.py +++ b/ignite/metrics/nlp/bleu.py @@ -49,7 +49,6 @@ def smooth2(numerators: torch.Tensor, denominators: torch.Tensor) -> Sequence[fl @staticmethod def _smooth2(numerators: torch.Tensor, denominators: torch.Tensor) -> Sequence[float]: - return [ (n.item() + 1) / (d.item() + 1) if i != 0 else n.item() / d.item() for i, (n, d) in enumerate(zip(numerators, denominators)) @@ -148,6 +147,11 @@ def __init__( raise ValueError(f'Average must be either "macro" or "micro" (got: {average})') self.average = average + if average == "micro": + self._state_dict_all_req_keys = ("p_numerators", "p_denominators", "hyp_length_sum", "ref_length_sum") + else: + self._state_dict_all_req_keys = ("_sum_of_bleu", "_num_sentences") + super(Bleu, self).__init__(output_transform=output_transform, device=device) def _n_gram_counter( @@ -157,7 +161,6 @@ def _n_gram_counter( p_numerators: torch.Tensor, p_denominators: torch.Tensor, ) -> Tuple[int, int]: - if len(references) != len(candidates): raise ValueError( f"nb of candidates should be equal to nb of reference lists ({len(candidates)} != " @@ -187,7 +190,6 @@ def _n_gram_counter( def _brevity_penalty_smoothing( self, p_numerators: torch.Tensor, p_denominators: torch.Tensor, hyp_length_sum: int, ref_length_sum: int ) -> float: - # Returns 0 if there's no matching n-grams # We only need to check for p_numerators[1] == 0, since if there's # no unigrams, there won't be any higher 
order ngrams. @@ -216,7 +218,6 @@ def _sentence_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequen return self._corpus_bleu([references], [candidates]) def _corpus_bleu(self, references: Sequence[Sequence[Sequence[Any]]], candidates: Sequence[Sequence[Any]]) -> float: - p_numerators: torch.Tensor = torch.zeros(self.ngrams_order + 1) p_denominators: torch.Tensor = torch.zeros(self.ngrams_order + 1) @@ -234,7 +235,6 @@ def _corpus_bleu(self, references: Sequence[Sequence[Sequence[Any]]], candidates @reinit__is_reduced def reset(self) -> None: - if self.average == "macro": self._sum_of_bleu = torch.tensor(0.0, dtype=torch.double, device=self._device) self._num_sentences = 0 @@ -270,7 +270,6 @@ def _compute_macro(self) -> torch.Tensor: @sync_all_reduce("p_numerators", "p_denominators", "hyp_length_sum", "ref_length_sum") def _compute_micro(self) -> float: - bleu_score = self._brevity_penalty_smoothing( p_numerators=self.p_numerators, p_denominators=self.p_denominators, diff --git a/ignite/metrics/nlp/rouge.py b/ignite/metrics/nlp/rouge.py index 93076d4639c3..9aa87a269e61 100644 --- a/ignite/metrics/nlp/rouge.py +++ b/ignite/metrics/nlp/rouge.py @@ -119,6 +119,8 @@ class _BaseRouge(Metric): Rouge interface for Rouge-L and Rouge-N """ + _state_dict_all_req_keys = ("_recall", "_precision", "_fmeasure", "_num_examples") + def __init__( self, multiref: str = "average", @@ -378,6 +380,8 @@ class Rouge(Metric): ``update`` method has changed and now works on batch of inputs. 
""" + _state_dict_all_req_keys = ("internal_metrics",) + def __init__( self, variants: Optional[Sequence[Union[str, int]]] = None, diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index 090651720a7f..31fbd42b19b4 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -13,6 +13,8 @@ class _BasePrecisionRecall(_BaseClassification): + _state_dict_all_req_keys = ("_numerator", "_denominator", "_weight", "_updated") + def __init__( self, output_transform: Callable = lambda x: x, @@ -20,7 +22,6 @@ def __init__( is_multilabel: bool = False, device: Union[str, torch.device] = torch.device("cpu"), ): - if not (average is None or isinstance(average, bool) or average in ["macro", "micro", "weighted", "samples"]): raise ValueError( "Argument average should be None or a boolean or one of values" @@ -59,12 +60,11 @@ def _prepare_output(self, output: Sequence[torch.Tensor]) -> Sequence[torch.Tens y_pred, y = output[0].detach(), output[1].detach() if self._type == "binary" or self._type == "multiclass": - num_classes = 2 if self._type == "binary" else y_pred.size(1) if self._type == "multiclass" and y.max() + 1 > num_classes: raise ValueError( - f"y_pred contains less classes than y. Number of predicted classes is {num_classes}" - f" and element in y has invalid class = {y.max().item() + 1}." + f"y_pred contains fewer classes than y. Number of classes in the prediction is {num_classes}" + f" and an element in y has invalid class = {y.max().item() + 1}." ) y = y.view(-1) if self._type == "binary" and self._average is False: @@ -88,31 +88,32 @@ def _prepare_output(self, output: Sequence[torch.Tensor]) -> Sequence[torch.Tens @reinit__is_reduced def reset(self) -> None: - - # `numerator`, `denominator` and `weight` are three variables chosen to be abstract - # representatives of the ones that are measured for cases with different `average` parameters. - # `weight` is only used when `average='weighted'`. 
Actual value of these three variables is - # as follows. - # - # average='samples': - # numerator (torch.Tensor): sum of metric value for samples - # denominator (int): number of samples - # - # average='weighted': - # numerator (torch.Tensor): number of true positives per class/label - # denominator (torch.Tensor): number of predicted(for precision) or actual(for recall) - # positives per class/label - # weight (torch.Tensor): number of actual positives per class - # - # average='micro': - # numerator (torch.Tensor): sum of number of true positives for classes/labels - # denominator (torch.Tensor): sum of number of predicted(for precision) or actual(for recall) positives - # for classes/labels - # - # average='macro' or boolean or None: - # numerator (torch.Tensor): number of true positives per class/label - # denominator (torch.Tensor): number of predicted(for precision) or actual(for recall) - # positives per class/label + """ + `numerator`, `denominator` and `weight` are three variables chosen to be abstract + representatives of the ones that are measured for cases with different `average` parameters. + `weight` is only used when `average='weighted'`. Actual value of these three variables is + as follows. + + average='samples': + numerator (torch.Tensor): sum of metric value for samples + denominator (int): number of samples + + average='weighted': + numerator (torch.Tensor): number of true positives per class/label + denominator (torch.Tensor): number of predicted(for precision) or actual(for recall) positives per + class/label. + weight (torch.Tensor): number of actual positives per class + + average='micro': + numerator (torch.Tensor): sum of number of true positives for classes/labels + denominator (torch.Tensor): sum of number of predicted(for precision) or actual(for recall) positives for + classes/labels. 
+ + average='macro' or boolean or None: + numerator (torch.Tensor): number of true positives per class/label + denominator (torch.Tensor): number of predicted(for precision) or actual(for recall) positives per + class/label. + """ self._numerator: Union[int, torch.Tensor] = 0 self._denominator: Union[int, torch.Tensor] = 0 @@ -123,17 +124,20 @@ def reset(self) -> None: @sync_all_reduce("_numerator", "_denominator") def compute(self) -> Union[torch.Tensor, float]: + r""" + Return value of the metric for `average` options `'weighted'` and `'macro'` is computed as follows. + + .. math:: + \text{Precision/Recall} = \frac{ numerator }{ denominator } \cdot weight - # Return value of the metric for `average` options `'weighted'` and `'macro'` is computed as follows. - # - # .. math:: \text{Precision/Recall} = \frac{ numerator }{ denominator } \cdot weight - # - # wherein `weight` is the internal variable `weight` for `'weighted'` option and :math:`1/C` - # for the `macro` one. :math:`C` is the number of classes/labels. - # - # Return value of the metric for `average` options `'micro'`, `'samples'`, `False` and None is as follows. - # - # .. math:: \text{Precision/Recall} = \frac{ numerator }{ denominator } + wherein `weight` is the internal variable `_weight` for `'weighted'` option and :math:`1/C` + for the `macro` one. :math:`C` is the number of classes/labels. + + Return value of the metric for `average` options `'micro'`, `'samples'`, `False` and None is as follows. + + .. math:: + \text{Precision/Recall} = \frac{ numerator }{ denominator } + """ if not self._updated: raise NotComputableError( @@ -371,22 +375,46 @@ def thresholded_output_transform(output): @reinit__is_reduced def update(self, output: Sequence[torch.Tensor]) -> None: + r""" + Update the metric state using prediction and target. + + Args: + output: a binary tuple of tensors (y_pred, y) whose shapes follow the table below. 
N stands for the batch + dimension, `...` for possible additional dimensions and C for class dimension. + + .. list-table:: + :widths: 20 10 10 10 + :header-rows: 1 + + * - Output member\\Data type + - Binary + - Multiclass + - Multilabel + * - y_pred + - (N, ...) + - (N, C, ...) + - (N, C, ...) + * - y + - (N, ...) + - (N, ...) + - (N, C, ...) + + For binary and multilabel data, both y and y_pred should consist of 0's and 1's, but for multiclass + data, y_pred and y should consist of probabilities and integers respectively. + """ self._check_shape(output) self._check_type(output) y_pred, y, correct = self._prepare_output(output) if self._average == "samples": - all_positives = y_pred.sum(dim=1) true_positives = correct.sum(dim=1) self._numerator += torch.sum(true_positives / (all_positives + self.eps)) self._denominator += y.size(0) elif self._average == "micro": - self._denominator += y_pred.sum() self._numerator += correct.sum() else: # _average in [False, None, 'macro', 'weighted'] - self._denominator += y_pred.sum(dim=0) self._numerator += correct.sum(dim=0) diff --git a/ignite/metrics/psnr.py b/ignite/metrics/psnr.py index f4dc59669afc..4251a24f8f13 100644 --- a/ignite/metrics/psnr.py +++ b/ignite/metrics/psnr.py @@ -81,6 +81,8 @@ def get_y_channel(output): .. 
versionadded:: 0.4.3 """ + _state_dict_all_req_keys = ("_sum_of_batchwise_psnr", "_num_examples") + def __init__( self, data_range: Union[int, float], @@ -114,7 +116,7 @@ def update(self, output: Sequence[torch.Tensor]) -> None: dim = tuple(range(1, y.ndim)) mse_error = torch.pow(y_pred.double() - y.view_as(y_pred).double(), 2).mean(dim=dim) - self._sum_of_batchwise_psnr += torch.sum(10.0 * torch.log10(self.data_range ** 2 / (mse_error + 1e-10))).to( + self._sum_of_batchwise_psnr += torch.sum(10.0 * torch.log10(self.data_range**2 / (mse_error + 1e-10))).to( device=self._device ) self._num_examples += y.shape[0] diff --git a/ignite/metrics/recall.py b/ignite/metrics/recall.py index 77afdb8a1d1b..b570951e291f 100644 --- a/ignite/metrics/recall.py +++ b/ignite/metrics/recall.py @@ -221,17 +221,14 @@ def update(self, output: Sequence[torch.Tensor]) -> None: _, y, correct = self._prepare_output(output) if self._average == "samples": - actual_positives = y.sum(dim=1) true_positives = correct.sum(dim=1) self._numerator += torch.sum(true_positives / (actual_positives + self.eps)) self._denominator += y.size(0) elif self._average == "micro": - self._denominator += y.sum() self._numerator += correct.sum() else: # _average in [False, 'macro', 'weighted'] - self._denominator += y.sum(dim=0) self._numerator += correct.sum(dim=0) diff --git a/ignite/metrics/running_average.py b/ignite/metrics/running_average.py index 468838a9908c..9b3b4efb4f3f 100644 --- a/ignite/metrics/running_average.py +++ b/ignite/metrics/running_average.py @@ -1,10 +1,11 @@ -from typing import Callable, cast, Optional, Sequence, Union +import warnings +from typing import Any, Callable, cast, Optional, Union import torch import ignite.distributed as idist from ignite.engine import Engine, Events -from ignite.metrics.metric import EpochWise, Metric, MetricUsage, reinit__is_reduced, sync_all_reduce +from ignite.metrics.metric import Metric, MetricUsage, reinit__is_reduced, RunningBatchWise, 
SingleEpochRunningBatchWise __all__ = ["RunningAverage"] @@ -18,8 +19,10 @@ class RunningAverage(Metric): alpha: running average decay factor, default 0.98 output_transform: a function to use to transform the output if `src` is None and corresponds the output of process function. Otherwise it should be None. - epoch_bound: whether the running average should be reset after each epoch (defaults - to True). + epoch_bound: whether the running average should be reset after each epoch. It is deprecated in favor of + ``usage`` argument in :meth:`attach` method. Setting ``epoch_bound`` to ``True`` is equivalent to + ``usage=SingleEpochRunningBatchWise()`` and setting it to ``False`` is equivalent to + ``usage=RunningBatchWise()`` in the :meth:`attach` method. Default None. device: specifies which device updates are accumulated on. Should be None when ``src`` is an instance of :class:`~ignite.metrics.metric.Metric`, as the running average will use the ``src``'s device. Otherwise, defaults to CPU.
Only applicable when the computed value @@ -84,13 +87,14 @@ def log_running_avg_metrics(): """ required_output_keys = None + _state_dict_all_req_keys = ("_value", "src") def __init__( self, src: Optional[Metric] = None, alpha: float = 0.98, output_transform: Optional[Callable] = None, - epoch_bound: bool = True, + epoch_bound: Optional[bool] = None, device: Optional[Union[str, torch.device]] = None, ): if not (isinstance(src, Metric) or src is None): @@ -101,11 +105,13 @@ def __init__( if isinstance(src, Metric): if output_transform is not None: raise ValueError("Argument output_transform should be None if src is a Metric.") + + def output_transform(x: Any) -> Any: + return x + if device is not None: raise ValueError("Argument device should be None if src is a Metric.") - self.src = src - self._get_src_value = self._get_metric_value - setattr(self, "iteration_completed", self._metric_iteration_completed) + self.src: Union[Metric, None] = src device = src._device else: if output_transform is None: @@ -113,56 +119,105 @@ def __init__( "Argument output_transform should not be None if src corresponds " "to the output of process function." ) - self._get_src_value = self._get_output_value - setattr(self, "update", self._output_update) + self.src = None if device is None: device = torch.device("cpu") - self.alpha = alpha + if epoch_bound is not None: + warnings.warn( + "`epoch_bound` is deprecated and will be removed in the future. Consider using `usage` argument of" + "`attach` method instead. `epoch_bound=True` is equivalent with `usage=SingleEpochRunningBatchWise()`" + " and `epoch_bound=False` is equivalent with `usage=RunningBatchWise()`." 
+ ) self.epoch_bound = epoch_bound - super(RunningAverage, self).__init__(output_transform=output_transform, device=device) # type: ignore[arg-type] + self.alpha = alpha + super(RunningAverage, self).__init__(output_transform=output_transform, device=device) @reinit__is_reduced def reset(self) -> None: self._value: Optional[Union[float, torch.Tensor]] = None + if isinstance(self.src, Metric): + self.src.reset() @reinit__is_reduced - def update(self, output: Sequence) -> None: - # Implement abstract method - pass + def update(self, output: Union[torch.Tensor, float]) -> None: + if self.src is None: + output = output.detach().to(self._device, copy=True) if isinstance(output, torch.Tensor) else output + value = idist.all_reduce(output) / idist.get_world_size() + else: + value = self.src.compute() + self.src.reset() - def compute(self) -> Union[torch.Tensor, float]: if self._value is None: - self._value = self._get_src_value() + self._value = value else: - self._value = self._value * self.alpha + (1.0 - self.alpha) * self._get_src_value() - - return self._value + self._value = self._value * self.alpha + (1.0 - self.alpha) * value - def attach(self, engine: Engine, name: str, _usage: Union[str, MetricUsage] = EpochWise()) -> None: - if self.epoch_bound: - # restart average every epoch - engine.add_event_handler(Events.EPOCH_STARTED, self.started) - # compute metric - engine.add_event_handler(Events.ITERATION_COMPLETED, self.iteration_completed) - # apply running average - engine.add_event_handler(Events.ITERATION_COMPLETED, self.completed, name) - - def _get_metric_value(self) -> Union[torch.Tensor, float]: - return self.src.compute() - - @sync_all_reduce("src") - def _get_output_value(self) -> Union[torch.Tensor, float]: - # we need to compute average instead of sum produced by @sync_all_reduce("src") - output = cast(Union[torch.Tensor, float], self.src) / idist.get_world_size() - return output - - def _metric_iteration_completed(self, engine: Engine) -> None: - 
self.src.started(engine) - self.src.iteration_completed(engine) - - @reinit__is_reduced - def _output_update(self, output: Union[torch.Tensor, float]) -> None: - if isinstance(output, torch.Tensor): - output = output.detach().to(self._device, copy=True) - self.src = output # type: ignore[assignment] + def compute(self) -> Union[torch.Tensor, float]: + return cast(Union[torch.Tensor, float], self._value) + + def attach(self, engine: Engine, name: str, usage: Union[str, MetricUsage] = RunningBatchWise()) -> None: + r""" + Attach the metric to the ``engine`` using the events determined by the ``usage``. + + Args: + engine: the engine to get attached to. + name: by which, the metric is inserted into ``engine.state.metrics`` dictionary. + usage: the usage determining on which events the metric is reset, updated and computed. It should be an + instance of the :class:`~ignite.metrics.metric.MetricUsage`\ s in the following table. + + ======================================================= =========================================== + ``usage`` **class** **Description** + ======================================================= =========================================== + :class:`~.metrics.metric.RunningBatchWise` Running average of the ``src`` metric or + ``engine.state.output`` is computed across + batches. In the former case, on each batch, + ``src`` is reset, updated and computed then + its value is retrieved. Default. + :class:`~.metrics.metric.SingleEpochRunningBatchWise` Same as above but the running average is + computed across batches in an epoch so it + is reset at the end of the epoch. + :class:`~.metrics.metric.RunningEpochWise` Running average of the ``src`` metric or + ``engine.state.output`` is computed across + epochs. In the former case, ``src`` works + as if it was attached in a + :class:`~ignite.metrics.metric.EpochWise` + manner and its computed value is retrieved + at the end of the epoch. 
The latter case + doesn't make much sense for this usage as + the ``engine.state.output`` of the last + batch is retrieved then. + ======================================================= =========================================== + + ``RunningAverage`` retrieves ``engine.state.output`` at ``usage.ITERATION_COMPLETED`` if the ``src`` is not + given and it's computed and updated using ``src``, by manually calling its ``compute`` method, or + ``engine.state.output`` at ``usage.COMPLETED`` event. + Also if ``src`` is given, it is updated at ``usage.ITERATION_COMPLETED``, but its reset event is determined by + ``usage`` type. If ``isinstance(usage, BatchWise)`` holds true, ``src`` is reset on ``BatchWise().STARTED``, + otherwise on ``EpochWise().STARTED`` if ``isinstance(usage, EpochWise)``. + + .. versionchanged:: 0.5.1 + Added `usage` argument + """ + usage = self._check_usage(usage) + if self.epoch_bound is not None: + usage = SingleEpochRunningBatchWise() if self.epoch_bound else RunningBatchWise() + + if isinstance(self.src, Metric) and not engine.has_event_handler( + self.src.iteration_completed, Events.ITERATION_COMPLETED + ): + engine.add_event_handler(Events.ITERATION_COMPLETED, self.src.iteration_completed) + + super().attach(engine, name, usage) + + def detach(self, engine: Engine, usage: Union[str, MetricUsage] = RunningBatchWise()) -> None: + usage = self._check_usage(usage) + if self.epoch_bound is not None: + usage = SingleEpochRunningBatchWise() if self.epoch_bound else RunningBatchWise() + + if isinstance(self.src, Metric) and engine.has_event_handler( + self.src.iteration_completed, Events.ITERATION_COMPLETED + ): + engine.remove_event_handler(self.src.iteration_completed, Events.ITERATION_COMPLETED) + + super().detach(engine, usage) diff --git a/ignite/metrics/ssim.py b/ignite/metrics/ssim.py index 805024c0fd23..6824c0b3f374 100644 --- a/ignite/metrics/ssim.py +++ b/ignite/metrics/ssim.py @@ -1,4 +1,5 @@ -from typing import Callable, Sequence, Union 
+import warnings +from typing import Callable, Optional, Sequence, Union import torch import torch.nn.functional as F @@ -11,9 +12,12 @@ class SSIM(Metric): """ - Computes Structual Similarity Index Measure + Computes Structural Similarity Index Measure - - ``update`` must receive output of the form ``(y_pred, y)``. + - ``update`` must receive output of the form ``(y_pred, y)``. They have to be of the same type. + Valid :class:`torch.dtype` are the following: + - on CPU: `torch.float32`, `torch.float64`. + - on CUDA: `torch.float16`, `torch.bfloat16`, `torch.float32`, `torch.float64`. Args: data_range: Range of the image. Typically, ``1.0`` or ``255``. @@ -60,6 +64,8 @@ class SSIM(Metric): .. versionadded:: 0.4.2 """ + _state_dict_all_req_keys = ("_sum_of_ssim", "_num_examples", "_kernel") + def __init__( self, data_range: Union[int, float], @@ -93,27 +99,27 @@ def __init__( super(SSIM, self).__init__(output_transform=output_transform, device=device) self.gaussian = gaussian + self.data_range = data_range self.c1 = (k1 * data_range) ** 2 self.c2 = (k2 * data_range) ** 2 self.pad_h = (self.kernel_size[0] - 1) // 2 self.pad_w = (self.kernel_size[1] - 1) // 2 - self._kernel = self._gaussian_or_uniform_kernel(kernel_size=self.kernel_size, sigma=self.sigma) + self._kernel_2d = self._gaussian_or_uniform_kernel(kernel_size=self.kernel_size, sigma=self.sigma) + self._kernel: Optional[torch.Tensor] = None @reinit__is_reduced def reset(self) -> None: self._sum_of_ssim = torch.tensor(0.0, dtype=torch.float64, device=self._device) self._num_examples = 0 - self._kernel = self._gaussian_or_uniform_kernel(kernel_size=self.kernel_size, sigma=self.sigma) def _uniform(self, kernel_size: int) -> torch.Tensor: - max, min = 2.5, -2.5 - ksize_half = (kernel_size - 1) * 0.5 - kernel = torch.linspace(-ksize_half, ksize_half, steps=kernel_size, device=self._device) - for i, j in enumerate(kernel): - if min <= j <= max: - kernel[i] = 1 / (max - min) - else: - kernel[i] = 0 + kernel = 
torch.zeros(kernel_size) + + start_uniform_index = max(kernel_size // 2 - 2, 0) + end_uniform_index = min(kernel_size // 2 + 3, kernel_size) + + min_, max_ = -2.5, 2.5 + kernel[start_uniform_index:end_uniform_index] = 1 / (max_ - min_) return kernel.unsqueeze(dim=0) # (1, kernel_size) @@ -152,15 +158,37 @@ def update(self, output: Sequence[torch.Tensor]) -> None: f"Expected y_pred and y to have BxCxHxW shape. Got y_pred: {y_pred.shape} and y: {y.shape}." ) - channel = y_pred.size(1) - if len(self._kernel.shape) < 4: - self._kernel = self._kernel.expand(channel, 1, -1, -1).to(device=y_pred.device) + # converts potential integer tensor to fp + if not y.is_floating_point(): + y = y.float() + if not y_pred.is_floating_point(): + y_pred = y_pred.float() + + nb_channel = y_pred.size(1) + if self._kernel is None or self._kernel.shape[0] != nb_channel: + self._kernel = self._kernel_2d.expand(nb_channel, 1, -1, -1) + + if y_pred.device != self._kernel.device: + if self._kernel.device == torch.device("cpu"): + self._kernel = self._kernel.to(device=y_pred.device) + + elif y_pred.device == torch.device("cpu"): + warnings.warn( + "y_pred tensor is on cpu device but previous computation was on another device: " + f"{self._kernel.device}. 
To avoid having a performance hit, please ensure that all " + "y and y_pred tensors are on the same device.", + ) + y_pred = y_pred.to(device=self._kernel.device) + y = y.to(device=self._kernel.device) y_pred = F.pad(y_pred, [self.pad_w, self.pad_w, self.pad_h, self.pad_h], mode="reflect") y = F.pad(y, [self.pad_w, self.pad_w, self.pad_h, self.pad_h], mode="reflect") + if y_pred.dtype != self._kernel.dtype: + self._kernel = self._kernel.to(dtype=y_pred.dtype) + input_list = [y_pred, y, y_pred * y_pred, y * y, y_pred * y] - outputs = F.conv2d(torch.cat(input_list), self._kernel, groups=channel) + outputs = F.conv2d(torch.cat(input_list), self._kernel, groups=nb_channel) batch_size = y_pred.size(0) output_list = [outputs[x * batch_size : (x + 1) * batch_size] for x in range(len(input_list))] @@ -178,7 +206,7 @@ def update(self, output: Sequence[torch.Tensor]) -> None: b2 = sigma_pred_sq + sigma_target_sq + self.c2 ssim_idx = (a1 * a2) / (b1 * b2) - self._sum_of_ssim += torch.mean(ssim_idx, (1, 2, 3), dtype=torch.float64).sum().to(self._device) + self._sum_of_ssim += torch.mean(ssim_idx, (1, 2, 3), dtype=torch.float64).sum().to(device=self._device) self._num_examples += y.shape[0] diff --git a/ignite/metrics/top_k_categorical_accuracy.py b/ignite/metrics/top_k_categorical_accuracy.py index 8f4515201976..87da4c868731 100644 --- a/ignite/metrics/top_k_categorical_accuracy.py +++ b/ignite/metrics/top_k_categorical_accuracy.py @@ -73,6 +73,8 @@ def one_hot_to_binary_output_transform(output): 0.75 """ + _state_dict_all_req_keys = ("_num_correct", "_num_examples") + def __init__( self, k: int = 5, diff --git a/ignite/utils.py b/ignite/utils.py index fffe2c1b3a9a..078e16663f68 100644 --- a/ignite/utils.py +++ b/ignite/utils.py @@ -6,7 +6,7 @@ import shutil import warnings from pathlib import Path -from typing import Any, Callable, cast, Dict, Optional, TextIO, Tuple, Type, TypeVar, Union +from typing import Any, Callable, cast, Dict, List, Optional, TextIO, Tuple, Type, 
TypeVar, Union import torch @@ -78,6 +78,66 @@ def apply_to_type( raise TypeError((f"x must contain {input_type}, dicts or lists; found {type(x)}")) +def _tree_map( + func: Callable, x: Union[Any, collections.Sequence, collections.Mapping], key: Optional[Union[int, str]] = None +) -> Union[Any, collections.Sequence, collections.Mapping]: + if isinstance(x, collections.Mapping): + return cast(Callable, type(x))({k: _tree_map(func, sample, key=k) for k, sample in x.items()}) + if isinstance(x, tuple) and hasattr(x, "_fields"): # namedtuple + return cast(Callable, type(x))(*(_tree_map(func, sample) for sample in x)) + if isinstance(x, collections.Sequence): + return cast(Callable, type(x))([_tree_map(func, sample, key=i) for i, sample in enumerate(x)]) + return func(x, key=key) + + +class _CollectionItem: + types_as_collection_item: Tuple = (int, float, torch.Tensor) + + def __init__(self, collection: Union[Dict, List], key: Union[int, str]) -> None: + if not isinstance(collection, (dict, list)): + raise TypeError( + f"Input type is expected to be a mapping or list, but got {type(collection)} " f"for input key '{key}'." 
+ ) + if isinstance(collection, list) and isinstance(key, str): + raise ValueError("Key should be int for collection of type list") + + self.collection = collection + self.key = key + + def load_value(self, value: Any) -> None: + self.collection[self.key] = value # type: ignore[index] + + def value(self) -> Any: + return self.collection[self.key] # type: ignore[index] + + @staticmethod + def wrap(object: Union[Dict, List], key: Union[int, str], value: Any) -> Union[Any, "_CollectionItem"]: + return ( + _CollectionItem(object, key) + if value is None or isinstance(value, _CollectionItem.types_as_collection_item) + else value + ) + + +def _tree_apply2( + func: Callable, + x: Union[Any, List, Dict], + y: Union[Any, collections.Sequence, collections.Mapping], +) -> None: + if isinstance(x, dict) and isinstance(y, collections.Mapping): + for k, v in x.items(): + if k not in y: + raise ValueError(f"Key '{k}' from x is not found in y: {y.keys()}") + _tree_apply2(func, _CollectionItem.wrap(x, k, v), y[k]) + elif isinstance(x, list) and isinstance(y, collections.Sequence): + if len(x) != len(y): + raise ValueError(f"Size of y: {len(y)} does not match the size of x: '{len(x)}'") + for i, (v1, v2) in enumerate(zip(x, y)): + _tree_apply2(func, _CollectionItem.wrap(x, i, v1), v2) + else: + return func(x, y) + + def to_onehot(indices: torch.Tensor, num_classes: int) -> torch.Tensor: """Convert a tensor of indices of any shape `(N, ...)` to a tensor of one-hot indicators of shape `(N, num_classes, ...)` and of type uint8. 
Output's device is equal to the @@ -182,13 +242,11 @@ def setup_logger( # Remove previous handlers if distributed_rank > 0 or reset: - if logger.hasHandlers(): for h in list(logger.handlers): logger.removeHandler(h) if distributed_rank > 0: - # Add null handler to avoid multiple parallel messages logger.addHandler(logging.NullHandler()) @@ -254,7 +312,6 @@ def manual_seed(seed: int) -> None: def deprecated( deprecated_in: str, removed_in: str = "", reasons: Tuple[str, ...] = (), raise_exception: bool = False ) -> Callable: - F = TypeVar("F", bound=Callable[..., Any]) def decorator(func: F) -> F: diff --git a/mypy.ini b/mypy.ini index 489b3a3fd28c..bf91c5787738 100644 --- a/mypy.ini +++ b/mypy.ini @@ -77,3 +77,6 @@ ignore_missing_imports = True [mypy-torchvision.*] ignore_missing_imports = True + +[mypy-ignite.contrib.handlers.custom_events] +ignore_errors = True diff --git a/requirements-dev.txt b/requirements-dev.txt index 182a4057bc17..cc5db446522b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,12 +12,8 @@ tqdm scikit-learn matplotlib tensorboardX -visdom==0.2.3 -# temporary fix for -# ImportError: cannot import name 'soft_unicode' from 'markupsafe' -markupsafe==2.0.1 +visdom polyaxon -polyaxon-client wandb mlflow neptune-client>=0.16.17 @@ -31,4 +27,4 @@ nltk # Examples dependencies pandas gymnasium -mkl +mkl;platform_machine=="x86_64" diff --git a/setup.cfg b/setup.cfg index 73947364b717..9a6e4158fa7b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -license_file = LICENSE +license_files = LICENSE [pycodestyle] exclude = .eggs,*.egg,build,docs/*,.git,versioneer.py,*/conf.py diff --git a/setup.py b/setup.py index 15fdb830a875..29b02c6fac6a 100644 --- a/setup.py +++ b/setup.py @@ -30,8 +30,8 @@ def find_version(*file_paths): # Metadata name="pytorch-ignite", version=VERSION, - author="PyTorch Core Team", - author_email="soumith@pytorch.org", + author="PyTorch-Ignite Team", + author_email="contact@pytorch-ignite.ai", 
url="https://github.com/pytorch/ignite", description="A lightweight library to help with training neural networks in PyTorch.", long_description_content_type="text/markdown", diff --git a/tests/ignite/base/test_mixins.py b/tests/ignite/base/test_mixins.py index a929e4c7eb89..0f3a39811fbb 100644 --- a/tests/ignite/base/test_mixins.py +++ b/tests/ignite/base/test_mixins.py @@ -10,6 +10,5 @@ def test_state_dict(): def test_load_state_dict(): - s = Serializable() s.load_state_dict({}) diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py index 978dfade2fbf..caf92e6e7ad2 100644 --- a/tests/ignite/conftest.py +++ b/tests/ignite/conftest.py @@ -83,7 +83,6 @@ def local_rank(worker_id): @pytest.fixture(scope="module") def world_size(): - remove_env_var = False if "WORLD_SIZE" not in os.environ: @@ -98,14 +97,12 @@ def world_size(): @pytest.fixture() def clean_env(): - for k in ["RANK", "LOCAL_RANK", "WORLD_SIZE"]: if k in os.environ: del os.environ[k] def _create_dist_context(dist_info, lrank): - dist.init_process_group(**dist_info) dist.barrier() if torch.cuda.is_available(): @@ -115,7 +112,6 @@ def _create_dist_context(dist_info, lrank): def _destroy_dist_context(): - if dist.get_rank() == 0: # To support Python 3.7; Otherwise we could do `.unlink(missing_ok=True)` try: @@ -145,7 +141,6 @@ def _find_free_port(): def _setup_free_port(local_rank): - port_file = "/tmp/free_port" if local_rank == 0: @@ -169,7 +164,6 @@ def _setup_free_port(local_rank): @pytest.fixture() def distributed_context_single_node_nccl(local_rank, world_size): - free_port = _setup_free_port(local_rank) dist_info = { @@ -184,7 +178,6 @@ def distributed_context_single_node_nccl(local_rank, world_size): @pytest.fixture() def distributed_context_single_node_gloo(local_rank, world_size): - from datetime import timedelta if sys.platform.startswith("win"): @@ -212,7 +205,6 @@ def distributed_context_single_node_gloo(local_rank, world_size): @pytest.fixture() def multi_node_conf(local_rank): - 
assert "node_id" in os.environ assert "nnodes" in os.environ assert "nproc_per_node" in os.environ @@ -229,7 +221,6 @@ def multi_node_conf(local_rank): def _create_mnodes_dist_context(dist_info, mnodes_conf): - dist.init_process_group(**dist_info) dist.barrier() if torch.cuda.is_available(): @@ -249,7 +240,6 @@ def _destroy_mnodes_dist_context(): @pytest.fixture() def distributed_context_multi_node_gloo(multi_node_conf): - assert "MASTER_ADDR" in os.environ assert "MASTER_PORT" in os.environ @@ -265,7 +255,6 @@ def distributed_context_multi_node_gloo(multi_node_conf): @pytest.fixture() def distributed_context_multi_node_nccl(multi_node_conf): - assert "MASTER_ADDR" in os.environ assert "MASTER_PORT" in os.environ @@ -289,7 +278,6 @@ def _xla_template_worker_task(index, fn, args): def _xla_execute(fn, args, nprocs): - import torch_xla.distributed.xla_multiprocessing as xmp spawn_kwargs = {} @@ -409,6 +397,7 @@ def gloo_hvd_executor(): ], ), ], + scope="class", ) def distributed(request, local_rank, world_size): if request.param in ("nccl", "gloo_cpu", "gloo"): diff --git a/tests/ignite/contrib/conftest.py b/tests/ignite/contrib/conftest.py index 0f63292239e5..9c9b15d8699e 100644 --- a/tests/ignite/contrib/conftest.py +++ b/tests/ignite/contrib/conftest.py @@ -25,7 +25,6 @@ def no_site_packages(request): @pytest.fixture() def visdom_offline_logfile(dirname): - log_file = dirname / "logs.visdom" yield log_file @@ -41,7 +40,6 @@ def visdom_server(): global vd_hostname, vd_port, vd_server_process if vd_server_process is None: - import subprocess import time @@ -73,7 +71,6 @@ def visdom_server(): @pytest.fixture() def visdom_server_stop(): - yield None import time diff --git a/tests/ignite/contrib/engines/test_common.py b/tests/ignite/contrib/engines/test_common.py index 4749d5db1086..d25cb33caceb 100644 --- a/tests/ignite/contrib/engines/test_common.py +++ b/tests/ignite/contrib/engines/test_common.py @@ -164,6 +164,9 @@ def 
test_asserts_setup_common_training_handlers(): ) trainer.run([1]) + with pytest.warns(UserWarning, match=r"Argument device is unused and deprecated"): + setup_common_training_handlers(trainer, device="cpu") + def test_no_warning_with_train_sampler(recwarn): from torch.utils.data import RandomSampler diff --git a/tests/ignite/contrib/handlers/test_base_logger.py b/tests/ignite/contrib/handlers/test_base_logger.py index 8ec6b832f26b..1e63e490da2e 100644 --- a/tests/ignite/contrib/handlers/test_base_logger.py +++ b/tests/ignite/contrib/handlers/test_base_logger.py @@ -1,9 +1,11 @@ +import math from typing import Any, Union from unittest.mock import call, MagicMock import pytest import torch +from ignite.contrib.handlers import CustomPeriodicEvent from ignite.contrib.handlers.base_logger import ( BaseLogger, BaseOptimizerParamsHandler, @@ -49,7 +51,6 @@ def __call__(self, engine: Engine, logger: Any, event_name: Union[str, Events]) def test_base_output_handler_wrong_setup(): - with pytest.raises(TypeError, match="metric_names should be either a list or equal 'all'"): DummyOutputHandler("tag", metric_names="abc", output_transform=None) @@ -67,7 +68,6 @@ def test_base_output_handler_wrong_setup(): def test_base_output_handler_setup_output_metrics(): - engine = Engine(lambda engine, batch: None) true_metrics = {"a": 0, "b": 1} engine.state = State(metrics=true_metrics) @@ -183,7 +183,6 @@ def test_opt_params_handler_on_non_torch_optimizers(): ], ) def test_attach(event, n_calls, kwargs): - n_epochs = 5 data = list(range(50)) @@ -218,7 +217,6 @@ def update_fn(engine, batch): def test_attach_wrong_event_name(): - trainer = Engine(lambda b, e: None) logger = DummyLogger() mock_log_handler = MagicMock() @@ -238,7 +236,6 @@ def test_attach_on_custom_event(): data = list(range(150)) def _test(event, n_calls, cpe): - losses = torch.rand(n_epochs * len(data)) losses_iter = iter(losses) @@ -259,6 +256,33 @@ def update_fn(engine, batch): mock_log_handler.assert_called_with(trainer, 
logger, event) assert mock_log_handler.call_count == n_calls + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + n_iterations = 10 + cpe1 = CustomPeriodicEvent(n_iterations=n_iterations) + n = len(data) * n_epochs / n_iterations + nf = math.floor(n) + ns = nf + 1 if nf < n else nf + _test(cpe1.Events.ITERATIONS_10_STARTED, ns, cpe1) + _test(cpe1.Events.ITERATIONS_10_COMPLETED, nf, cpe1) + + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + n_iterations = 15 + cpe2 = CustomPeriodicEvent(n_iterations=n_iterations) + n = len(data) * n_epochs / n_iterations + nf = math.floor(n) + ns = nf + 1 if nf < n else nf + _test(cpe2.Events.ITERATIONS_15_STARTED, ns, cpe2) + _test(cpe2.Events.ITERATIONS_15_COMPLETED, nf, cpe2) + + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + n_custom_epochs = 2 + cpe3 = CustomPeriodicEvent(n_epochs=n_custom_epochs) + n = n_epochs / n_custom_epochs + nf = math.floor(n) + ns = nf + 1 if nf < n else nf + _test(cpe3.Events.EPOCHS_2_STARTED, ns, cpe3) + _test(cpe3.Events.EPOCHS_2_COMPLETED, nf, cpe3) + @pytest.mark.parametrize( "event, n_calls", @@ -273,7 +297,6 @@ def update_fn(engine, batch): ], ) def test_as_context_manager(event, n_calls): - n_epochs = 5 data = list(range(50)) @@ -313,13 +336,11 @@ def update_fn(engine, batch): def test_base_weights_handler_wrong_setup(): - with pytest.raises(TypeError, match="Argument model should be of type torch.nn.Module"): DummyWeightsHandler(None) def test_base_weights_scalar_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) with pytest.raises(TypeError, match="Argument reduction should be callable"): DummyWeightsScalarHandler(model, reduction=123) diff --git a/tests/ignite/contrib/handlers/test_clearml_logger.py b/tests/ignite/contrib/handlers/test_clearml_logger.py index 2e4968f40fa5..9f29d2ba8ebb 100644 --- a/tests/ignite/contrib/handlers/test_clearml_logger.py +++ 
b/tests/ignite/contrib/handlers/test_clearml_logger.py @@ -44,7 +44,6 @@ def test_no_clearml(): def test_optimizer_params_handler_wrong_setup(): - with pytest.raises(TypeError): OptimizerParamsHandler(optimizer=None) @@ -58,7 +57,6 @@ def test_optimizer_params_handler_wrong_setup(): def test_optimizer_params(): - optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01) wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr") mock_logger = MagicMock(spec=ClearMLLogger) @@ -81,7 +79,6 @@ def test_optimizer_params(): def test_output_handler_with_wrong_logger_type(): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock() @@ -91,7 +88,6 @@ def test_output_handler_with_wrong_logger_type(): def test_output_handler_output_transform(dirname): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock(spec=ClearMLLogger) mock_logger.clearml_logger = MagicMock() @@ -118,7 +114,6 @@ def test_output_handler_output_transform(dirname): def test_output_handler_metric_names(dirname): - wrapper = OutputHandler("tag", metric_names=["a", "b"]) mock_logger = MagicMock(spec=ClearMLLogger) mock_logger.clearml_logger = MagicMock() @@ -216,7 +211,6 @@ def test_output_handler_metric_names(dirname): def test_output_handler_both(dirname): - wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x}) mock_logger = MagicMock(spec=ClearMLLogger) mock_logger.clearml_logger = MagicMock() @@ -257,7 +251,6 @@ def global_step_transform(*args, **kwargs): def test_output_handler_with_global_step_from_engine(): - mock_another_engine = MagicMock() mock_another_engine.state = State() mock_another_engine.state.epoch = 10 @@ -340,7 +333,6 @@ def global_step_transform(*args, **kwargs): def test_weights_scalar_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) wrapper = WeightsScalarHandler(model) mock_logger = MagicMock() @@ -350,7 +342,6 @@ def 
test_weights_scalar_handler_wrong_setup(): def test_weights_scalar_handler(dummy_model_factory): - model = dummy_model_factory(with_grads=True, with_frozen_layer=False) # define test wrapper to test with and without optional tag @@ -429,7 +420,6 @@ def weight_selector(n, _): def test_weights_hist_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) wrapper = WeightsHistHandler(model) mock_logger = MagicMock() @@ -439,7 +429,6 @@ def test_weights_hist_handler_wrong_setup(): def test_weights_hist_handler(dummy_model_factory): - model = dummy_model_factory(with_grads=True, with_frozen_layer=False) # define test wrapper to test with and without optional tag @@ -518,7 +507,6 @@ def weight_selector(n, _): def test_grads_scalar_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) wrapper = GradsScalarHandler(model) mock_logger = MagicMock() @@ -612,7 +600,6 @@ def weight_selector(n, _): def test_grads_hist_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) wrapper = GradsHistHandler(model) mock_logger = MagicMock() @@ -700,7 +687,6 @@ def weight_selector(n, _): def test_integration(dirname): - n_epochs = 5 data = list(range(50)) @@ -728,7 +714,6 @@ def dummy_handler(engine, logger, event_name): def test_integration_as_context_manager(dirname): - n_epochs = 5 data = list(range(50)) @@ -741,7 +726,6 @@ def update_fn(engine, batch): with pytest.warns(UserWarning, match="ClearMLSaver: running in bypass mode"): ClearMLLogger.set_bypass_mode(True) with ClearMLLogger(output_uri=dirname) as clearml_logger: - trainer = Engine(update_fn) def dummy_handler(engine, logger, event_name): @@ -755,7 +739,6 @@ def dummy_handler(engine, logger, event_name): def test_clearml_logger_getattr_method(dirname): - with pytest.warns(UserWarning, match="ClearMLSaver: running in bypass mode"): ClearMLLogger.set_bypass_mode(True) @@ -777,7 +760,6 @@ def test_clearml_logger_getattr_method(dirname): def test_clearml_logger_get_task_bypass(dirname): - with 
pytest.warns(UserWarning, match="ClearMLSaver: running in bypass mode"): ClearMLLogger.set_bypass_mode(True) @@ -873,7 +855,6 @@ def test_clearml_saver_callbacks(): n_saved = 2 for i, (filename, metadata) in enumerate(zip(filenames, metadata_list)): - mock_model_info.upload_filename = filename if i >= n_saved: @@ -927,7 +908,6 @@ def forward(self, x): def _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=False): - if idist.get_rank() == 0: clearml.Task.current_task = MagicMock(spec=clearml.Task) clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock() @@ -1016,7 +996,6 @@ def update_fn(engine, batch): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_save_model_optimizer_lr_scheduler_with_state_dict(device) _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=True) @@ -1026,7 +1005,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_save_model_optimizer_lr_scheduler_with_state_dict(device) _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=True) diff --git a/tests/ignite/contrib/handlers/test_custom_events.py b/tests/ignite/contrib/handlers/test_custom_events.py new file mode 100644 index 000000000000..90decc1e2983 --- /dev/null +++ b/tests/ignite/contrib/handlers/test_custom_events.py @@ -0,0 +1,132 @@ +import math + +import pytest + +from ignite.contrib.handlers.custom_events import CustomPeriodicEvent +from ignite.engine import Engine + + +def test_bad_input(): + with pytest.warns(DeprecationWarning, 
match=r"CustomPeriodicEvent is deprecated"): + with pytest.raises(TypeError, match="Argument n_iterations should be an integer"): + CustomPeriodicEvent(n_iterations="a") + with pytest.raises(ValueError, match="Argument n_iterations should be positive"): + CustomPeriodicEvent(n_iterations=0) + with pytest.raises(TypeError, match="Argument n_iterations should be an integer"): + CustomPeriodicEvent(n_iterations=10.0) + with pytest.raises(TypeError, match="Argument n_epochs should be an integer"): + CustomPeriodicEvent(n_epochs="a") + with pytest.raises(ValueError, match="Argument n_epochs should be positive"): + CustomPeriodicEvent(n_epochs=0) + with pytest.raises(TypeError, match="Argument n_epochs should be an integer"): + CustomPeriodicEvent(n_epochs=10.0) + with pytest.raises(ValueError, match="Either n_iterations or n_epochs should be defined"): + CustomPeriodicEvent() + with pytest.raises(ValueError, match="Either n_iterations or n_epochs should be defined"): + CustomPeriodicEvent(n_iterations=1, n_epochs=2) + + +def test_new_events(): + def update(*args, **kwargs): + pass + + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + engine = Engine(update) + cpe = CustomPeriodicEvent(n_iterations=5) + cpe.attach(engine) + + assert hasattr(cpe, "Events") + assert hasattr(cpe.Events, "ITERATIONS_5_STARTED") + assert hasattr(cpe.Events, "ITERATIONS_5_COMPLETED") + + assert engine._allowed_events[-2] == getattr(cpe.Events, "ITERATIONS_5_STARTED") + assert engine._allowed_events[-1] == getattr(cpe.Events, "ITERATIONS_5_COMPLETED") + + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + cpe = CustomPeriodicEvent(n_epochs=5) + cpe.attach(engine) + + assert hasattr(cpe, "Events") + assert hasattr(cpe.Events, "EPOCHS_5_STARTED") + assert hasattr(cpe.Events, "EPOCHS_5_COMPLETED") + + assert engine._allowed_events[-2] == getattr(cpe.Events, "EPOCHS_5_STARTED") + assert engine._allowed_events[-1] == 
getattr(cpe.Events, "EPOCHS_5_COMPLETED") + + +def test_integration_iterations(): + def _test(n_iterations, max_epochs, n_iters_per_epoch): + def update(*args, **kwargs): + pass + + engine = Engine(update) + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + cpe = CustomPeriodicEvent(n_iterations=n_iterations) + cpe.attach(engine) + data = list(range(n_iters_per_epoch)) + + custom_period = [0] + n_calls_iter_started = [0] + n_calls_iter_completed = [0] + + event_started = getattr(cpe.Events, "ITERATIONS_{}_STARTED".format(n_iterations)) + + @engine.on(event_started) + def on_my_event_started(engine): + assert (engine.state.iteration - 1) % n_iterations == 0 + custom_period[0] += 1 + custom_iter = getattr(engine.state, "iterations_{}".format(n_iterations)) + assert custom_iter == custom_period[0] + n_calls_iter_started[0] += 1 + + event_completed = getattr(cpe.Events, "ITERATIONS_{}_COMPLETED".format(n_iterations)) + + @engine.on(event_completed) + def on_my_event_ended(engine): + assert engine.state.iteration % n_iterations == 0 + custom_iter = getattr(engine.state, "iterations_{}".format(n_iterations)) + assert custom_iter == custom_period[0] + n_calls_iter_completed[0] += 1 + + engine.run(data, max_epochs=max_epochs) + + n = len(data) * max_epochs / n_iterations + nf = math.floor(n) + assert custom_period[0] == n_calls_iter_started[0] + assert n_calls_iter_started[0] == nf + 1 if nf < n else nf + assert n_calls_iter_completed[0] == nf + + _test(3, 5, 16) + _test(4, 5, 16) + _test(5, 5, 16) + _test(300, 50, 1000) + + +def test_integration_epochs(): + def update(*args, **kwargs): + pass + + engine = Engine(update) + + n_epochs = 3 + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + cpe = CustomPeriodicEvent(n_epochs=n_epochs) + cpe.attach(engine) + data = list(range(16)) + + custom_period = [1] + + @engine.on(cpe.Events.EPOCHS_3_STARTED) + def on_my_epoch_started(engine): + assert 
(engine.state.epoch - 1) % n_epochs == 0 + assert engine.state.epochs_3 == custom_period[0] + + @engine.on(cpe.Events.EPOCHS_3_COMPLETED) + def on_my_epoch_ended(engine): + assert engine.state.epoch % n_epochs == 0 + assert engine.state.epochs_3 == custom_period[0] + custom_period[0] += 1 + + engine.run(data, max_epochs=10) + + assert custom_period[0] == 4 diff --git a/tests/ignite/contrib/handlers/test_mlflow_logger.py b/tests/ignite/contrib/handlers/test_mlflow_logger.py index 4b8966881673..04bed3e7b912 100644 --- a/tests/ignite/contrib/handlers/test_mlflow_logger.py +++ b/tests/ignite/contrib/handlers/test_mlflow_logger.py @@ -14,7 +14,6 @@ def test_output_handler_with_wrong_logger_type(): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock() @@ -24,7 +23,6 @@ def test_output_handler_with_wrong_logger_type(): def test_output_handler_output_transform(): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock(spec=MLflowLogger) mock_logger.log_metrics = MagicMock() @@ -47,7 +45,6 @@ def test_output_handler_output_transform(): def test_output_handler_metric_names(): - wrapper = OutputHandler("tag", metric_names=["a", "b", "c"]) mock_logger = MagicMock(spec=MLflowLogger) mock_logger.log_metrics = MagicMock() @@ -94,7 +91,6 @@ def test_output_handler_metric_names(): def test_output_handler_both(): - wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x}) mock_logger = MagicMock(spec=MLflowLogger) mock_logger.log_metrics = MagicMock() @@ -145,7 +141,6 @@ def global_step_transform(*args, **kwargs): def test_output_handler_with_global_step_from_engine(): - mock_another_engine = MagicMock() mock_another_engine.state = State() mock_another_engine.state.epoch = 10 @@ -201,7 +196,6 @@ def test_output_handler_state_attrs(): def test_optimizer_params_handler_wrong_setup(): - with pytest.raises(TypeError): OptimizerParamsHandler(optimizer=None) @@ -215,7 +209,6 @@ def 
test_optimizer_params_handler_wrong_setup(): def test_optimizer_params(): - optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01) wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr") mock_logger = MagicMock(spec=MLflowLogger) @@ -237,7 +230,6 @@ def test_optimizer_params(): @pytest.mark.skipif(sys.platform.startswith("win"), reason="Skip on Windows") def test_integration(dirname): - n_epochs = 5 data = list(range(50)) @@ -279,7 +271,6 @@ def dummy_handler(engine, logger, event_name): @pytest.mark.skipif(sys.platform.startswith("win"), reason="Skip on Windows") def test_integration_as_context_manager(dirname): - n_epochs = 5 data = list(range(50)) @@ -292,7 +283,6 @@ def update_fn(engine, batch): true_values = [] with MLflowLogger(str(dirname / "mlruns")) as mlflow_logger: - trainer = Engine(update_fn) def dummy_handler(engine, logger, event_name): @@ -324,7 +314,6 @@ def test_mlflow_bad_metric_name_handling(dirname): true_values = [123.0, 23.4, 333.4] with MLflowLogger(str(dirname / "mlruns")) as mlflow_logger: - active_run = mlflow.active_run() handler = OutputHandler(tag="training", metric_names="all") @@ -332,7 +321,6 @@ def test_mlflow_bad_metric_name_handling(dirname): engine.state = State(metrics={"metric:0 in %": 123.0, "metric 0": 1000.0}) with pytest.warns(UserWarning, match=r"MLflowLogger output_handler encountered an invalid metric name"): - engine.state.epoch = 1 handler(engine, mlflow_logger, event_name=Events.EPOCH_COMPLETED) @@ -352,6 +340,5 @@ def test_mlflow_bad_metric_name_handling(dirname): @pytest.mark.parametrize("no_site_packages", ["mlflow"], indirect=True) def test_no_mlflow_client(no_site_packages): - with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires mlflow to be installed."): MLflowLogger() diff --git a/tests/ignite/contrib/handlers/test_neptune_logger.py b/tests/ignite/contrib/handlers/test_neptune_logger.py index 4a428b14eff0..84d91c75577e 100644 --- 
a/tests/ignite/contrib/handlers/test_neptune_logger.py +++ b/tests/ignite/contrib/handlers/test_neptune_logger.py @@ -488,7 +488,6 @@ def dummy_handler(engine, logger, event_name): def test_neptune_saver_serializable(dirname): - mock_logger = MagicMock(spec=NeptuneLogger) mock_logger.upload = MagicMock() model = torch.nn.Module() @@ -503,7 +502,6 @@ def test_neptune_saver_serializable(dirname): @pytest.mark.parametrize("model, serializable", [(lambda x: x, False), (torch.nn.Module().to("cpu"), True)]) def test_neptune_saver(model, serializable): - mock_logger = MagicMock(spec=NeptuneLogger) mock_logger.upload = MagicMock() diff --git a/tests/ignite/contrib/handlers/test_polyaxon_logger.py b/tests/ignite/contrib/handlers/test_polyaxon_logger.py index 940a3c838d96..1d025da036ca 100644 --- a/tests/ignite/contrib/handlers/test_polyaxon_logger.py +++ b/tests/ignite/contrib/handlers/test_polyaxon_logger.py @@ -16,7 +16,6 @@ def test_output_handler_with_wrong_logger_type(): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock() @@ -26,7 +25,6 @@ def test_output_handler_with_wrong_logger_type(): def test_output_handler_output_transform(): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock(spec=PolyaxonLogger) mock_logger.log_metrics = MagicMock() @@ -49,7 +47,6 @@ def test_output_handler_output_transform(): def test_output_handler_metric_names(): - wrapper = OutputHandler("tag", metric_names=["a", "b", "c"]) mock_logger = MagicMock(spec=PolyaxonLogger) mock_logger.log_metrics = MagicMock() @@ -110,7 +107,6 @@ def test_output_handler_metric_names(): def test_output_handler_both(): - wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x}) mock_logger = MagicMock(spec=PolyaxonLogger) mock_logger.log_metrics = MagicMock() @@ -161,7 +157,6 @@ def global_step_transform(*args, **kwargs): def test_output_handler_with_global_step_from_engine(): - mock_another_engine = 
MagicMock() mock_another_engine.state = State() mock_another_engine.state.epoch = 10 @@ -217,7 +212,6 @@ def test_output_handler_state_attrs(): def test_optimizer_params_handler_wrong_setup(): - with pytest.raises(TypeError): OptimizerParamsHandler(optimizer=None) @@ -231,7 +225,6 @@ def test_optimizer_params_handler_wrong_setup(): def test_optimizer_params(): - optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01) wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr") mock_logger = MagicMock(spec=PolyaxonLogger) @@ -252,7 +245,6 @@ def test_optimizer_params(): def test_integration(): - n_epochs = 5 data = list(range(50)) @@ -277,7 +269,6 @@ def dummy_handler(engine, logger, event_name): def test_integration_as_context_manager(): - n_epochs = 5 data = list(range(50)) @@ -288,7 +279,6 @@ def update_fn(engine, batch): return next(losses_iter) with PolyaxonLogger() as plx_logger: - trainer = Engine(update_fn) def dummy_handler(engine, logger, event_name): @@ -302,6 +292,5 @@ def dummy_handler(engine, logger, event_name): @pytest.mark.parametrize("no_site_packages", ["polyaxon"], indirect=True) def test_no_polyaxon_client(no_site_packages): - with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires polyaxon"): PolyaxonLogger() diff --git a/tests/ignite/contrib/handlers/test_tensorboard_logger.py b/tests/ignite/contrib/handlers/test_tensorboard_logger.py index 60c8a1f4483c..7effd41f046a 100644 --- a/tests/ignite/contrib/handlers/test_tensorboard_logger.py +++ b/tests/ignite/contrib/handlers/test_tensorboard_logger.py @@ -19,7 +19,6 @@ def test_optimizer_params_handler_wrong_setup(): - with pytest.raises(TypeError): OptimizerParamsHandler(optimizer=None) @@ -44,7 +43,6 @@ def test_getattr_method(): def test_optimizer_params(): - optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01) wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr") mock_logger = MagicMock(spec=TensorboardLogger) @@ -65,7 +63,6 @@ def 
test_optimizer_params(): def test_output_handler_with_wrong_logger_type(): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock() @@ -75,7 +72,6 @@ def test_output_handler_with_wrong_logger_type(): def test_output_handler_output_transform(): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock(spec=TensorboardLogger) mock_logger.writer = MagicMock() @@ -98,7 +94,6 @@ def test_output_handler_output_transform(): def test_output_handler_metric_names(): - wrapper = OutputHandler("tag", metric_names=["a", "b"]) mock_logger = MagicMock(spec=TensorboardLogger) mock_logger.writer = MagicMock() @@ -176,7 +171,6 @@ def test_output_handler_metric_names(): def test_output_handler_both(): - wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x}) mock_logger = MagicMock(spec=TensorboardLogger) mock_logger.writer = MagicMock() @@ -212,7 +206,6 @@ def global_step_transform(*args, **kwargs): def test_output_handler_with_global_step_from_engine(): - mock_another_engine = MagicMock() mock_another_engine.state = State() mock_another_engine.state.epoch = 10 @@ -267,7 +260,6 @@ def global_step_transform(*args, **kwargs): def test_weights_scalar_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) wrapper = WeightsScalarHandler(model) mock_logger = MagicMock() @@ -277,7 +269,6 @@ def test_weights_scalar_handler_wrong_setup(): def test_weights_scalar_handler(dummy_model_factory): - model = dummy_model_factory(with_grads=True, with_frozen_layer=False) # define test wrapper to test with and without optional tag @@ -310,7 +301,6 @@ def _test(tag=None): def test_weights_scalar_handler_whitelist(dummy_model_factory): - model = dummy_model_factory() wrapper = WeightsScalarHandler(model, whitelist=["fc2.weight"]) @@ -355,7 +345,6 @@ def weight_selector(n, _): def test_weights_hist_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) wrapper = 
WeightsHistHandler(model) mock_logger = MagicMock() @@ -365,7 +354,6 @@ def test_weights_hist_handler_wrong_setup(): def test_weights_hist_handler(dummy_model_factory): - model = dummy_model_factory(with_grads=True, with_frozen_layer=False) # define test wrapper to test with and without optional tag @@ -442,7 +430,6 @@ def weight_selector(n, _): def test_grads_scalar_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) wrapper = GradsScalarHandler(model) mock_logger = MagicMock() @@ -530,7 +517,6 @@ def weight_selector(n, _): def test_grads_hist_handler_wrong_setup(): - model = MagicMock(spec=torch.nn.Module) wrapper = GradsHistHandler(model) mock_logger = MagicMock() @@ -616,7 +602,6 @@ def weight_selector(n, _): def test_integration(dirname): - n_epochs = 5 data = list(range(50)) @@ -646,7 +631,6 @@ def dummy_handler(engine, logger, event_name): def test_integration_as_context_manager(dirname): - n_epochs = 5 data = list(range(50)) @@ -657,7 +641,6 @@ def update_fn(engine, batch): return next(losses_iter) with TensorboardLogger(log_dir=dirname) as tb_logger: - trainer = Engine(update_fn) def dummy_handler(engine, logger, event_name): diff --git a/tests/ignite/contrib/handlers/test_tqdm_logger.py b/tests/ignite/contrib/handlers/test_tqdm_logger.py index 23068e85b3b1..81522b1d0a67 100644 --- a/tests/ignite/contrib/handlers/test_tqdm_logger.py +++ b/tests/ignite/contrib/handlers/test_tqdm_logger.py @@ -9,7 +9,7 @@ import torch from packaging.version import Version -from ignite.contrib.handlers import ProgressBar +from ignite.contrib.handlers import CustomPeriodicEvent, ProgressBar from ignite.engine import Engine, Events from ignite.handlers import TerminateOnNan from ignite.metrics import RunningAverage @@ -41,7 +41,6 @@ def test_pbar_errors(): def test_pbar(capsys): - n_epochs = 2 loader = [1, 2] engine = Engine(update_fn) @@ -143,7 +142,6 @@ def print_iter(_): def test_pbar_with_metric(capsys): - n_iters = 2 data = list(range(n_iters)) loss_values = 
iter(range(n_iters)) @@ -174,7 +172,6 @@ def step(engine, batch): def test_pbar_with_all_metric(capsys): - n_iters = 2 data = list(range(n_iters)) loss_values = iter(range(n_iters)) @@ -208,7 +205,6 @@ def step(engine, batch): def test_pbar_with_state_attrs(capsys): - n_iters = 2 data = list(range(n_iters)) loss_values = iter(range(n_iters)) @@ -246,7 +242,6 @@ def step(engine, batch): def test_pbar_no_metric_names(capsys): - n_epochs = 2 loader = [1, 2] engine = Engine(update_fn) @@ -412,7 +407,6 @@ def update_fn(engine, batch): def test_pbar_on_epochs(capsys): - n_epochs = 10 loader = [1, 2, 3, 4, 5] engine = Engine(update_fn) @@ -452,7 +446,6 @@ def test_pbar_with_max_epochs_set_to_one(capsys): def test_pbar_wrong_events_order(): - engine = Engine(update_fn) pbar = ProgressBar() @@ -475,6 +468,16 @@ def test_pbar_wrong_events_order(): pbar.attach(engine, event_name=Events.ITERATION_STARTED, closing_event_name=Events.EPOCH_COMPLETED(every=10)) +def test_pbar_on_custom_events(capsys): + engine = Engine(update_fn) + pbar = ProgressBar() + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + cpe = CustomPeriodicEvent(n_iterations=15) + + with pytest.raises(ValueError, match=r"not in allowed events for this engine"): + pbar.attach(engine, event_name=cpe.Events.ITERATIONS_15_COMPLETED, closing_event_name=Events.EPOCH_COMPLETED) + + def test_pbar_with_nan_input(): def update(engine, batch): x = batch @@ -504,7 +507,6 @@ def create_engine(): def test_pbar_on_callable_events(capsys): - n_epochs = 1 loader = list(range(100)) engine = Engine(update_fn) @@ -539,7 +541,6 @@ def test_tqdm_logger_epoch_length(capsys): def test_tqdm_logger_iter_without_epoch_length(capsys): - size = 11 def finite_size_data_iter(size): diff --git a/tests/ignite/contrib/handlers/test_visdom_logger.py b/tests/ignite/contrib/handlers/test_visdom_logger.py index 1b980ffbac40..39db6558bd4f 100644 --- a/tests/ignite/contrib/handlers/test_visdom_logger.py +++ 
b/tests/ignite/contrib/handlers/test_visdom_logger.py @@ -17,7 +17,6 @@ def test_optimizer_params_handler_wrong_setup(): - with pytest.raises(TypeError): OptimizerParamsHandler(optimizer=None) @@ -31,7 +30,6 @@ def test_optimizer_params_handler_wrong_setup(): def test_optimizer_params(): - optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01) wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr") mock_logger = MagicMock(spec=VisdomLogger) @@ -79,7 +77,6 @@ def test_optimizer_params(): def test_output_handler_with_wrong_logger_type(): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock() @@ -89,7 +86,6 @@ def test_output_handler_with_wrong_logger_type(): def test_output_handler_output_transform(dirname): - wrapper = OutputHandler("tag", output_transform=lambda x: x) mock_logger = MagicMock(spec=VisdomLogger) mock_logger.vis = MagicMock() @@ -137,7 +133,6 @@ def test_output_handler_output_transform(dirname): def test_output_handler_metric_names(dirname): - wrapper = OutputHandler("tag", metric_names=["a", "b"]) mock_logger = MagicMock(spec=VisdomLogger) mock_logger.vis = MagicMock() @@ -314,7 +309,6 @@ def test_output_handler_metric_names(dirname): def test_output_handler_both(dirname): - wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x}) mock_logger = MagicMock(spec=VisdomLogger) mock_logger.vis = MagicMock() @@ -543,7 +537,6 @@ def global_step_transform(*args, **kwargs): def test_output_handler_with_global_step_from_engine(): - mock_another_engine = MagicMock() mock_another_engine.state = State() mock_another_engine.state.epoch = 10 @@ -605,7 +598,6 @@ def test_output_handler_with_global_step_from_engine(): def test_weights_scalar_handler_wrong_setup(): - with pytest.raises(TypeError, match="Argument model should be of type torch.nn.Module"): WeightsScalarHandler(None) @@ -770,7 +762,6 @@ def norm(x): def test_grads_scalar_handler_wrong_setup(): - with 
pytest.raises(TypeError, match="Argument model should be of type torch.nn.Module"): GradsScalarHandler(None) @@ -852,7 +843,6 @@ def _test(tag=None): @pytest.mark.skipif(sys.platform.startswith("win"), reason="Skip on Windows") def test_integration_no_server(): - with pytest.raises(ConnectionError, match="Error connecting to Visdom server"): VisdomLogger() @@ -958,7 +948,6 @@ def update_fn(engine, batch): @pytest.mark.skipif(sys.platform.startswith("win"), reason="Skip on Windows") def test_integration_with_executor_as_context_manager(visdom_server, visdom_server_stop): - n_epochs = 5 data = list(range(50)) @@ -969,7 +958,6 @@ def update_fn(engine, batch): return next(losses_iter) with VisdomLogger(server=visdom_server[0], port=visdom_server[1], num_workers=1) as vd_logger: - # close all windows in 'main' environment vd_logger.vis.close() @@ -994,7 +982,6 @@ def update_fn(engine, batch): @pytest.mark.parametrize("no_site_packages", ["visdom"], indirect=True) def test_no_visdom(no_site_packages): - with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires visdom package"): VisdomLogger() diff --git a/tests/ignite/contrib/handlers/test_wandb_logger.py b/tests/ignite/contrib/handlers/test_wandb_logger.py index 102d057281c3..821035568381 100644 --- a/tests/ignite/contrib/handlers/test_wandb_logger.py +++ b/tests/ignite/contrib/handlers/test_wandb_logger.py @@ -208,7 +208,6 @@ def global_step_transform(*args, **kwargs): def test_output_handler_with_global_step_from_engine(): - mock_another_engine = MagicMock() mock_another_engine.state = State() mock_another_engine.state.epoch = 10 @@ -283,7 +282,6 @@ def test_wandb_close(): @pytest.mark.parametrize("no_site_packages", ["wandb"], indirect=True) def test_no_wandb_client(no_site_packages): - with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires wandb to be installed."): WandBLogger() diff --git a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py 
b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py index 912e3d22560a..93e2546aa820 100644 --- a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py +++ b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py @@ -132,7 +132,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() canberra = DistanceMetric.get_metric("canberra") @@ -186,7 +185,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -195,7 +193,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -205,7 +202,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -217,7 +213,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() 
_test_distrib_compute(device) _test_distrib_integration(device) @@ -227,7 +222,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py index a0bc1b30b05d..ef9784697c51 100644 --- a/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py @@ -102,7 +102,6 @@ def get_test_cases(): def _test_distrib_compute(device): - rank = idist.get_rank() def _test(metric_device): @@ -135,7 +134,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -192,7 +190,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -201,7 +198,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -222,7 +218,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not 
idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -232,7 +227,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py index 252313da4e86..105e7fe4aac1 100644 --- a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py +++ b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py @@ -142,7 +142,6 @@ def _test(metric_device): def _test_distrib_integration(device, tol=1e-5): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -199,7 +198,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -208,7 +206,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) 
@@ -218,7 +215,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -230,7 +226,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -240,7 +235,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py index 841d14584229..e9d6e42ccf7a 100644 --- a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py @@ -142,7 +142,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -197,7 +196,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip 
if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -206,7 +204,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -216,7 +213,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -237,7 +233,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_geometric_mean_relative_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_geometric_mean_relative_absolute_error.py index 5a03a0dfbb87..ccc7c28de2a5 100644 --- a/tests/ignite/contrib/metrics/regression/test_geometric_mean_relative_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_geometric_mean_relative_absolute_error.py @@ -46,7 +46,6 @@ def 
test_compute(): def test_integration(): - y_pred = torch.rand(size=(100,)) y = torch.rand(size=(100,)) @@ -77,7 +76,6 @@ def update_fn(engine, batch): def _test_distrib_compute(device): - rank = idist.get_rank() def _test(metric_device): @@ -107,7 +105,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() torch.manual_seed(12) @@ -161,7 +158,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -170,7 +166,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -191,7 +186,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -201,7 +195,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) diff --git 
a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py index ae7606f18372..5b5090d90807 100644 --- a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py +++ b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py @@ -132,7 +132,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() manhattan = DistanceMetric.get_metric("manhattan") @@ -187,7 +186,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -196,7 +194,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -206,7 +203,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -218,7 +214,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def 
test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -228,7 +223,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py index 87d94cde4c85..fe6ba11bb007 100644 --- a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py @@ -130,7 +130,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -183,7 +182,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -192,7 +190,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -202,7 +199,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not 
idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -214,7 +210,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -224,7 +219,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py index 56bf6f84628d..81b1fbbbe05a 100644 --- a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py +++ b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py @@ -152,7 +152,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -207,7 +206,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def 
test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -216,7 +214,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -226,7 +223,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -238,7 +234,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -248,7 +243,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_mean_error.py 
b/tests/ignite/contrib/metrics/regression/test_mean_error.py index 0a8894621da3..39f90f011833 100644 --- a/tests/ignite/contrib/metrics/regression/test_mean_error.py +++ b/tests/ignite/contrib/metrics/regression/test_mean_error.py @@ -99,7 +99,6 @@ def get_test_cases(): def _test_distrib_compute(device): - rank = idist.get_rank() def _test(metric_device): @@ -131,7 +130,6 @@ def _test(metric_device): def _test_distrib_integration(device, tol=1e-5): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -186,7 +184,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -195,7 +192,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -216,7 +212,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -226,7 +221,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def 
test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py index ccfa4650821a..7177b01e8c17 100644 --- a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py +++ b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py @@ -146,7 +146,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -201,7 +200,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -210,7 +208,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -220,7 +217,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -232,7 +228,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") 
@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -242,7 +237,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py index e638abbfa8fc..615d90fbeb1d 100644 --- a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py @@ -35,7 +35,6 @@ def test_wrong_input_shapes(): def test_median_absolute_error(): - # See https://github.com/torch/torch7/pull/182 # For even number of elements, PyTorch returns middle element # NumPy returns average of middle elements @@ -57,7 +56,6 @@ def test_median_absolute_error(): def test_median_absolute_error_2(): - np.random.seed(1) size = 105 np_y_pred = np.random.rand(size, 1) @@ -80,7 +78,6 @@ def test_median_absolute_error_2(): def test_integration_median_absolute_error(): - np.random.seed(1) size = 105 np_y_pred = np.random.rand(size, 1) @@ -192,7 +189,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) 
_test_distrib_integration(device) @@ -201,7 +197,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -211,7 +206,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -223,7 +217,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -233,7 +226,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py index ea6ec1b67c42..2973a28f193c 100644 --- 
a/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py @@ -35,7 +35,6 @@ def test_wrong_input_shapes(): def test_median_absolute_percentage_error(): - # See https://github.com/torch/torch7/pull/182 # For even number of elements, PyTorch returns middle element # NumPy returns average of middle elements @@ -57,7 +56,6 @@ def test_median_absolute_percentage_error(): def test_median_absolute_percentage_error_2(): - np.random.seed(1) size = 105 np_y_pred = np.random.rand(size, 1) @@ -80,7 +78,6 @@ def test_median_absolute_percentage_error_2(): def test_integration_median_absolute_percentage_error(): - np.random.seed(1) size = 105 np_y_pred = np.random.rand(size, 1) @@ -193,7 +190,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -202,7 +198,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -212,7 +207,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -224,7 +218,6 
@@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -234,7 +227,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py index 92a6e0591300..a43c46c307e3 100644 --- a/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py @@ -35,7 +35,6 @@ def test_wrong_input_shapes(): def test_median_relative_absolute_error(): - # See https://github.com/torch/torch7/pull/182 # For even number of elements, PyTorch returns middle element # NumPy returns average of middle elements @@ -57,7 +56,6 @@ def test_median_relative_absolute_error(): def test_median_relative_absolute_error_2(): - np.random.seed(1) size = 105 np_y_pred = np.random.rand(size, 1) @@ -80,7 +78,6 @@ def test_median_relative_absolute_error_2(): def test_integration_median_relative_absolute_error_with_output_transform(): - np.random.seed(1) size = 105 np_y_pred = np.random.rand(size, 1) @@ -193,7 +190,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, 
reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -202,7 +198,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -212,7 +207,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -224,7 +218,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -234,7 +227,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) 
_test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_r2_score.py b/tests/ignite/contrib/metrics/regression/test_r2_score.py index 95e131d1d33e..86113c4b532d 100644 --- a/tests/ignite/contrib/metrics/regression/test_r2_score.py +++ b/tests/ignite/contrib/metrics/regression/test_r2_score.py @@ -28,7 +28,6 @@ def test_wrong_input_shapes(): def test_r2_score(): - size = 51 np_y_pred = np.random.rand(size) np_y = np.random.rand(size) @@ -44,7 +43,6 @@ def test_r2_score(): def test_r2_score_2(): - np.random.seed(1) size = 105 np_y_pred = np.random.rand(size, 1) @@ -66,7 +64,6 @@ def test_r2_score_2(): def test_integration_r2_score(): - np.random.seed(1) size = 105 np_y_pred = np.random.rand(size, 1) @@ -121,7 +118,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -173,7 +169,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -182,7 +177,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -192,7 +186,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() 
else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -204,7 +197,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -214,7 +206,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py b/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py index 4d92a611bd2c..bb615adb086c 100644 --- a/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py +++ b/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py @@ -115,7 +115,6 @@ def _test(metric_device): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -168,7 +167,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -177,7 +175,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not 
idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -187,7 +184,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -199,7 +195,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -209,7 +204,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/test_average_precision.py b/tests/ignite/contrib/metrics/test_average_precision.py index 22ff66533dc7..7a943ae855e4 100644 --- a/tests/ignite/contrib/metrics/test_average_precision.py +++ b/tests/ignite/contrib/metrics/test_average_precision.py @@ -63,102 +63,89 @@ def test_check_shape(): ap._check_shape((torch.rand(4, 3), torch.rand(4, 3, 1))) -def 
test_binary_and_multilabel_inputs(): +@pytest.fixture(params=[item for item in range(8)]) +def test_data_binary_and_multilabel(request): + return [ + # Binary input data of shape (N,) or (N, 1) + (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 1), + (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 1), + # updated batches + (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16), + (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16), + # Binary input data of shape (N, L) + (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 1), + (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 1), + # updated batches + (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 16), + (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 16), + ][request.param] + + +@pytest.mark.parametrize("n_times", range(5)) +def test_binary_and_multilabel_inputs(n_times, test_data_binary_and_multilabel): + y_pred, y, batch_size = test_data_binary_and_multilabel ap = AveragePrecision() + ap.reset() + if batch_size > 1: + n_iters = y.shape[0] // batch_size + 1 + for i in range(n_iters): + idx = i * batch_size + ap.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) + else: + ap.update((y_pred, y)) - def _test(y_pred, y, batch_size): - ap.reset() - if batch_size > 1: - n_iters = y.shape[0] // batch_size + 1 - for i in range(n_iters): - idx = i * batch_size - ap.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) - else: - ap.update((y_pred, y)) - - np_y = y.numpy() - np_y_pred = y_pred.numpy() - - res = ap.compute() - assert isinstance(res, float) - assert average_precision_score(np_y, np_y_pred) == pytest.approx(res) - - def get_test_cases(): - - test_cases = [ - # Binary input data of shape (N,) or (N, 1) - 
(torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 1), - (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 1), - # updated batches - (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16), - (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16), - # Binary input data of shape (N, L) - (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 1), - (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 1), - # updated batches - (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 16), - (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 16), - ] - - return test_cases + np_y = y.numpy() + np_y_pred = y_pred.numpy() - for _ in range(5): - # check multiple random inputs as random exact occurencies are rare - test_cases = get_test_cases() - for y_pred, y, batch_size in test_cases: - _test(y_pred, y, batch_size) + res = ap.compute() + assert isinstance(res, float) + assert average_precision_score(np_y, np_y_pred) == pytest.approx(res) -def test_integration_binary_and_mulitlabel_inputs(): - def _test(y_pred, y, batch_size): - def update_fn(engine, batch): - idx = (engine.state.iteration - 1) * batch_size - y_true_batch = np_y[idx : idx + batch_size] - y_pred_batch = np_y_pred[idx : idx + batch_size] - return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) +@pytest.fixture(params=[item for item in range(4)]) +def test_data_integration_binary_and_multilabel(request): + return [ + # Binary input data of shape (N,) or (N, 1) + (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 10), + (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 10), + # Binary input data of shape (N, L) + (torch.randint(0, 2, size=(100, 3)).long(), torch.randint(0, 2, 
size=(100, 3)).long(), 10), + (torch.randint(0, 2, size=(100, 4)).long(), torch.randint(0, 2, size=(100, 4)).long(), 10), + ][request.param] - engine = Engine(update_fn) - ap_metric = AveragePrecision() - ap_metric.attach(engine, "ap") +@pytest.mark.parametrize("n_times", range(5)) +def test_integration_binary_and_mulitlabel_inputs(n_times, test_data_integration_binary_and_multilabel): + y_pred, y, batch_size = test_data_integration_binary_and_multilabel - np_y = y.numpy() - np_y_pred = y_pred.numpy() + def update_fn(engine, batch): + idx = (engine.state.iteration - 1) * batch_size + y_true_batch = np_y[idx : idx + batch_size] + y_pred_batch = np_y_pred[idx : idx + batch_size] + return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) - np_ap = average_precision_score(np_y, np_y_pred) + engine = Engine(update_fn) - data = list(range(y_pred.shape[0] // batch_size)) - ap = engine.run(data, max_epochs=1).metrics["ap"] + ap_metric = AveragePrecision() + ap_metric.attach(engine, "ap") - assert isinstance(ap, float) - assert np_ap == pytest.approx(ap) + np_y = y.numpy() + np_y_pred = y_pred.numpy() - def get_test_cases(): + np_ap = average_precision_score(np_y, np_y_pred) - test_cases = [ - # Binary input data of shape (N,) or (N, 1) - (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 10), - (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 10), - # Binary input data of shape (N, L) - (torch.randint(0, 2, size=(100, 3)).long(), torch.randint(0, 2, size=(100, 3)).long(), 10), - (torch.randint(0, 2, size=(100, 4)).long(), torch.randint(0, 2, size=(100, 4)).long(), 10), - ] - return test_cases + data = list(range(y_pred.shape[0] // batch_size)) + ap = engine.run(data, max_epochs=1).metrics["ap"] - for _ in range(5): - # check multiple random inputs as random exact occurencies are rare - test_cases = get_test_cases() - for y_pred, y, batch_size in test_cases: - _test(y_pred, y, batch_size) + 
assert isinstance(ap, float) + assert np_ap == pytest.approx(ap) def _test_distrib_binary_and_multilabel_inputs(device): - rank = idist.get_rank() torch.manual_seed(12) def _test(y_pred, y, batch_size, metric_device): - metric_device = torch.device(metric_device) ap = AveragePrecision(device=metric_device) torch.manual_seed(10 + rank) @@ -185,7 +172,6 @@ def _test(y_pred, y, batch_size, metric_device): assert average_precision_score(np_y, np_y_pred) == pytest.approx(res) def get_test_cases(): - test_cases = [ # Binary input data of shape (N,) or (N, 1) (torch.randint(0, 2, size=(10,)).long(), torch.randint(0, 2, size=(10,)).long(), 1), @@ -213,7 +199,6 @@ def get_test_cases(): def _test_distrib_integration_binary_input(device): - rank = idist.get_rank() n_iters = 80 batch_size = 16 @@ -283,7 +268,6 @@ def update_fn(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -292,7 +276,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -302,7 +285,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 
if not torch.cuda.is_available() else torch.cuda.device_count() @@ -314,7 +296,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -324,7 +305,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -334,7 +314,6 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) diff --git a/tests/ignite/contrib/metrics/test_cohen_kappa.py b/tests/ignite/contrib/metrics/test_cohen_kappa.py index ea199cd624d1..fa73a84cdfae 100644 --- a/tests/ignite/contrib/metrics/test_cohen_kappa.py +++ b/tests/ignite/contrib/metrics/test_cohen_kappa.py @@ -71,44 +71,38 @@ def test_cohen_kappa_wrong_weights_type(): ck = CohenKappa(weights="dd") -@pytest.mark.parametrize("weights", [None, "linear", "quadratic"]) -def test_binary_input(weights): +@pytest.fixture(params=range(4)) +def 
test_data_binary(request): + return [ + # Binary input data of shape (N,) or (N, 1) + (torch.randint(0, 2, size=(10,)).long(), torch.randint(0, 2, size=(10,)).long(), 1), + (torch.randint(0, 2, size=(10, 1)).long(), torch.randint(0, 2, size=(10, 1)).long(), 1), + # updated batches + (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16), + (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16), + ][request.param] - ck = CohenKappa(weights) - def _test(y_pred, y, batch_size): - ck.reset() - if batch_size > 1: - n_iters = y.shape[0] // batch_size + 1 - for i in range(n_iters): - idx = i * batch_size - ck.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) - else: - ck.update((y_pred, y)) +@pytest.mark.parametrize("n_times", range(5)) +@pytest.mark.parametrize("weights", [None, "linear", "quadratic"]) +def test_binary_input(n_times, weights, test_data_binary): + y_pred, y, batch_size = test_data_binary + ck = CohenKappa(weights) + ck.reset() + if batch_size > 1: + n_iters = y.shape[0] // batch_size + 1 + for i in range(n_iters): + idx = i * batch_size + ck.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) + else: + ck.update((y_pred, y)) - np_y = y.numpy() - np_y_pred = y_pred.numpy() + np_y = y.numpy() + np_y_pred = y_pred.numpy() - res = ck.compute() - assert isinstance(res, float) - assert cohen_kappa_score(np_y, np_y_pred, weights=weights) == pytest.approx(res) - - def get_test_cases(): - test_cases = [ - # Binary input data of shape (N,) or (N, 1) - (torch.randint(0, 2, size=(10,)).long(), torch.randint(0, 2, size=(10,)).long(), 1), - (torch.randint(0, 2, size=(10, 1)).long(), torch.randint(0, 2, size=(10, 1)).long(), 1), - # updated batches - (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16), - (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16), - ] - return test_cases - - for _ in range(5): - 
# check multiple random inputs as random exact occurencies are rare - test_cases = get_test_cases() - for y_pred, y, batch_size in test_cases: - _test(y_pred, y, batch_size) + res = ck.compute() + assert isinstance(res, float) + assert cohen_kappa_score(np_y, np_y_pred, weights=weights) == pytest.approx(res) def test_multilabel_inputs(): @@ -130,52 +124,47 @@ def test_multilabel_inputs(): ck.compute() -@pytest.mark.parametrize("weights", [None, "linear", "quadratic"]) -def test_integration_binary_input(weights): - def _test(y_pred, y, batch_size): - def update_fn(engine, batch): - idx = (engine.state.iteration - 1) * batch_size - y_true_batch = np_y[idx : idx + batch_size] - y_pred_batch = np_y_pred[idx : idx + batch_size] - return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) +@pytest.fixture(params=range(2)) +def test_data_integration_binary(request): + return [ + # Binary input data of shape (N,) or (N, 1) + (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 10), + (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 10), + ][request.param] - engine = Engine(update_fn) - ck_metric = CohenKappa(weights=weights) - ck_metric.attach(engine, "ck") +@pytest.mark.parametrize("n_times", range(5)) +@pytest.mark.parametrize("weights", [None, "linear", "quadratic"]) +def test_integration_binary_input(n_times, weights, test_data_integration_binary): + y_pred, y, batch_size = test_data_integration_binary - np_y = y.numpy() - np_y_pred = y_pred.numpy() + def update_fn(engine, batch): + idx = (engine.state.iteration - 1) * batch_size + y_true_batch = np_y[idx : idx + batch_size] + y_pred_batch = np_y_pred[idx : idx + batch_size] + return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) - np_ck = cohen_kappa_score(np_y, np_y_pred, weights=weights) + engine = Engine(update_fn) - data = list(range(y_pred.shape[0] // batch_size)) - ck = engine.run(data, max_epochs=1).metrics["ck"] + 
ck_metric = CohenKappa(weights=weights) + ck_metric.attach(engine, "ck") - assert isinstance(ck, float) - assert np_ck == pytest.approx(ck) + np_y = y.numpy() + np_y_pred = y_pred.numpy() - def get_test_cases(): - test_cases = [ - # Binary input data of shape (N,) or (N, 1) - (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 10), - (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 10), - ] - return test_cases + np_ck = cohen_kappa_score(np_y, np_y_pred, weights=weights) - for _ in range(5): - # check multiple random inputs as random exact occurencies are rare - test_cases = get_test_cases() - for y_pred, y, batch_size in test_cases: - _test(y_pred, y, batch_size) + data = list(range(y_pred.shape[0] // batch_size)) + ck = engine.run(data, max_epochs=1).metrics["ck"] + assert isinstance(ck, float) + assert np_ck == pytest.approx(ck) -def _test_distrib_binary_input(device): +def _test_distrib_binary_input(device): rank = idist.get_rank() def _test(y_pred, y, batch_size, metric_device): - metric_device = torch.device(metric_device) ck = CohenKappa(device=metric_device) @@ -220,7 +209,6 @@ def get_test_cases(): def _test_distrib_integration_binary_input(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -274,7 +262,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @@ -283,7 +270,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() 
_test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @@ -293,7 +279,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -305,7 +290,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @@ -315,7 +299,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @@ -325,14 +308,12 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): - device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) def _test_distrib_xla_nprocs(index): - device = idist.device() 
_test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) diff --git a/tests/ignite/contrib/metrics/test_precision_recall_curve.py b/tests/ignite/contrib/metrics/test_precision_recall_curve.py index 6ad007747ed4..1eaf8ddc8b3c 100644 --- a/tests/ignite/contrib/metrics/test_precision_recall_curve.py +++ b/tests/ignite/contrib/metrics/test_precision_recall_curve.py @@ -141,7 +141,6 @@ def _test_distrib_compute(device): rank = idist.get_rank() def _test(y_pred, y, batch_size, metric_device): - metric_device = torch.device(metric_device) prc = PrecisionRecallCurve(device=metric_device) @@ -191,7 +190,6 @@ def get_test_cases(): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -249,7 +247,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -258,7 +255,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -268,7 +264,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -280,7 +275,6 @@ def 
test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -290,7 +284,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/test_roc_auc.py b/tests/ignite/contrib/metrics/test_roc_auc.py index a29fc73dd668..dcc14aaba301 100644 --- a/tests/ignite/contrib/metrics/test_roc_auc.py +++ b/tests/ignite/contrib/metrics/test_roc_auc.py @@ -64,49 +64,43 @@ def test_check_shape(): roc_auc._check_shape((torch.rand(4, 3), torch.rand(4, 3, 1))) -def test_binary_and_multilabel_inputs(): - +@pytest.fixture(params=range(8)) +def test_data_binary_and_multilabel(request): + return [ + # Binary input data of shape (N,) or (N, 1) + (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 1), + (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 1), + # updated batches + (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16), + (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16), + # Binary input data of shape (N, L) + (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 1), + (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 
7)).long(), 1), + # updated batches + (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 16), + (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 16), + ][request.param] + + +@pytest.mark.parametrize("n_times", range(5)) +def test_binary_and_multilabel_inputs(n_times, test_data_binary_and_multilabel): + y_pred, y, batch_size = test_data_binary_and_multilabel roc_auc = ROC_AUC() + roc_auc.reset() + if batch_size > 1: + n_iters = y.shape[0] // batch_size + 1 + for i in range(n_iters): + idx = i * batch_size + roc_auc.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) + else: + roc_auc.update((y_pred, y)) - def _test(y_pred, y, batch_size): - roc_auc.reset() - if batch_size > 1: - n_iters = y.shape[0] // batch_size + 1 - for i in range(n_iters): - idx = i * batch_size - roc_auc.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) - else: - roc_auc.update((y_pred, y)) + np_y = y.numpy() + np_y_pred = y_pred.numpy() - np_y = y.numpy() - np_y_pred = y_pred.numpy() - - res = roc_auc.compute() - assert isinstance(res, float) - assert roc_auc_score(np_y, np_y_pred) == pytest.approx(res) - - def get_test_cases(): - test_cases = [ - # Binary input data of shape (N,) or (N, 1) - (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 1), - (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 1), - # updated batches - (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16), - (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16), - # Binary input data of shape (N, L) - (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 1), - (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 1), - # updated batches - (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 16), - 
(torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 16), - ] - return test_cases - - for _ in range(5): - test_cases = get_test_cases() - # check multiple random inputs as random exact occurencies are rare - for y_pred, y, batch_size in test_cases: - _test(y_pred, y, batch_size) + res = roc_auc.compute() + assert isinstance(res, float) + assert roc_auc_score(np_y, np_y_pred) == pytest.approx(res) def test_check_compute_fn(): @@ -125,50 +119,46 @@ def test_check_compute_fn(): em.update(output) -def test_integration_binary_and_multilabel_inputs(): - def _test(y_pred, y, batch_size): - def update_fn(engine, batch): - idx = (engine.state.iteration - 1) * batch_size - y_true_batch = np_y[idx : idx + batch_size] - y_pred_batch = np_y_pred[idx : idx + batch_size] - return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) +@pytest.fixture(params=range(4)) +def test_data_integration_binary_and_multilabel(request): + return [ + # Binary input data of shape (N,) or (N, 1) + (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 10), + (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 10), + # Binary input data of shape (N, L) + (torch.randint(0, 2, size=(100, 3)).long(), torch.randint(0, 2, size=(100, 3)).long(), 10), + (torch.randint(0, 2, size=(100, 4)).long(), torch.randint(0, 2, size=(100, 4)).long(), 10), + ][request.param] - engine = Engine(update_fn) - roc_auc_metric = ROC_AUC() - roc_auc_metric.attach(engine, "roc_auc") +@pytest.mark.parametrize("n_times", range(5)) +def test_integration_binary_and_multilabel_inputs(n_times, test_data_integration_binary_and_multilabel): + y_pred, y, batch_size = test_data_integration_binary_and_multilabel - np_y = y.numpy() - np_y_pred = y_pred.numpy() + def update_fn(engine, batch): + idx = (engine.state.iteration - 1) * batch_size + y_true_batch = np_y[idx : idx + batch_size] + y_pred_batch = np_y_pred[idx : idx + 
batch_size] + return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) - np_roc_auc = roc_auc_score(np_y, np_y_pred) + engine = Engine(update_fn) - data = list(range(y_pred.shape[0] // batch_size)) - roc_auc = engine.run(data, max_epochs=1).metrics["roc_auc"] + roc_auc_metric = ROC_AUC() + roc_auc_metric.attach(engine, "roc_auc") - assert isinstance(roc_auc, float) - assert np_roc_auc == pytest.approx(roc_auc) + np_y = y.numpy() + np_y_pred = y_pred.numpy() - def get_test_cases(): - test_cases = [ - # Binary input data of shape (N,) or (N, 1) - (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 10), - (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 10), - # Binary input data of shape (N, L) - (torch.randint(0, 2, size=(100, 3)).long(), torch.randint(0, 2, size=(100, 3)).long(), 10), - (torch.randint(0, 2, size=(100, 4)).long(), torch.randint(0, 2, size=(100, 4)).long(), 10), - ] - return test_cases + np_roc_auc = roc_auc_score(np_y, np_y_pred) - for _ in range(5): - # check multiple random inputs as random exact occurencies are rare - test_cases = get_test_cases() - for y_pred, y, batch_size in test_cases: - _test(y_pred, y, batch_size) + data = list(range(y_pred.shape[0] // batch_size)) + roc_auc = engine.run(data, max_epochs=1).metrics["roc_auc"] + assert isinstance(roc_auc, float) + assert np_roc_auc == pytest.approx(roc_auc) -def _test_distrib_binary_and_multilabel_inputs(device): +def _test_distrib_binary_and_multilabel_inputs(device): rank = idist.get_rank() def _test(y_pred, y, batch_size, metric_device): @@ -222,7 +212,6 @@ def get_test_cases(): def _test_distrib_integration_binary_input(device): - rank = idist.get_rank() n_iters = 80 batch_size = 16 @@ -292,7 +281,6 @@ def update_fn(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def 
test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -301,7 +289,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -311,7 +298,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -323,7 +309,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -333,7 +318,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) 
_test_distrib_integration_binary_input(device) @@ -343,14 +327,12 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) def _test_distrib_xla_nprocs(index): - device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) diff --git a/tests/ignite/distributed/check_idist_parallel.py b/tests/ignite/distributed/check_idist_parallel.py index def9e798a24b..04e294cbfbeb 100644 --- a/tests/ignite/distributed/check_idist_parallel.py +++ b/tests/ignite/distributed/check_idist_parallel.py @@ -6,7 +6,6 @@ def training(local_rank, config, **kwargs): - import time time.sleep(idist.get_rank() * 0.1) diff --git a/tests/ignite/distributed/comp_models/test_horovod.py b/tests/ignite/distributed/comp_models/test_horovod.py index e795f887ed62..264813cd584b 100644 --- a/tests/ignite/distributed/comp_models/test_horovod.py +++ b/tests/ignite/distributed/comp_models/test_horovod.py @@ -18,7 +18,6 @@ def test__hvd_dist_model(): def _assert_model(model, true_conf): - if "cuda" in true_conf["device"]: assert model.device() == torch.device(f"{true_conf['device']}:{true_conf['local_rank']}") else: @@ -33,7 +32,6 @@ def _assert_model(model, true_conf): def _test__hvd_dist_model_create_from_backend_no_dist(backend, true_device): - model = _HorovodDistModel.create_from_backend(backend=backend) assert hvd.rank() > -1 @@ -54,7 +52,6 @@ def _test__hvd_dist_model_create_from_backend_no_dist(backend, true_device): def _test__hvd_dist_model_create_from_backend_dist(backend, true_device): - model = _HorovodDistModel.create_from_backend(backend=backend) assert hvd.rank() > -1 
@@ -79,7 +76,6 @@ def _test__hvd_dist_model_create_from_backend_dist(backend, true_device): def _test__hvd_dist_model_create_from_context_no_dist(true_backend, true_device): - with pytest.raises(ValueError, match=r"Horovod has not been initialized"): hvd.rank() @@ -105,7 +101,6 @@ def _test__hvd_dist_model_create_from_context_no_dist(true_backend, true_device) def _test__hvd_dist_model_create_from_context_dist(true_backend, true_device): - assert _HorovodDistModel.create_from_context() is None hvd.init() @@ -169,7 +164,6 @@ def test__hvd_dist_model_create_dist_cuda_2(gloo_hvd_executor): def _test__hvd_dist_model_warning_index_less_localrank(): - assert torch.cuda.is_available() assert _HorovodDistModel.create_from_context() is None @@ -190,7 +184,7 @@ def _test__hvd_dist_model_warning_index_less_localrank(): @pytest.mark.distributed @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs") def test__hvd_dist_model_warning_index_less_localrank(gloo_hvd_executor): - gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), num_proc=torch.cuda.device_count()) + gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), np=torch.cuda.device_count()) def _test_dist_spawn_fn(local_rank, backend, world_size, device): diff --git a/tests/ignite/distributed/comp_models/test_native.py b/tests/ignite/distributed/comp_models/test_native.py index 875fd78431e2..09e4d3054601 100644 --- a/tests/ignite/distributed/comp_models/test_native.py +++ b/tests/ignite/distributed/comp_models/test_native.py @@ -95,7 +95,6 @@ def test__native_dist_model(): @pytest.mark.skipif(not dist.is_nccl_available(), reason="Skip if nccl not available") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test__native_nccl_but_no_gpu(mock_gpu_is_not_available): - with pytest.raises(RuntimeError, match=r"Nccl backend is required but no cuda capable devices"): _NativeDistModel(backend="nccl") @@ -152,7 +151,6 
@@ def test__native_dist_model_create_from_backend_bad_slurm_config(): def _assert_model(model, true_conf): - assert model.device() == torch.device(true_conf["device"]) assert model.get_local_rank() == true_conf["local_rank"] assert model.get_rank() == true_conf["rank"] @@ -188,7 +186,6 @@ def _test__native_dist_model_create_from_backend_no_dist(backend, true_device): def _test__native_dist_model_create_from_backend_dist(init_method, local_rank, rank, world_size, backend, true_device): - import os from datetime import timedelta @@ -234,7 +231,6 @@ def _test__native_dist_model_create_from_backend_dist(init_method, local_rank, r def _test__native_dist_model_create_from_backend_slurm(local_rank, rank, world_size, backend, true_device): - import os from datetime import timedelta @@ -292,7 +288,6 @@ def _test__native_dist_model_create_from_backend_slurm(local_rank, rank, world_s def _test__native_dist_model_create_from_context_no_local_rank(): - if "LOCAL_RANK" in os.environ: del os.environ["LOCAL_RANK"] @@ -321,7 +316,6 @@ def _test__native_dist_model_create_from_context_env_local_rank(true_conf): def _test__native_dist_model_create_from_context_set_local_rank(true_conf): - from ignite.distributed.comp_models.base import ComputationModel lrank = None @@ -341,7 +335,6 @@ def _test__native_dist_model_create_from_context_set_local_rank(true_conf): def _test__native_dist_model_create_from_context_no_dist(true_backend, true_device): - assert _NativeDistModel.create_from_context() is None dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=1, rank=0) @@ -366,7 +359,6 @@ def _test__native_dist_model_create_from_context_no_dist(true_backend, true_devi def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_size, true_backend, true_device): - assert _NativeDistModel.create_from_context() is None dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=world_size, rank=rank) @@ -422,7 +414,6 @@ def 
test__native_dist_model_create_dist_gloo_1(init_method, get_fixed_dirname, l @pytest.mark.distributed def test__native_dist_model_create_dist_gloo_2(local_rank, world_size): - device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") _test__native_dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", device) _test__native_dist_model_create_from_backend_slurm(local_rank, local_rank, world_size, "gloo", device) @@ -454,7 +445,6 @@ def test__native_dist_model_create_dist_nccl_2(local_rank, world_size): @pytest.mark.distributed @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs") def test__native_dist_model_warning_index_less_localrank(local_rank, world_size): - assert _NativeDistModel.create_from_context() is None dist.init_process_group("nccl", "tcp://0.0.0.0:2222", world_size=world_size, rank=local_rank) diff --git a/tests/ignite/distributed/comp_models/test_xla.py b/tests/ignite/distributed/comp_models/test_xla.py index 001a7741b596..6352895833d0 100644 --- a/tests/ignite/distributed/comp_models/test_xla.py +++ b/tests/ignite/distributed/comp_models/test_xla.py @@ -59,7 +59,6 @@ def test__xla_dist_model_spawn_n_procs(): def _assert_model(model, true_conf): - assert model.device() == true_conf["device"] assert model.get_local_rank() == true_conf["local_rank"] assert model.get_rank() == true_conf["rank"] diff --git a/tests/ignite/distributed/test_auto.py b/tests/ignite/distributed/test_auto.py index 60c1c12d5788..cde9892b8dec 100644 --- a/tests/ignite/distributed/test_auto.py +++ b/tests/ignite/distributed/test_auto.py @@ -180,7 +180,6 @@ def _test_auto_model_optimizer(ws, device): def test_auto_methods_no_dist(): - _test_auto_dataloader(1, 1, batch_size=1) _test_auto_dataloader(1, 1, batch_size=10, num_workers=2) _test_auto_dataloader(1, 1, batch_size=10, sampler_name="WeightedRandomSampler") @@ -192,7 +191,6 @@ def test_auto_methods_no_dist(): @pytest.mark.distributed 
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_auto_methods_gloo(distributed_context_single_node_gloo): - ws = distributed_context_single_node_gloo["world_size"] _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1) _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2) @@ -217,7 +215,6 @@ def test_auto_methods_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_auto_methods_nccl(distributed_context_single_node_nccl): - ws = distributed_context_single_node_nccl["world_size"] _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1) _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=10) @@ -236,7 +233,6 @@ def test_auto_methods_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_auto_methods_hvd(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -249,10 +245,8 @@ def test_auto_methods_hvd(gloo_hvd_executor): def _test_auto_methods_xla(index, ws): - dl_type = DataLoader if ws > 1: - from ignite.distributed.auto import _MpDeviceLoader dl_type = _MpDeviceLoader @@ -288,7 +282,6 @@ def test_auto_methods_xla(): def test_dist_proxy_sampler(): - weights = torch.ones(100) weights[:50] += 1 num_samples = 200 diff --git a/tests/ignite/distributed/test_launcher.py b/tests/ignite/distributed/test_launcher.py index e058988658a8..04e1e20b7c07 100644 --- a/tests/ignite/distributed/test_launcher.py +++ b/tests/ignite/distributed/test_launcher.py @@ -42,7 +42,6 @@ def exec_filepath(): def execute(cmd, env=None): - import ignite env = dict(os.environ) if 
env is None else env @@ -268,7 +267,6 @@ def test_idist_parallel_no_dist(): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars") @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package") def test_idist_parallel_spawn_params_xla(): - res = idist.Parallel._setup_spawn_params( nproc_per_node=8, nnodes=None, node_rank=None, master_addr=None, master_port=None, start_method="fork" ) diff --git a/tests/ignite/distributed/utils/__init__.py b/tests/ignite/distributed/utils/__init__.py index 65498f0afe59..7845f0cd1ce0 100644 --- a/tests/ignite/distributed/utils/__init__.py +++ b/tests/ignite/distributed/utils/__init__.py @@ -72,7 +72,6 @@ def _test_distrib__get_max_length(device): def _test_distrib_all_reduce(device): - res = idist.all_reduce(10) assert res == 10 * idist.get_world_size() @@ -120,7 +119,6 @@ def _test_distrib_all_reduce(device): def _test_distrib_all_reduce_group(device): - if idist.get_world_size() > 1 and idist.backend() is not None: ranks = [0, 1] rank = idist.get_rank() @@ -157,51 +155,71 @@ def _test_distrib_all_reduce_group(device): def _test_distrib_all_gather(device): + rank = idist.get_rank() + ws = idist.get_world_size() res = torch.tensor(idist.all_gather(10), device=device) - true_res = torch.tensor([10] * idist.get_world_size(), device=device) + true_res = torch.tensor([10] * ws, device=device) assert (res == true_res).all() - t = torch.tensor(idist.get_rank(), device=device) + t = torch.tensor(rank, device=device) res = idist.all_gather(t) - true_res = torch.tensor([i for i in range(idist.get_world_size())], device=device) + true_res = torch.tensor([i for i in range(ws)], device=device) assert (res == true_res).all() x = "test-test" - if idist.get_rank() == 0: + if rank == 0: x = "abc" res = idist.all_gather(x) - true_res = ["abc"] + ["test-test"] * (idist.get_world_size() - 1) + true_res = ["abc"] + ["test-test"] * (ws - 1) assert res == true_res base_x = 
"tests/ignite/distributed/utils/test_native.py" * 2000 x = base_x - if idist.get_rank() == 0: + if rank == 0: x = "abc" res = idist.all_gather(x) - true_res = ["abc"] + [base_x] * (idist.get_world_size() - 1) + true_res = ["abc"] + [base_x] * (ws - 1) assert res == true_res - t = torch.arange(100, device=device).reshape(4, 25) * (idist.get_rank() + 1) + t = torch.arange(100, device=device).reshape(4, 25) * (rank + 1) in_dtype = t.dtype res = idist.all_gather(t) - assert res.shape == (idist.get_world_size() * 4, 25) + assert res.shape == (ws * 4, 25) assert res.dtype == in_dtype - true_res = torch.zeros(idist.get_world_size() * 4, 25, device=device) - for i in range(idist.get_world_size()): + true_res = torch.zeros(ws * 4, 25, device=device) + for i in range(ws): true_res[i * 4 : (i + 1) * 4, ...] = torch.arange(100, device=device).reshape(4, 25) * (i + 1) assert (res == true_res).all() - if idist.get_world_size() > 1: - with pytest.raises(TypeError, match=r"Unhandled input type"): - idist.all_reduce([0, 1, 2]) + if ws > 1 and idist.backend() != "xla-tpu": + t = { + "a": [rank + 1, rank + 2, torch.tensor(rank + 3, device=device)], + "b": torch.tensor([[rank + 1, rank + 2, rank + 3]], device=device), + "c": {"abcd": rank, "cdfg": torch.tensor(rank, dtype=torch.uint8, device=device)}, + } + res = idist.all_gather(t) + assert isinstance(res, list) and len(res) == ws + for i, obj in enumerate(res): + assert isinstance(obj, dict) + assert list(obj.keys()) == ["a", "b", "c"], obj + expected_device = ( + device if torch.device(device).type == "cpu" else torch.device(f"{torch.device(device).type}:{i}") + ) + expected = { + "a": [i + 1, i + 2, torch.tensor(i + 3, device=expected_device)], + "b": torch.tensor([[i + 1, i + 2, i + 3]], device=expected_device), + "c": {"abcd": i, "cdfg": torch.tensor(i, dtype=torch.uint8, device=expected_device)}, + } + assert obj["a"] == expected["a"] + assert (obj["b"] == expected["b"]).all() + assert obj["c"] == expected["c"] def 
_test_distrib_all_gather_group(device): - if idist.get_world_size() > 1: - ranks = [0, 1] + ranks = list(range(idist.get_world_size() - 1, 0, -1)) # [0, 1, 2, 3] -> [3, 2, 1] rank = idist.get_rank() bnd = idist.backend() @@ -212,7 +230,10 @@ def _test_distrib_all_gather_group(device): res = idist.all_gather(t, group=group) else: res = idist.all_gather(t, group=group) - assert torch.equal(res, torch.tensor(ranks, device=device)) + if rank in ranks: + assert torch.equal(res, torch.tensor(ranks, device=device)) + else: + assert res == t t = torch.tensor([rank], device=device) if bnd in ("horovod"): @@ -220,9 +241,44 @@ def _test_distrib_all_gather_group(device): res = idist.all_gather(t, group=ranks) else: res = idist.all_gather(t, group=ranks) - assert torch.equal(res, torch.tensor(ranks, device=device)) - - ranks = "abc" + if rank in ranks: + assert torch.equal(res, torch.tensor(ranks, device=device)) + else: + assert res == t + + t = { + "a": [rank + 1, rank + 2, torch.tensor(rank + 3, device=device)], + "b": torch.tensor([[rank + 1, rank + 2, rank + 3]], device=device), + "c": {"abcd": rank, "cdfg": torch.tensor(rank, dtype=torch.uint8, device=device)}, + } + if bnd in ("xla-tpu"): + with pytest.raises(NotImplementedError, match=r"all_gather on object is not implemented for xla"): + res = idist.all_gather(t, group=ranks) + elif bnd in ("horovod"): + with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"): + res = idist.all_gather(t, group=ranks) + else: + res = idist.all_gather(t, group=ranks) + if rank in ranks: + assert isinstance(res, list) and len(res) == len(ranks) + for i, obj in zip(ranks, res): + assert isinstance(obj, dict) + assert list(obj.keys()) == ["a", "b", "c"], obj + expected_device = ( + device + if torch.device(device).type == "cpu" + else torch.device(f"{torch.device(device).type}:{i}") + ) + expected = { + "a": [i + 1, i + 2, torch.tensor(i + 3, device=expected_device)], + "b": torch.tensor([[i + 
1, i + 2, i + 3]], device=expected_device), + "c": {"abcd": i, "cdfg": torch.tensor(i, dtype=torch.uint8, device=expected_device)}, + } + assert obj["a"] == expected["a"], (obj, expected) + assert (obj["b"] == expected["b"]).all(), (obj, expected) + assert obj["c"] == expected["c"], (obj, expected) + else: + assert res == t if bnd in ("nccl", "gloo", "mpi"): with pytest.raises(ValueError, match=r"Argument group should be list of int or ProcessGroup"): @@ -236,13 +292,11 @@ def _test_distrib_all_gather_group(device): def _test_distrib_broadcast(device): - rank = idist.get_rank() ws = idist.get_world_size() def _test(data_src, data_others, safe_mode): for src in range(ws): - data = data_src if rank == src else data_others res = idist.broadcast(data, src=src, safe_mode=safe_mode) @@ -290,7 +344,6 @@ def _test(data_src, data_others, safe_mode): def _test_distrib_barrier(device): - t = torch.tensor([idist.get_rank()], device=device, dtype=torch.float) true_res = sum([i for i in range(idist.get_world_size())]) @@ -303,12 +356,10 @@ def _test_distrib_barrier(device): def _test_distrib_new_group(device): - if idist.get_world_size() > 1 and idist.backend() is not None: bnd = idist.backend() ranks = [0, 1] if idist.has_native_dist_support and bnd in ("nccl", "gloo", "mpi"): - g1 = idist.new_group(ranks) g2 = dist.new_group(ranks) @@ -316,7 +367,6 @@ def _test_distrib_new_group(device): if rank in ranks: assert g1.rank() == g2.rank() elif idist.has_xla_support and bnd in ("xla-tpu"): - assert idist.new_group(ranks) == [ranks] elif idist.has_hvd_support and bnd in ("horovod"): from horovod.common.process_sets import ProcessSet diff --git a/tests/ignite/distributed/utils/test_horovod.py b/tests/ignite/distributed/utils/test_horovod.py index fa6c77f81cc1..ead6ed4c330e 100644 --- a/tests/ignite/distributed/utils/test_horovod.py +++ b/tests/ignite/distributed/utils/test_horovod.py @@ -131,7 +131,6 @@ def _test_idist_methods_in_hvd_context(backend, device): @pytest.mark.skipif(not 
has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist_methods_in_hvd_context(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_idist_methods_in_hvd_context, ("horovod", device), np=np) @@ -141,7 +140,6 @@ def test_idist_methods_in_hvd_context(gloo_hvd_executor): @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist_all_reduce_hvd(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_all_reduce, (device,), np=np, do_init=True) @@ -152,7 +150,6 @@ def test_idist_all_reduce_hvd(gloo_hvd_executor): @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist__model_methods_hvd(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib__get_max_length, (device,), np=np, do_init=True) @@ -162,7 +159,6 @@ def test_idist__model_methods_hvd(gloo_hvd_executor): @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist_all_gather_hvd(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_all_gather, (device,), np=np, do_init=True) @@ -173,7 +169,6 @@ def 
test_idist_all_gather_hvd(gloo_hvd_executor): @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist_broadcast_hvd(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_broadcast, (device,), np=np, do_init=True) @@ -183,7 +178,6 @@ def test_idist_broadcast_hvd(gloo_hvd_executor): @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist_barrier_hvd(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_barrier, (device,), np=np, do_init=True) @@ -193,7 +187,6 @@ def test_idist_barrier_hvd(gloo_hvd_executor): @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist_new_group_hvd(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_new_group, (device,), np=np, do_init=True) diff --git a/tests/ignite/distributed/utils/test_native.py b/tests/ignite/distributed/utils/test_native.py index b1d885da4e40..fda3e1126ccb 100644 --- a/tests/ignite/distributed/utils/test_native.py +++ b/tests/ignite/distributed/utils/test_native.py @@ -3,6 +3,7 @@ import pytest import torch import torch.distributed as dist +from packaging.version import Version import ignite.distributed as idist from ignite.distributed.utils import has_native_dist_support @@ -37,7 +38,6 @@ def _test_native_distrib_single_node_launch_tool(backend, 
device, local_rank, wo @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"]) def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirname, local_rank, world_size): - from datetime import timedelta timeout = timedelta(seconds=20) @@ -56,7 +56,6 @@ def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirn @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"]) def test_native_distrib_single_node_launch_tool_nccl(init_method, get_fixed_dirname, local_rank, world_size): - if init_method == "FILE": init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_nccl')}/shared" @@ -81,7 +80,6 @@ def _test_native_distrib_single_node_spawn(init_method, backend, device, **kwarg @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"]) def test_native_distrib_single_node_spawn_gloo(init_method, dirname): - from datetime import timedelta timeout = timedelta(seconds=20) @@ -190,7 +188,6 @@ def _test_idist_methods_in_native_context_set_local_rank(backend, device, local_ @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_methods_in_native_gloo_context_set_local_rank(distributed_context_single_node_gloo): - local_rank = distributed_context_single_node_gloo["local_rank"] device = idist.device() _test_idist_methods_in_native_context_set_local_rank("gloo", device, local_rank) @@ -209,7 +206,6 @@ def test_idist_methods_in_native_nccl_context_set_local_rank(distributed_context @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip 
if no GPU") def test_idist__model_methods_nccl(distributed_context_single_node_nccl): - device = idist.device() _test_distrib__get_max_length(device) @@ -217,7 +213,6 @@ def test_idist__model_methods_nccl(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist__model_methods_gloo(distributed_context_single_node_gloo): - device = idist.device() _test_distrib__get_max_length(device) @@ -226,7 +221,6 @@ def test_idist__model_methods_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_all_reduce_nccl(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_all_reduce(device) _test_distrib_all_reduce_group(device) @@ -235,7 +229,6 @@ def test_idist_all_reduce_nccl(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_all_reduce_gloo(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_all_reduce(device) _test_distrib_all_reduce_group(device) @@ -244,8 +237,8 @@ def test_idist_all_reduce_gloo(distributed_context_single_node_gloo): @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.7.0"), reason="dist.all_gather_object is not implemented") def test_idist_all_gather_nccl(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_all_gather(device) _test_distrib_all_gather_group(device) @@ -253,8 +246,8 @@ def test_idist_all_gather_nccl(distributed_context_single_node_nccl): @pytest.mark.distributed 
@pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif(Version(torch.__version__) < Version("1.7.0"), reason="dist.all_gather_object is not implemented") def test_idist_all_gather_gloo(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_all_gather(device) _test_distrib_all_gather_group(device) @@ -264,7 +257,6 @@ def test_idist_all_gather_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_broadcast_nccl(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_broadcast(device) @@ -272,7 +264,6 @@ def test_idist_broadcast_nccl(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_broadcast_gloo(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_broadcast(device) @@ -281,7 +272,6 @@ def test_idist_broadcast_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_barrier_nccl(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_barrier(device) @@ -289,7 +279,6 @@ def test_idist_barrier_nccl(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_barrier_gloo(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_barrier(device) @@ -356,7 +345,6 @@ def test_idist_methods_overhead_nccl(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist 
support") def test_idist_one_rank_only_gloo(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_one_rank_only(device=device) _test_distrib_one_rank_only_with_engine(device=device) @@ -366,7 +354,48 @@ def test_idist_one_rank_only_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_one_rank_only_nccl(local_rank, distributed_context_single_node_nccl): - device = idist.device() _test_distrib_one_rank_only(device=device) _test_distrib_one_rank_only_with_engine(device=device) + + +@pytest.mark.distributed +@pytest.mark.parametrize("rank", range(int(os.environ.get("WORLD_SIZE", 1)))) +@pytest.mark.parametrize("local", [True, False]) +def test_one_rank_first(distributed, get_rank_zero_dirname, rank, local): + def get_ds(file_path): + rank = idist.get_local_rank() if local else idist.get_rank() + if not file_path.exists(): + with open(file_path, "w") as f: + f.write("readed") + return f"{rank} not readed" + else: + return f"{rank} readed" + + folder = get_rank_zero_dirname() + file_path = folder / "res.txt" + + with idist.one_rank_first(rank, local=local): + x = get_ds(file_path) + + output = idist.all_gather(x) + + if local: + expected = [ + f"{x} not readed" if x == rank else f"{x} readed" for x in range(idist.get_nproc_per_node()) + ] * idist.get_nnodes() + else: + expected = [f"{x} not readed" if x == rank else f"{x} readed" for x in range(idist.get_world_size())] + + print("expected:", expected, idist.get_nnodes()) + assert set(expected) == set(output) + + +@pytest.mark.distributed +def test_one_rank_first_asserts(): + rank = 100 + with pytest.raises( + ValueError, match=f"rank should be between 0 and {idist.get_world_size() - 1}, but given {rank}" + ): + with idist.one_rank_first(rank): + pass diff --git a/tests/ignite/distributed/utils/test_serial.py 
b/tests/ignite/distributed/utils/test_serial.py index afae86b78f39..1fee2bb8ce1d 100644 --- a/tests/ignite/distributed/utils/test_serial.py +++ b/tests/ignite/distributed/utils/test_serial.py @@ -14,7 +14,6 @@ def test_no_distrib(capsys): - assert idist.backend() is None if torch.cuda.is_available(): assert idist.device().type == "cuda" diff --git a/tests/ignite/distributed/utils/test_xla.py b/tests/ignite/distributed/utils/test_xla.py index 281e1ba50d81..bb109eacdea9 100644 --- a/tests/ignite/distributed/utils/test_xla.py +++ b/tests/ignite/distributed/utils/test_xla.py @@ -148,7 +148,6 @@ def test_idist_new_group_xla(): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package") def test_idist_all_gather_xla(): - device = idist.device() _test_distrib_all_gather(device) _test_distrib_all_gather_group(device) @@ -172,7 +171,6 @@ def test_idist_all_gather_xla_in_child_proc(xmp_executor): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package") def test_idist_broadcast_xla(): - device = idist.device() _test_distrib_broadcast(device) @@ -194,7 +192,6 @@ def test_idist_broadcast_xla_in_child_proc(xmp_executor): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package") def test_idist_barrier_xla(): - device = idist.device() _test_distrib_barrier(device) @@ -216,7 +213,6 @@ def test_idist_barrier_xla_in_child_proc(xmp_executor): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package") def test_idist_one_rank_only_xla(): - device = idist.device() 
_test_distrib_one_rank_only(device=device) _test_distrib_one_rank_only_with_engine(device=device) diff --git a/tests/ignite/engine/__init__.py b/tests/ignite/engine/__init__.py index d863b60b67ef..98059e98518c 100644 --- a/tests/ignite/engine/__init__.py +++ b/tests/ignite/engine/__init__.py @@ -5,7 +5,6 @@ except ImportError: class IterableDataset: - pass diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py index fa9681df81e0..8d001a8d2cc1 100644 --- a/tests/ignite/engine/test_create_supervised.py +++ b/tests/ignite/engine/test_create_supervised.py @@ -31,11 +31,13 @@ def __init__(self, output_as_list=False): self.output_as_list = output_as_list self.fc = torch.nn.Linear(1, 1, bias=False) - def forward(self, x): + def forward(self, x, bias=None): + if bias is None: + bias = 0.0 if self.output_as_list: - return self.fc(x), self.fc(x) + return self.fc(x) + bias, self.fc(x) + bias - return self.fc(x) + return self.fc(x) + bias def _default_create_supervised_trainer( @@ -46,6 +48,7 @@ def _default_create_supervised_trainer( amp_mode: str = None, scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False, with_model_transform: bool = False, + with_model_fn: bool = False, ): if with_model_transform: @@ -65,8 +68,8 @@ def get_first_element(output): optimizer = SGD(model.parameters(), 0.1) if trace: - example_input = torch.randn(1) - model = torch.jit.trace(model, example_input) + example_inputs = (torch.randn(1), torch.randn(1)) if with_model_fn else torch.randn(1) + model = torch.jit.trace(model, example_inputs) if amp_mode == "apex" and model_device == trainer_device == "cuda": from apex import amp @@ -83,6 +86,9 @@ def get_first_element(output): scaler=scaler, gradient_accumulation_steps=gradient_accumulation_steps, model_transform=model_transform if model_transform is not None else lambda x: x, + model_fn=(lambda model, x: model(x, torch.tensor([0.01], device=model_device))) + if with_model_fn + else (lambda model, 
x: model(x)), ) assert model.fc.weight.data[0, 0].item() == approx(0.0) return trainer, model @@ -96,6 +102,7 @@ def _test_create_supervised_trainer( amp_mode: str = None, scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False, with_model_transform: bool = False, + with_model_fn: bool = False, ): trainer, model = _default_create_supervised_trainer( gradient_accumulation_steps=gradient_accumulation_steps, @@ -105,10 +112,13 @@ def _test_create_supervised_trainer( amp_mode=amp_mode, scaler=scaler, with_model_transform=with_model_transform, + with_model_fn=with_model_fn, ) x = torch.tensor([[0.01], [0.02], [0.03], [0.04], [0.05]]) y = torch.tensor([[0.015], [0.025], [0.035], [0.045], [0.055]]) + if with_model_fn: + y += 0.01 data = [(_x, _y) for _x, _y in zip(x, y)] theta = [0.0] @@ -120,12 +130,14 @@ def _(): assert model.fc.weight.grad != 0 _x, _y = trainer.state.batch _x, _y = _x.to(model_device), _y.to(model_device) - accumulation[0] += 0.2 * _x.item() * (theta[0] * _x.item() - _y.item()) + bias = 0.01 if with_model_fn else 0.0 + accumulation[0] += 0.2 * _x.item() * (theta[0] * _x.item() - (_y.item() - bias)) # value of loss should not be accumulated + _y_pred = model(_x, torch.tensor([bias], device=model_device)) if with_model_fn else model(_x) if with_model_transform: - loss[0] = mse_loss(model(_x)[0], _y).item() - else: - loss[0] = mse_loss(model(_x), _y).item() + _y_pred = _y_pred[0] + + loss[0] = mse_loss(_y_pred, _y).item() @trainer.on(Events.ITERATION_COMPLETED(every=gradient_accumulation_steps)) def _(): @@ -135,7 +147,6 @@ def _(): accumulation[0] = loss[0] = 0.0 if model_device == trainer_device or ((model_device == "cpu") ^ (trainer_device == "cpu")): - state = trainer.run(data) if amp_mode == "amp": @@ -154,7 +165,6 @@ def _(): @pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0") def test_create_supervised_training_scalar_assignment(): - with mock.patch("ignite.engine._check_arg") as check_arg_mock: 
check_arg_mock.return_value = None, torch.cuda.amp.GradScaler(enabled=False) trainer, _ = _default_create_supervised_trainer(model_device="cpu", trainer_device="cpu", scaler=True) @@ -173,7 +183,6 @@ def _test_create_mocked_supervised_trainer( with mock.patch("ignite.engine.supervised_training_step_apex") as training_step_apex_mock: with mock.patch("ignite.engine.supervised_training_step_tpu") as training_step_tpu_mock: with mock.patch("ignite.engine.supervised_training_step") as training_step_mock: - trainer, _ = _default_create_supervised_trainer( model_device=model_device, trainer_device=trainer_device, @@ -221,6 +230,7 @@ def _default_create_supervised_evaluator( trace: bool = False, amp_mode: str = None, with_model_transform: bool = False, + with_model_fn: bool = False, ): if with_model_transform: @@ -239,14 +249,17 @@ def get_first_element(output): model.fc.weight.data.zero_() if trace: - example_input = torch.randn(1, 1) - model = torch.jit.trace(model, example_input) + example_inputs = (torch.randn(1), torch.randn(1)) if with_model_fn else torch.randn(1) + model = torch.jit.trace(model, example_inputs) evaluator = create_supervised_evaluator( model, device=evaluator_device, amp_mode=amp_mode, model_transform=model_transform if model_transform is not None else lambda x: x, + model_fn=(lambda model, x: model(x, torch.tensor([0.01], device=model_device))) + if with_model_fn + else (lambda model, x: model(x)), ) assert model.fc.weight.data[0, 0].item() == approx(0.0) @@ -260,6 +273,7 @@ def _test_create_supervised_evaluator( trace: bool = False, amp_mode: str = None, with_model_transform: bool = False, + with_model_fn: bool = False, ): model, evaluator = _default_create_supervised_evaluator( model_device=model_device, @@ -267,16 +281,21 @@ def _test_create_supervised_evaluator( trace=trace, amp_mode=amp_mode, with_model_transform=with_model_transform, + with_model_fn=with_model_fn, ) x = torch.tensor([[1.0], [2.0]]) y = torch.tensor([[3.0], [5.0]]) + if 
with_model_fn: + y += 0.01 data = [(x, y)] if model_device == evaluator_device or ((model_device == "cpu") ^ (evaluator_device == "cpu")): state = evaluator.run(data) y_pred, y = state.output - + if with_model_fn: + y_pred -= 0.01 + y -= 0.01 assert y_pred[0, 0].item() == approx(0.0) assert y_pred[1, 0].item() == approx(0.0) assert y[0, 0].item() == approx(3.0) @@ -325,7 +344,6 @@ def _test_create_evaluation_step_amp( trace: bool = False, amp_mode: str = None, ): - output_transform_mock = MagicMock() model = DummyModel() @@ -396,6 +414,7 @@ def test_create_supervised_trainer(trainer_device, trace): _test_create_supervised_trainer(gradient_accumulation_steps=1, trainer_device=trainer_device, trace=trace) _test_create_supervised_trainer(gradient_accumulation_steps=3, trainer_device=trainer_device, trace=trace) _test_create_supervised_trainer(with_model_transform=True, trainer_device=trainer_device, trace=trace) + _test_create_supervised_trainer(with_model_fn=True, trainer_device=trainer_device, trace=trace) _test_create_mocked_supervised_trainer(trainer_device=trainer_device, trace=trace) @@ -580,6 +599,8 @@ def test_create_supervised_trainer_on_cuda_with_model_on_cpu(): def test_create_supervised_evaluator(): _test_create_supervised_evaluator() + _test_create_supervised_evaluator(with_model_transform=True) + _test_create_supervised_evaluator(with_model_fn=True) _test_mocked_supervised_evaluator() # older versions didn't have the autocast method so we skip the test for older builds diff --git a/tests/ignite/engine/test_custom_events.py b/tests/ignite/engine/test_custom_events.py index 3a19904a45f9..c4400396bf19 100644 --- a/tests/ignite/engine/test_custom_events.py +++ b/tests/ignite/engine/test_custom_events.py @@ -6,7 +6,24 @@ import ignite.distributed as idist from ignite.engine import Engine, Events -from ignite.engine.events import CallableEventWithFilter, EventEnum, EventsList +from ignite.engine.events import CallableEvents, CallableEventWithFilter, EventEnum, 
EventsList + + +def test_deprecated_callable_events_class(): + engine = Engine(lambda engine, batch: 0) + + with pytest.warns(DeprecationWarning, match=r"Class ignite\.engine\.events\.CallableEvents is deprecated"): + + class CustomEvents(CallableEvents, Enum): + TEST_EVENT = "test_event" + + def __new__(cls, value: str) -> "CallableEvents": + obj = CallableEvents.__new__(cls) + obj._value_ = value + return obj + + with pytest.raises(TypeError, match=r"Value at \d of event_names should be a str or EventEnum"): + engine.register_events(*CustomEvents) def test_custom_events(): @@ -234,7 +251,6 @@ def ef(e, i): ], ) def test_callable_events(event): - assert isinstance(event.value, str) def foo(engine, _): @@ -296,7 +312,6 @@ def bar(e): def test_remove_event_handler_on_callable_events(): - engine = Engine(lambda e, b: 1) def foo(e): @@ -324,11 +339,9 @@ def bar(e): def _test_every_event_filter_with_engine(device="cpu"): - data = torch.rand(100, 4, device=device) def _test(event_name, event_attr, every, true_num_calls): - engine = Engine(lambda e, b: b) counter = [0] @@ -382,7 +395,6 @@ def test_every_event_filter_with_engine(): ], ) def test_before_event_filter_with_engine(event_name, event_attr, before, expect_calls): - data = range(100) engine = Engine(lambda e, b: 1) @@ -410,7 +422,6 @@ def _before_event(): ], ) def test_after_event_filter_with_engine(event_name, event_attr, after, expect_calls): - data = range(100) engine = Engine(lambda e, b: 1) @@ -431,7 +442,6 @@ def _after_event(): [(Events.ITERATION_STARTED, "iteration", 300, 100, 199), (Events.EPOCH_COMPLETED, "epoch", 4, 1, 2)], ) def test_before_and_after_event_filter_with_engine(event_name, event_attr, before, after, expect_calls): - data = range(100) engine = Engine(lambda e, b: 1) @@ -452,7 +462,6 @@ def _before_and_after_event(): [(Events.ITERATION_STARTED, "iteration", 5, 25, 8, 4), (Events.EPOCH_COMPLETED, "epoch", 2, 5, 1, 2)], ) def test_every_before_and_after_event_filter_with_engine(event_name, 
event_attr, every, before, after, expect_calls): - data = range(100) engine = Engine(lambda e, b: 1) @@ -484,7 +493,6 @@ def _every_before_and_after_event(): ], ) def test_once_event_filter(event_name, event_attr, once, expect_calls): - data = list(range(100)) engine = Engine(lambda e, b: b) @@ -508,7 +516,6 @@ def assert_(engine): def test_custom_event_filter_with_engine(): - special_events = [1, 2, 5, 7, 17, 20] def custom_event_filter(engine, event): @@ -517,7 +524,6 @@ def custom_event_filter(engine, event): return False def _test(event_name, event_attr, true_num_calls): - engine = Engine(lambda e, b: b) num_calls = [0] @@ -539,7 +545,6 @@ def assert_on_special_event(engine): def test_callable_event_bad_behaviour(): - special_events = [1, 2, 5, 7, 17, 20] def custom_event_filter(engine, event): @@ -660,7 +665,6 @@ def test_every_event_filter_with_engine_with_dataloader(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_every_event_filter_with_engine(device) _test_every_event_filter_with_engine_with_dataloader(device) @@ -670,14 +674,12 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_every_event_filter_with_engine(device) _test_every_event_filter_with_engine_with_dataloader(device) def test_event_list(): - e1 = Events.ITERATION_STARTED(once=1) e2 = Events.ITERATION_STARTED(every=3) e3 = Events.COMPLETED @@ -693,7 +695,6 @@ def test_event_list(): def test_list_of_events(): def _test(event_list, true_iterations): - engine = Engine(lambda e, b: b) iterations = [] diff --git 
a/tests/ignite/engine/test_deterministic.py b/tests/ignite/engine/test_deterministic.py index 36a4a4371673..b2f62dfa111e 100644 --- a/tests/ignite/engine/test_deterministic.py +++ b/tests/ignite/engine/test_deterministic.py @@ -95,7 +95,6 @@ def test_reproducible_batch_sampler_wrong_input(): def test_reproducible_batch_sampler(): - data = list(range(100)) dataloader = DataLoader(data, batch_size=12, num_workers=0, shuffle=True, drop_last=True) @@ -125,7 +124,6 @@ def test_reproducible_batch_sampler(): def _test_keep_random_state(with_numpy): - manual_seed(54) true_values = [] for _ in range(5): @@ -175,7 +173,6 @@ def test_keep_random_state_without_numpy(): def test_strict_resume_from_iter(): def _test(epoch_length=None): - max_epochs = 5 num_iters = 21 torch.manual_seed(0) @@ -244,7 +241,6 @@ def update_fn(_, batch): def _test_resume_random_dataloader_from_epoch(device, _setup_sampler, sampler_type=None): def _test(epoch_length=None): - max_epochs = 5 total_batch_size = 4 num_iters = 21 @@ -255,7 +251,6 @@ def _test(epoch_length=None): epoch_length = num_iters for resume_epoch in range(1, max_epochs, 2): - for num_workers in [0, 2]: sampler, batch_size = _setup_sampler(sampler_type, num_iters, total_batch_size) @@ -361,9 +356,7 @@ def _test(epoch_length=None): epoch_length = num_iters for resume_iteration in range(2, min(num_iters * max_epochs, epoch_length * max_epochs), 13): - for num_workers in [0, 2]: - sampler, batch_size = _setup_sampler(sampler_type, num_iters, total_batch_size) orig_dataloader = DataLoader( data, @@ -513,7 +506,6 @@ def infinite_data_iterator(): epoch_length = num_iters for resume_iteration in range(1, min(num_iters * max_epochs, epoch_length * max_epochs), 7): - seen_batchs = [] def update_fn(_, batch): @@ -557,7 +549,6 @@ def test_resume_random_data_iterator_from_iter(): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no 
GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @@ -566,7 +557,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @@ -577,7 +567,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @@ -587,14 +576,12 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") def test_concepts_snippet_resume(): - # Commented 
imports required in the snippet # import torch # from torch.utils.data import DataLoader @@ -659,7 +646,6 @@ def user_handler(_): def _test_gradients_on_resume( dirname, device, with_dropout=True, with_dataaugs=True, data_size=24, batch_size=4, save_iter=None, save_epoch=None ): - debug = False def random_train_data_loader(size): diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index 994eb49f72bb..c09d13d88ee3 100644 --- a/tests/ignite/engine/test_engine.py +++ b/tests/ignite/engine/test_engine.py @@ -437,7 +437,6 @@ def update_fn(_, batch): _test(data) def test_state_repr(self): - data = [0, 1, 2, 3, 4, 5] max_epochs = 1 metrics = {"accuracy": Mock()} @@ -452,7 +451,6 @@ def test_state_repr(self): assert "batch" in s def test_alter_batch(self): - small_shape = (1, 2, 2) large_shape = (1, 3, 3) @@ -520,6 +518,9 @@ def test_run_asserts(self): with pytest.raises(ValueError, match=r"Input data has zero size. Please provide non-empty data"): engine.run([]) + with pytest.warns(UserWarning, match="Argument seed is deprecated"): + engine.run([0, 1, 2, 3, 4], seed=1234) + def test_state_get_event_attrib_value(self): state = State() state.iteration = 10 @@ -1026,47 +1027,6 @@ def switch_dataloader(): trainer.run(data1, max_epochs=10) - def test_run_with_max_iters(self): - max_iters = 8 - engine = Engine(lambda e, b: 1) - engine.run([0] * 20, max_iters=max_iters) - assert engine.state.iteration == max_iters - assert engine.state.max_iters == max_iters - - def test_run_with_max_iters_greater_than_epoch_length(self): - max_iters = 73 - engine = Engine(lambda e, b: 1) - engine.run([0] * 20, max_iters=max_iters) - assert engine.state.iteration == max_iters - - def test_run_with_invalid_max_iters_and_max_epoch(self): - max_iters = 12 - max_epochs = 2 - engine = Engine(lambda e, b: 1) - with pytest.raises( - ValueError, - match=r"Arguments max_iters and max_epochs are mutually exclusive." 
- "Please provide only max_epochs or max_iters.", - ): - engine.run([0] * 20, max_iters=max_iters, max_epochs=max_epochs) - - def test_epoch_events_fired_max_iters(self): - max_iters = 32 - engine = Engine(lambda e, b: 1) - - @engine.on(Events.EPOCH_COMPLETED) - def fired_event(engine): - assert engine.state.iteration % engine.state.epoch_length == 0 - - engine.run([0] * 10, max_iters=max_iters) - - def test_is_done_with_max_iters(self): - state = State(iteration=100, epoch=1, max_epochs=3, epoch_length=100, max_iters=250) - assert not Engine._is_done(state) - - state = State(iteration=250, epoch=1, max_epochs=3, epoch_length=100, max_iters=250) - assert Engine._is_done(state) - @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_batch_is_released_before_new_one_is_loaded_on_cuda(self): torch.cuda.empty_cache() diff --git a/tests/ignite/engine/test_engine_state_dict.py b/tests/ignite/engine/test_engine_state_dict.py index ce8e5aba0d48..4ccfb7ea7720 100644 --- a/tests/ignite/engine/test_engine_state_dict.py +++ b/tests/ignite/engine/test_engine_state_dict.py @@ -131,7 +131,6 @@ def test_load_state_dict_integration(): def test_load_state_dict_with_params_overriding_integration(): - state_dict = {"max_epochs": 100, "epoch_length": 120, "epoch": 5} data = range(120) @@ -205,7 +204,6 @@ def save_engine(_): def test_epoch_length(): def _test(data, max_epochs, num_iters): - batch_checker = BatchChecker(data) def update_fn(_, batch): @@ -219,7 +217,6 @@ def update_fn(_, batch): assert engine.state.epoch == max_epochs def _test_as_iter(data, max_epochs, num_iters): - batch_checker = BatchChecker(data) def update_fn(_, batch): diff --git a/tests/ignite/engine/test_event_handlers.py b/tests/ignite/engine/test_event_handlers.py index d3f4625604e4..1d0d71646a87 100644 --- a/tests/ignite/engine/test_event_handlers.py +++ b/tests/ignite/engine/test_event_handlers.py @@ -139,7 +139,6 @@ def test_adding_multiple_event_handlers(): ], ) def 
test_event_removable_handle(event1, event2): - # Removable handle removes event from engine. engine = Engine(lambda e, b: None) handler = create_autospec(spec=lambda x: None) @@ -227,7 +226,6 @@ def _handler(_): def test_events_list_removable_handle(): - # Removable handle removes event from engine. engine = DummyEngine() handler = create_autospec(spec=lambda x: None) @@ -495,7 +493,6 @@ def __call__(self, engine, e): def test_event_handlers_with_decoration(): - engine = Engine(lambda e, b: b) def decorated(fun): diff --git a/tests/ignite/handlers/test_checkpoint.py b/tests/ignite/handlers/test_checkpoint.py index bf48e07399d2..05f2f9fc8cce 100644 --- a/tests/ignite/handlers/test_checkpoint.py +++ b/tests/ignite/handlers/test_checkpoint.py @@ -45,13 +45,15 @@ def forward(self, x): def test_checkpoint_wrong_input(): - with pytest.raises(TypeError, match=r"Argument `to_save` should be a dictionary"): Checkpoint(12, lambda x: x, "prefix") with pytest.raises(TypeError, match=r"Argument `to_save` should be a dictionary"): Checkpoint([12], lambda x: x, "prefix") + with pytest.raises(TypeError, match=r"should have `state_dict`"): + Checkpoint({"model": {"abc": 12}}, lambda x: x, "prefix") + to_save = {"model": model} with pytest.raises( @@ -63,25 +65,34 @@ def test_checkpoint_wrong_input(): with pytest.raises(TypeError, match=r"global_step_transform should be a function."): Checkpoint(to_save, lambda x: x, score_function=lambda e: 123, score_name="acc", global_step_transform=123) + with pytest.warns(UserWarning, match=r"Argument archived is deprecated"): + Checkpoint(to_save, lambda x: x, score_function=lambda e: 123, score_name="acc", archived=True) + with pytest.raises(ValueError, match=r"Cannot have key 'checkpointer' if `include_self` is True"): Checkpoint({"checkpointer": model}, lambda x: x, include_self=True) class ImmutableMapping(Mapping): + def __init__(self, d): + self._dict = d + def __getitem__(self, key): - return to_save[key] + return self._dict[key] def 
__iter__(self): - return iter(to_save) + return iter(self._dict) def __len__(self): - return len(to_save) + return len(self._dict) with pytest.raises(TypeError, match="If `include_self` is True, then `to_save` must be mutable"): - Checkpoint(ImmutableMapping(), lambda x: x, include_self=True) + Checkpoint(ImmutableMapping(to_save), lambda x: x, include_self=True) + checkpoint = Checkpoint(to_save, lambda x: x) + with pytest.raises(AttributeError, match="Checkpoint's `save_handler` should be of type `DiskSaver`"): + checkpoint.reload_objects(to_save) -def test_save_handler_as_str(dirname): +def test_save_handler_as_str(dirname): to_save = {"model": model} checkpointer = Checkpoint(to_save, save_handler=dirname) @@ -89,7 +100,6 @@ def test_save_handler_as_str(dirname): def test_checkpoint_score_function_wrong_output(): - to_save = {"model": model} checkpointer = Checkpoint(to_save, lambda x: x, score_function=lambda e: {"1": 1}, score_name="acc") @@ -160,7 +170,7 @@ def test_checkpoint_include_self_state_dict(to_save, obj, name): assert save_handler.call_count == 1 fname = f"{name}_0.pt" - obj["checkpointer"] = OrderedDict([("saved", [(0, fname)])]) + obj["checkpointer"] = OrderedDict([("_saved", [(0, fname)])]) metadata = {"basename": name, "score_name": None, "priority": 0} save_handler.assert_called_with(obj, fname, metadata) @@ -180,7 +190,7 @@ def test_checkpoint_include_self_state_dict(to_save, obj, name): save_handler.remove.assert_called_with(f"{name}_0.pt") fname = f"{name}_1234.pt" - obj["checkpointer"] = OrderedDict([("saved", [(1234, fname)])]) + obj["checkpointer"] = OrderedDict([("_saved", [(1234, fname)])]) save_handler.assert_called_with(obj, fname, metadata) assert save_handler.remove.call_count == 1 @@ -188,7 +198,6 @@ def test_checkpoint_include_self_state_dict(to_save, obj, name): def test_checkpoint_with_dp(): - dp_model = nn.DataParallel(model) to_save = {"model": dp_model} @@ -550,12 +559,21 @@ def 
test_model_checkpoint_args_validation(dirname): with pytest.raises(ValueError, match=r"with extension '.pt' are already present "): ModelCheckpoint(nonempty, _PREFIX) + with pytest.raises(ValueError, match=r"Argument save_interval is deprecated and should be None"): + ModelCheckpoint(existing, _PREFIX, save_interval=42) + with pytest.raises(ValueError, match=r"Directory path '\S+' is not found"): ModelCheckpoint(dirname / "non_existing_dir", _PREFIX, create_dir=False) + with pytest.raises(ValueError, match=r"Argument save_as_state_dict is deprecated and should be True"): + ModelCheckpoint(existing, _PREFIX, create_dir=False, save_as_state_dict=False) + with pytest.raises(TypeError, match=r"global_step_transform should be a function"): ModelCheckpoint(existing, _PREFIX, create_dir=False, global_step_transform=1234) + with pytest.warns(UserWarning, match=r"Argument archived is deprecated"): + ModelCheckpoint(existing, _PREFIX, create_dir=False, archived=True) + h = ModelCheckpoint(dirname, _PREFIX, create_dir=False) assert h.last_checkpoint is None with pytest.raises(RuntimeError, match=r"No objects to checkpoint found."): @@ -584,7 +602,6 @@ def test_model_checkpoint_simple_recovery(dirname): @pytest.mark.parametrize("ext, require_empty", [(".txt", True), (".pt", False)]) def test_model_checkpoint_simple_recovery_from_existing_non_empty(ext, require_empty, dirname): - previous_fname = dirname / f"{_PREFIX}_obj_{1}{ext}" with open(previous_fname, "w") as f: f.write("test") @@ -624,13 +641,11 @@ def test_model_checkpoint_invalid_save_handler(dirname): def test_disk_saver_atomic(dirname): - model = DummyModel() to_save_serializable = {"model": model} to_save_non_serializable = {"model": lambda x: x} def _test_existence(atomic, _to_save, expected): - saver = DiskSaver(dirname, atomic=atomic, create_dir=False, require_empty=False) fname = "test.pt" try: @@ -685,7 +700,6 @@ def test_disk_saver_unknown_keyword(dirname): def test_last_k(dirname): - h = 
ModelCheckpoint(dirname, _PREFIX, create_dir=False, n_saved=2) engine = Engine(lambda e, b: None) engine.state = State(epoch=0, iteration=0) @@ -704,7 +718,6 @@ def test_last_k(dirname): def test_disabled_n_saved(dirname): - h = ModelCheckpoint(dirname, _PREFIX, create_dir=False, n_saved=None) engine = Engine(lambda e, b: None) engine.state = State(epoch=0, iteration=0) @@ -857,7 +870,6 @@ def test_valid_state_dict_save(dirname): def _test_save_model_optimizer_lr_scheduler_with_state_dict(device, dirname, just_on_zero_rank=False): - torch.manual_seed(23) model = DummyModel().to(device) @@ -873,7 +885,6 @@ def update_fn(engine, batch): # Probably related to https://github.com/pytorch/xla/issues/2576 # loss = y.pow(2.0).sum() loss = y.sum() - print(loss.device, y.device, x.device) loss.backward() if idist.has_xla_support: import torch_xla.core.xla_model as xm @@ -946,7 +957,6 @@ def _test_save_model_optimizer_lr_scheduler_with_validation(device, dirname, jus torch.manual_seed(23) def _build_objects(acc_list): - model = DummyModel().to(device) optim = torch.optim.SGD(model.parameters(), lr=0.1) lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5) @@ -1075,17 +1085,19 @@ def test_save_model_optimizer_lr_scheduler_with_validation(dirname): def test_checkpoint_load_objects(): - with pytest.raises(TypeError, match=r"Argument checkpoint should be a string or a dictionary"): Checkpoint.load_objects({}, []) with pytest.raises(TypeError, match=r"should have `load_state_dict` method"): Checkpoint.load_objects({"a": None}, {"a": None}) + with pytest.raises(TypeError, match=r"should have `load_state_dict` method"): + Checkpoint.load_objects({"a": {"b": None}}, {"a": {"b": None}}) + model = DummyModel() to_load = {"model": model, "another_model": model} - with pytest.raises(ValueError, match=r"from `to_load` is not found in the checkpoint"): + with pytest.raises(ValueError, match=r"Key 'model' from x is not found in y"): Checkpoint.load_objects(to_load, {}) 
model = DummyModel() @@ -1096,6 +1108,11 @@ def test_checkpoint_load_objects(): Checkpoint.load_objects(to_load, chkpt) assert model.state_dict() == model2.state_dict() + chkpt = {"models": [{"model1": {"abc": model.state_dict()}}, model.state_dict()]} + to_load = {"models": [{"model1": {"abc": model}}, model]} + Checkpoint.load_objects(to_load, chkpt) + assert model.state_dict() == model2.state_dict() + def test_checkpoint_load_objects_from_saved_file(dirname): def _get_single_obj_to_save(): @@ -1107,7 +1124,11 @@ def _get_multiple_objs_to_save(): model = DummyModel() optim = torch.optim.SGD(model.parameters(), lr=0.001) lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5) - to_save = {"model": model, "optimizer": optim, "lr_scheduler": lr_scheduler} + to_save = { + "model": model, + "optimizer": optim, + "lr_scheduler": lr_scheduler, + } return to_save trainer = Engine(lambda e, b: None) @@ -1181,9 +1202,7 @@ def test_load_checkpoint_with_different_num_classes(dirname): with pytest.raises(RuntimeError): Checkpoint.load_objects(to_load_single_object, loaded_checkpoint) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=UserWarning) - Checkpoint.load_objects(to_load_single_object, loaded_checkpoint, strict=False, blah="blah") + Checkpoint.load_objects(to_load_single_object, loaded_checkpoint, strict=False) loaded_weights = to_load_single_object["pretrained_features"].state_dict()["weight"] @@ -1191,7 +1210,6 @@ def test_load_checkpoint_with_different_num_classes(dirname): def test_disksaver_wrong_input(dirname): - with pytest.raises(ValueError, match=r"Directory path '\S+' is not found"): DiskSaver("/tmp/non-existing-folder", create_dir=False) @@ -1244,7 +1262,6 @@ def _test_checkpoint_load_objects_ddp(device): def _test_checkpoint_with_ZeRO(device, dirname, local_rank): - from torch.distributed.optim import ZeroRedundancyOptimizer model = DummyModel().to(device) @@ -1265,7 +1282,6 @@ def 
_test_checkpoint_with_ZeRO(device, dirname, local_rank): mocked_opt.consolidate_state_dict.assert_called_once_with(to=1) if local_rank == 1: - loaded_state_dict = torch.load(dirname / "checkpoint_0.pt", map_location=device)["optim"] state_dict = opt.state_dict() assert loaded_state_dict == state_dict @@ -1274,7 +1290,6 @@ def _test_checkpoint_with_ZeRO(device, dirname, local_rank): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo, dirname, get_rank_zero_dirname, local_rank): - device = idist.device() rank_zero_dirname = get_rank_zero_dirname() _test_save_model_optimizer_lr_scheduler_with_state_dict(device, rank_zero_dirname / "1") @@ -1292,7 +1307,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo, dirname, @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl, get_rank_zero_dirname): - device = idist.device() dirname = get_rank_zero_dirname() _test_save_model_optimizer_lr_scheduler_with_state_dict(device, dirname / "1") @@ -1305,7 +1319,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl, get_rank_zero_di @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor, get_rank_zero_dirname): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() dirname = get_rank_zero_dirname() @@ -1430,7 +1443,6 @@ def _test_model_checkpoint_filename_pattern_helper( @pytest.mark.parametrize("test_class", ["checkpoint", "model_checkpoint"]) def 
test_checkpoint_filename_pattern(test_class, dirname): - if test_class == "checkpoint": _test = _test_checkpoint_filename_pattern_helper elif test_class == "model_checkpoint": @@ -1627,10 +1639,10 @@ def _setup_checkpoint(): def test_checkpoint_state_dict(): checkpointer = _setup_checkpoint() sd = checkpointer.state_dict() - assert "saved" in sd - assert isinstance(sd["saved"], list) and len(sd["saved"]) == len(checkpointer._saved) + assert "_saved" in sd + assert isinstance(sd["_saved"], list) and len(sd["_saved"]) == len(checkpointer._saved) - for saved_item, true_item in zip(sd["saved"], checkpointer._saved): + for saved_item, true_item in zip(sd["_saved"], checkpointer._saved): assert saved_item[0] == true_item.priority assert saved_item[1] == true_item.filename @@ -1643,11 +1655,43 @@ def test_checkpoint_load_state_dict(): to_save = {"model": model} checkpointer = Checkpoint(to_save, save_handler=save_handler, n_saved=None) - sd = {"saved": [(0, "model_0.pt"), (10, "model_10.pt"), (20, "model_20.pt")]} + sd = {"_saved": [(0, "model_0.pt"), (10, "model_10.pt"), (20, "model_20.pt")]} checkpointer.load_state_dict(sd) assert checkpointer._saved == true_checkpointer._saved +@pytest.mark.parametrize( + "to_save", + [ + {"model": DummyModel()}, + {"model": [DummyModel(), DummyModel()]}, + {"model": {"a": {"b": DummyModel()}}}, + ], +) +def test_checkpoint__setup_checkpoint(to_save): + save_handler = MagicMock(spec=BaseSaveHandler) + checkpointer = Checkpoint(to_save, save_handler=save_handler, n_saved=2) + checkpoint = checkpointer._setup_checkpoint() + + assert isinstance(checkpoint, dict) + for k, obj in to_save.items(): + assert k in checkpoint + if isinstance(obj, torch.nn.Module): + assert checkpoint[k] == obj.state_dict() + elif isinstance(obj, list): + for c2, obj2 in zip(checkpoint[k], obj): + assert c2 == obj2.state_dict() + elif isinstance(obj, dict): + c2 = checkpoint[k] + for k2, obj2 in obj.items(): + if isinstance(obj2, torch.nn.Module): + assert c2[k2] 
== obj2.state_dict() + elif isinstance(obj2, dict): + c3 = c2[k2] + for k3, obj3 in obj2.items(): + assert c3[k3] == obj3.state_dict() + + def test_checkpoint_fixed_filename(): model = DummyModel() to_save = {"model": model} @@ -1783,7 +1827,6 @@ def score_function(_): def test_get_default_score_fn(): - with pytest.raises(ValueError, match=r"Argument score_sign should be 1 or -1"): Checkpoint.get_default_score_fn("acc", 2.0) @@ -1823,7 +1866,6 @@ def test_load_single_object(obj_to_save, dirname): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.parametrize("atomic", [False, True]) def test_disksaver_distrib(distributed_context_single_node_gloo, dirname, local_rank, atomic): - saver = DiskSaver(dirname, atomic, save_on_rank=1) mocked_saver = MagicMock(wraps=saver) diff --git a/tests/ignite/handlers/test_early_stopping.py b/tests/ignite/handlers/test_early_stopping.py index 7382c7ec1b20..25fca8a9c468 100644 --- a/tests/ignite/handlers/test_early_stopping.py +++ b/tests/ignite/handlers/test_early_stopping.py @@ -13,7 +13,6 @@ def do_nothing_update_fn(engine, batch): def test_args_validation(): - trainer = Engine(do_nothing_update_fn) with pytest.raises(ValueError, match=r"Argument patience should be positive integer."): @@ -30,7 +29,6 @@ def test_args_validation(): def test_simple_early_stopping(): - scores = iter([1.0, 0.8, 0.88]) def score_function(engine): @@ -50,7 +48,6 @@ def score_function(engine): def test_state_dict(): - scores = iter([1.0, 0.8, 0.88]) def score_function(engine): @@ -75,7 +72,6 @@ def score_function(engine): def test_early_stopping_on_delta(): - scores = iter([1.0, 2.0, 2.01, 3.0, 3.01, 3.02]) trainer = Engine(do_nothing_update_fn) @@ -98,7 +94,6 @@ def test_early_stopping_on_delta(): def test_early_stopping_on_last_event_delta(): - scores = iter([0.0, 0.3, 0.6]) trainer = Engine(do_nothing_update_fn) @@ -117,7 +112,6 @@ def test_early_stopping_on_last_event_delta(): def 
test_early_stopping_on_cumulative_delta(): - scores = iter([0.0, 0.3, 0.6]) trainer = Engine(do_nothing_update_fn) @@ -151,7 +145,6 @@ def score_function(engine): def test_simple_no_early_stopping(): - scores = iter([1.0, 0.8, 1.2]) def score_function(engine): @@ -248,7 +241,6 @@ def evaluation(engine): def _test_distrib_with_engine_early_stopping(device): - if device is None: device = idist.device() if isinstance(device, str): @@ -287,7 +279,6 @@ def evaluation(engine): def _test_distrib_integration_engine_early_stopping(device): - from ignite.metrics import Accuracy if device is None: @@ -346,7 +337,6 @@ def evaluation(engine): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @@ -355,7 +345,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @@ -365,7 +354,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -377,7 +365,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, 
reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @@ -387,7 +374,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) diff --git a/tests/ignite/handlers/test_handlers.py b/tests/ignite/handlers/test_handlers.py index f10b28e997b1..0357837fa65a 100644 --- a/tests/ignite/handlers/test_handlers.py +++ b/tests/ignite/handlers/test_handlers.py @@ -5,7 +5,6 @@ def test_global_step_from_engine(): - iteration = 12 epoch = 23 diff --git a/tests/ignite/handlers/test_lr_finder.py b/tests/ignite/handlers/test_lr_finder.py index c966c8c3f1dd..159acd76eab5 100644 --- a/tests/ignite/handlers/test_lr_finder.py +++ b/tests/ignite/handlers/test_lr_finder.py @@ -159,7 +159,6 @@ def mnist_dataloader(): def test_attach_incorrect_input_args(lr_finder, dummy_engine, model, optimizer, dataloader): - with pytest.raises(TypeError, match=r"Argument to_save should be a mapping"): with lr_finder.attach(dummy_engine, to_save=123): pass @@ -249,7 +248,6 @@ def test_with_attach(lr_finder, to_save, dummy_engine, dataloader): def test_wrong_values_start_lr_and_end_lr( lr_finder, dummy_engine, to_save, dummy_engine_mulitple_param_groups, to_save_mulitple_param_groups ): - with pytest.raises(ValueError, match=r"start_lr must be less than end_lr"): with 
lr_finder.attach(dummy_engine, to_save=to_save, start_lr=10.0, end_lr=1.0): pass @@ -322,7 +320,6 @@ def assert_output_sizes(lr_finder, dummy_engine): def test_num_iter_is_none(lr_finder, to_save, dummy_engine, dataloader): - with pytest.warns(UserWarning, match=r"Run completed without loss diverging"): with lr_finder.attach(dummy_engine, to_save=to_save, diverge_th=float("inf")) as trainer_with_finder: trainer_with_finder.run(dataloader) @@ -331,7 +328,6 @@ def test_num_iter_is_none(lr_finder, to_save, dummy_engine, dataloader): def test_num_iter_is_enough(lr_finder, to_save, dummy_engine, dataloader): - with pytest.warns(UserWarning, match=r"Run completed without loss diverging"): with lr_finder.attach( dummy_engine, to_save=to_save, num_iter=50, diverge_th=float("inf") @@ -348,7 +344,7 @@ def test_num_iter_is_not_enough(lr_finder, to_save, dummy_engine, dataloader): trainer_with_finder.run(dataloader) assert_output_sizes(lr_finder, dummy_engine) assert dummy_engine.state.iteration != len(dataloader) - assert dummy_engine.state.iteration == 150 + assert dummy_engine.state.iteration == 150 + 1 def test_detach_terminates(lr_finder, to_save, dummy_engine, dataloader): @@ -409,13 +405,13 @@ def test_engine_output_type(lr_finder, dummy_engine, optimizer): lr_finder._history = {"lr": [], "loss": []} lr_finder._log_lr_and_loss(dummy_engine, output_transform=lambda x: x, smooth_f=0, diverge_th=1) loss = lr_finder._history["loss"][-1] - assert type(loss) == float + assert type(loss) is float dummy_engine.state.output = torch.tensor([10.0], dtype=torch.float32) lr_finder._history = {"lr": [], "loss": []} lr_finder._log_lr_and_loss(dummy_engine, output_transform=lambda x: x, smooth_f=0, diverge_th=1) loss = lr_finder._history["loss"][-1] - assert type(loss) == float + assert type(loss) is float def test_lr_suggestion_unexpected_curve(lr_finder, to_save, dummy_engine, dataloader): @@ -460,11 +456,9 @@ def test_lr_suggestion_multiple_param_groups(lr_finder): def 
test_lr_suggestion_mnist(lr_finder, mnist_to_save, dummy_engine_mnist, mnist_dataloader): - max_iters = 50 with lr_finder.attach(dummy_engine_mnist, mnist_to_save, diverge_th=2, step_mode="linear") as trainer_with_finder: - with trainer_with_finder.add_event_handler( Events.ITERATION_COMPLETED(once=max_iters), lambda _: trainer_with_finder.terminate() ): @@ -476,7 +470,6 @@ def test_lr_suggestion_mnist(lr_finder, mnist_to_save, dummy_engine_mnist, mnist def test_apply_suggested_lr_unmatched_optimizers( lr_finder, mnist_to_save, dummy_engine_mnist, optimizer_multiple_param_groups, mnist_dataloader ): - with lr_finder.attach(dummy_engine_mnist, mnist_to_save) as trainer_with_finder: trainer_with_finder.run(mnist_dataloader) @@ -489,7 +482,6 @@ def test_apply_suggested_lr_unmatched_optimizers( def test_apply_suggested_lr_single_param_groups( lr_finder, mnist_to_save, dummy_engine_mnist, mnist_optimizer, mnist_dataloader ): - with lr_finder.attach(dummy_engine_mnist, mnist_to_save) as trainer_with_finder: trainer_with_finder.run(mnist_dataloader) @@ -506,7 +498,6 @@ def test_apply_suggested_lr_multiple_param_groups( optimizer_multiple_param_groups, dataloader_plot, ): - with lr_finder.attach(dummy_engine_mulitple_param_groups, to_save_mulitple_param_groups) as trainer_with_finder: trainer_with_finder.run(dataloader_plot) @@ -518,13 +509,11 @@ def test_apply_suggested_lr_multiple_param_groups( def test_no_matplotlib(no_site_packages, lr_finder): - with pytest.raises(ModuleNotFoundError, match=r"This method requires matplotlib to be installed"): lr_finder.plot() def test_plot_single_param_group(dirname, lr_finder, mnist_to_save, dummy_engine_mnist, mnist_dataloader): - with lr_finder.attach(dummy_engine_mnist, mnist_to_save, end_lr=20.0, smooth_f=0.04) as trainer_with_finder: trainer_with_finder.run(mnist_dataloader) @@ -553,7 +542,6 @@ def _test(ax): def test_plot_multiple_param_groups( dirname, lr_finder, to_save_mulitple_param_groups, 
dummy_engine_mulitple_param_groups, dataloader_plot ): - with lr_finder.attach( dummy_engine_mulitple_param_groups, to_save_mulitple_param_groups, end_lr=20.0, smooth_f=0.04 ) as trainer_with_finder: @@ -654,7 +642,6 @@ def forward(self, x): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(dirname, distributed_context_single_node_gloo): - device = idist.device() _test_distrib_log_lr_and_loss(device) _test_distrib_integration_mnist(dirname, device) @@ -664,7 +651,6 @@ def test_distrib_gloo_cpu_or_gpu(dirname, distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(dirname, distributed_context_single_node_nccl): - device = idist.device() _test_distrib_log_lr_and_loss(device) _test_distrib_integration_mnist(dirname, device) diff --git a/tests/ignite/handlers/test_param_scheduler.py b/tests/ignite/handlers/test_param_scheduler.py index fd123efeecf8..27348c9f1e67 100644 --- a/tests/ignite/handlers/test_param_scheduler.py +++ b/tests/ignite/handlers/test_param_scheduler.py @@ -3,7 +3,7 @@ import numpy as np import pytest import torch -from torch.optim.lr_scheduler import ExponentialLR, StepLR +from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ExponentialLR, StepLR from ignite.engine import Engine, Events from ignite.handlers.param_scheduler import ( @@ -36,7 +36,6 @@ def get_param(self): def test_param_scheduler_asserts(): - t1 = torch.zeros([1], requires_grad=True) t2 = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([{"params": t1, "lr": 0.1}, {"params": t2, "lr": 0.1}]) @@ -56,8 +55,7 @@ def test_param_scheduler_asserts(): FakeParamScheduler({}, "lr") -def test_linear_scheduler(): - +def test_linear_scheduler_asserts(): with pytest.raises(TypeError, 
match=r"Argument optimizer should be torch.optim.Optimizer"): LinearCyclicalScheduler({}, "lr", 1, 0, cycle_size=0) @@ -70,6 +68,11 @@ def test_linear_scheduler(): with pytest.raises(ValueError, match=r"Argument cycle_size should be positive and larger than 1"): LinearCyclicalScheduler(optimizer, "lr", 1, 0, cycle_size=1) + +def test_linear_scheduler(): + tensor = torch.zeros([1], requires_grad=True) + optimizer = torch.optim.SGD([tensor], lr=0.0) + scheduler = LinearCyclicalScheduler(optimizer, "lr", 1, 0, 10) state_dict = scheduler.state_dict() @@ -79,38 +82,12 @@ def save_lr(engine): trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) - + lr_values_in_cycle = [1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8] for _ in range(2): lrs = [] - trainer.run([0] * 9, max_epochs=2) + trainer.run([0] * 10, max_epochs=2) - assert lrs == list( - map( - pytest.approx, - [ - # Cycle 1 - 1.0, - 0.8, - 0.6, - 0.4, - 0.2, - 0.0, - 0.2, - 0.4, - 0.6, - 0.8, - # Cycle 2 - 1.0, - 0.8, - 0.6, - 0.4, - 0.2, - 0.0, - 0.2, - 0.4, # 0.6, 0.8, - ], - ) - ) + assert lrs == pytest.approx([*lr_values_in_cycle, *lr_values_in_cycle]) scheduler.load_state_dict(state_dict) optimizer = torch.optim.SGD([tensor], lr=0) @@ -166,49 +143,6 @@ def save_lr(engine): ) scheduler.load_state_dict(state_dict) - # With float cycle_size - optimizer = torch.optim.SGD([tensor], lr=0) - scheduler = LinearCyclicalScheduler( - optimizer, "lr", start_value=1.2, end_value=0.2, cycle_size=10.00000012, cycle_mult=1.0 - ) - state_dict = scheduler.state_dict() - - trainer = Engine(lambda engine, batch: None) - trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) - trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) - - for _ in range(2): - lrs = [] - trainer.run([0] * 9, max_epochs=2) - assert lrs == list( - map( - pytest.approx, - [ - # Cycle 1 - 1.2, - 1.0, - 0.8, - 0.6, - 0.4, - 
0.2, - 0.4, - 0.6, - 0.8, - 1.0, - # Cycle 2 - 1.2, - 1.0, - 0.8, - 0.6, - 0.4, - 0.2, - 0.4, - 0.6, # 0.8, 1.0, - ], - ) - ) - scheduler.load_state_dict(state_dict) - def test_linear_scheduler_cycle_size_two(): tensor = torch.zeros([1], requires_grad=True) @@ -241,17 +175,23 @@ def save_lr(engine): assert lrs == pytest.approx([v for i, v in simulated_values]) -def test_cosine_annealing_scheduler(): +@pytest.mark.parametrize("cyclic_warmup", [False, True]) +def test_cosine_annealing_scheduler(cyclic_warmup): tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0) - scheduler = CosineAnnealingScheduler(optimizer, "lr", 0, 1, 10) + scheduler = CosineAnnealingScheduler(optimizer, "lr", 0, 1, 10, warmup_duration=2 if cyclic_warmup else 0) state_dict = scheduler.state_dict() - data = [0] * 9 + data = [0] * (10 + int(cyclic_warmup)) max_epochs = 2 simulated_values = CosineAnnealingScheduler.simulate_values( - num_events=len(data) * max_epochs, param_name="lr", start_value=0, end_value=1, cycle_size=10 + num_events=len(data) * max_epochs, + param_name="lr", + start_value=0, + end_value=1, + cycle_size=10, + warmup_duration=2 if cyclic_warmup else 0, ) def save_lr(engine): @@ -260,43 +200,31 @@ def save_lr(engine): trainer = Engine(lambda engine, batch: None) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr) + lr_values_in_cycle = [ + 0.0, + 0.02447174185242318, + 0.09549150281252627, + 0.20610737385376332, + 0.3454915028125263, + 0.5, + 0.6545084971874737, + 0.7938926261462365, + 0.9045084971874737, + 0.9755282581475768, + ] + lr_values_in_warmup = np.linspace(1.0, 0.0, 2 + 1)[:-1].tolist() if cyclic_warmup else [] for _ in range(2): lrs = [] trainer.run(data, max_epochs=max_epochs) - assert lrs == list( - map( - pytest.approx, - [ - 0.0, - 0.02447174185242318, - 0.09549150281252627, - 0.20610737385376332, - 0.3454915028125263, - 0.5, - 0.6545084971874737, - 
0.7938926261462365, - 0.9045084971874737, - 0.9755282581475768, - 0.0, - 0.02447174185242318, - 0.09549150281252627, - 0.20610737385376332, - 0.3454915028125263, - 0.5, - 0.6545084971874737, - 0.7938926261462365, # 0.9045084971874737, 0.9755282581475768 - ], - ) - ) + assert lrs == pytest.approx([*lr_values_in_cycle, *lr_values_in_warmup, *lr_values_in_cycle]) scheduler.load_state_dict(state_dict) assert lrs == pytest.approx([v for i, v in simulated_values]) def test_concat_scheduler_asserts(): - tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0) @@ -620,7 +548,6 @@ def save_lr(engine): def test_lr_scheduler_asserts(): - err_msg = r"Argument lr_scheduler should be a subclass of torch.optim.lr_scheduler.(_LRScheduler|LRScheduler)" with pytest.raises(TypeError, match=err_msg): LRScheduler(123) @@ -638,7 +565,6 @@ def test_lr_scheduler_asserts(): ], ) def test_lr_scheduler(torch_lr_scheduler_cls, kwargs): - if torch_lr_scheduler_cls is None: return @@ -655,7 +581,7 @@ def test_lr_scheduler(torch_lr_scheduler_cls, kwargs): state_dict1 = scheduler1.state_dict() torch_lr_scheduler2 = torch_lr_scheduler_cls(optimizer=optimizer2, **kwargs) - with pytest.warns(UserWarning, match=r"the first lr value from the optimizer, otherwise it is will be skipped"): + with pytest.warns(UserWarning, match=r"the first lr value from the optimizer, otherwise it will be skipped"): scheduler2 = LRScheduler(torch_lr_scheduler2, use_legacy=True) state_dict2 = scheduler2.state_dict() @@ -713,7 +639,6 @@ def torch_lr_scheduler_step(engine): def test_piecewiselinear_asserts(): - tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0) @@ -738,7 +663,6 @@ def test_piecewiselinear_asserts(): @pytest.mark.parametrize("milestones_as_np_int", [True, False]) def test_piecewiselinear(milestones_as_np_int): - tensor = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([tensor], lr=0) @@ -821,13 +745,11 @@ def 
save_lr(engine): def test_simulate_and_plot_values(): - import matplotlib matplotlib.use("Agg") def _test(scheduler_cls, **scheduler_kwargs): - if scheduler_cls == LRScheduler: optimizer = scheduler_kwargs["lr_scheduler"].optimizer elif scheduler_cls == ConcatScheduler: @@ -914,7 +836,6 @@ def save_lr(engine): def test_create_lr_scheduler_with_warmup_asserts(): - with pytest.raises(TypeError, match=r"Argument lr_scheduler should be a subclass of"): create_lr_scheduler_with_warmup(12, warmup_start_value=0.0, warmup_end_value=0.1, warmup_duration=10) @@ -966,7 +887,6 @@ def test_create_lr_scheduler_with_warmup_asserts(): def test_create_lr_scheduler_with_warmup( lr_scheduler_name, warmup_start_value, warmup_end_value, warmup_duration, warmup_end_next_value ): - t1 = torch.zeros([1], requires_grad=True) if lr_scheduler_name == "ExponentialLR": @@ -1091,7 +1011,6 @@ def save_lr(engine): def test_create_lr_scheduler_with_warmup_with_real_model(dummy_model_factory): - model = dummy_model_factory(with_grads=False, with_frozen_layer=False) init_lr = 0.01 optimizer = torch.optim.SGD(model.parameters(), lr=init_lr) @@ -1118,7 +1037,6 @@ def test_create_lr_scheduler_with_warmup_with_real_model(dummy_model_factory): def test_param_group_scheduler_asserts(): - t1 = torch.zeros([1], requires_grad=True) t2 = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([{"params": t1, "lr": 0.1}, {"params": t2, "lr": 0.1}]) @@ -1169,7 +1087,6 @@ def test_param_group_scheduler_asserts(): @pytest.mark.parametrize("param_groups_setting", ["single_optim", "multi_optim"]) def test_param_group_scheduler(param_groups_setting): - t1 = torch.zeros([1], requires_grad=True) t2 = torch.zeros([1], requires_grad=True) if param_groups_setting == "single_optim": @@ -1234,7 +1151,6 @@ def save_lr(_, lrs): ], ) def test_scheduler_with_param_groups(scheduler_cls, kwargs): - t1 = torch.zeros([1], requires_grad=True) t2 = torch.zeros([1], requires_grad=True) optimizer = torch.optim.SGD([{"params": 
t1, "lr": 0.1}, {"params": t2, "lr": 0.1}]) @@ -1377,3 +1293,45 @@ def test_reduce_lr_on_plateau_scheduler_asserts(): with pytest.raises(ValueError, match=r"Length of argument metric_values should be equal to num_events."): metric_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] ReduceLROnPlateauScheduler.simulate_values(5, metric_values, 0.01) + + +@pytest.mark.parametrize("warmup_end_value", [0.23, None]) +@pytest.mark.parametrize("T_0", [1, 12]) +@pytest.mark.parametrize("T_mult", [1, 3]) +def test_create_lr_scheduler_with_warmup_cosine(warmup_end_value, T_0, T_mult): + lr = 0.2 + steps = 200 + warm_steps = 50 + warm_start = 0.023 + + def get_optim(): + t1 = torch.zeros([1], requires_grad=True) + return torch.optim.SGD([t1], lr=lr) + + def get_cos_shed(): + return CosineAnnealingWarmRestarts(optimizer, T_0=T_0, T_mult=T_mult) + + optimizer = get_optim() + scheduler = get_cos_shed() + cosine_lrs = [] + for i in range(steps): + cosine_lrs.append(optimizer.param_groups[0]["lr"]) + scheduler.step() + + optimizer = get_optim() + scheduler = create_lr_scheduler_with_warmup( + get_cos_shed(), warmup_start_value=warm_start, warmup_end_value=warmup_end_value, warmup_duration=warm_steps + ) + + warm_lrs = [] + real_warm_steps = warm_steps if warmup_end_value is not None else (warm_steps - 1) + for epoch in range(real_warm_steps + steps): + scheduler(None) + warm_lrs.append(optimizer.param_groups[0]["lr"]) + + if warmup_end_value is not None: + np.testing.assert_allclose(np.linspace(warm_start, warmup_end_value, warm_steps), warm_lrs[:warm_steps]) + assert warm_lrs[real_warm_steps:] == cosine_lrs + else: + np.testing.assert_allclose(np.linspace(warm_start, lr, warm_steps), warm_lrs[:warm_steps]) + assert warm_lrs[real_warm_steps:] == cosine_lrs diff --git a/tests/ignite/handlers/test_state_param_scheduler.py b/tests/ignite/handlers/test_state_param_scheduler.py index b249136e47c5..b907683d7e0d 100644 --- a/tests/ignite/handlers/test_state_param_scheduler.py +++ 
b/tests/ignite/handlers/test_state_param_scheduler.py @@ -141,7 +141,6 @@ def test_pwlinear_scheduler_max_value(max_epochs, milestones_values): def test_piecewiselinear_asserts(): - with pytest.raises(TypeError, match=r"Argument milestones_values should be a list or tuple"): PiecewiseLinearStateScheduler(param_name="linear_scheduled_param", milestones_values=None) @@ -169,7 +168,7 @@ def test_exponential_scheduler(max_epochs, initial_value, gamma): ) exp_state_parameter_scheduler.attach(engine, Events.EPOCH_COMPLETED) engine.run([0] * 8, max_epochs=max_epochs) - torch_testing_assert_close(getattr(engine.state, "exp_scheduled_param"), initial_value * gamma ** max_epochs) + torch_testing_assert_close(getattr(engine.state, "exp_scheduled_param"), initial_value * gamma**max_epochs) state_dict = exp_state_parameter_scheduler.state_dict() exp_state_parameter_scheduler.load_state_dict(state_dict) @@ -222,7 +221,6 @@ def test_multistep_scheduler(max_epochs, initial_value, gamma, milestones): def test_custom_scheduler(): - engine = Engine(lambda e, b: None) class LambdaState: @@ -264,7 +262,6 @@ def __init__(self, initial_value, gamma): @pytest.mark.parametrize("scheduler_cls, scheduler_kwargs", [config3, config4, config5, config6]) def test_simulate_and_plot_values(scheduler_cls, scheduler_kwargs): - import matplotlib matplotlib.use("Agg") @@ -285,7 +282,6 @@ def test_simulate_and_plot_values(scheduler_cls, scheduler_kwargs): @pytest.mark.parametrize("save_history", [False, True]) @pytest.mark.parametrize("scheduler_cls, scheduler_kwargs", [config3, config4, config5, config6]) def test_simulate_values(scheduler_cls, scheduler_kwargs, save_history): - max_epochs = 2 data = [0] * 10 scheduler_kwargs["save_history"] = save_history @@ -293,7 +289,6 @@ def test_simulate_values(scheduler_cls, scheduler_kwargs, save_history): def test_torch_save_load(dirname): - lambda_state_parameter_scheduler = LambdaStateScheduler( param_name="custom_scheduled_param", 
lambda_obj=LambdaState(initial_value=10, gamma=0.99), create_new=True ) @@ -321,7 +316,6 @@ def test_torch_save_load(dirname): def test_simulate_and_plot_values_no_matplotlib(): - with pytest.raises(ModuleNotFoundError, match=r"This method requires matplotlib to be installed."): with patch.dict("sys.modules", {"matplotlib.pyplot": None}): event = Events.EPOCH_COMPLETED @@ -477,7 +471,6 @@ def test_param_scheduler_attach_warning(): def test_param_scheduler_with_ema_handler(): - from ignite.handlers import EMAHandler model = nn.Linear(2, 1) diff --git a/tests/ignite/handlers/test_terminate_on_nan.py b/tests/ignite/handlers/test_terminate_on_nan.py index e231a2b48f41..c7db4745e57f 100644 --- a/tests/ignite/handlers/test_terminate_on_nan.py +++ b/tests/ignite/handlers/test_terminate_on_nan.py @@ -23,7 +23,6 @@ ], ) def test_terminate_on_nan_and_inf(state_output, should_terminate): - torch.manual_seed(12) def update_fn(engine, batch): @@ -41,7 +40,6 @@ def update_fn(engine, batch): def test_with_terminate_on_nan(): - torch.manual_seed(12) data = [1.0, 0.8, (torch.rand(4, 4), torch.rand(4, 4)), torch.rand(5), torch.asin(torch.randn(4, 4)), 0.0, 1.0] @@ -58,7 +56,6 @@ def update_fn(engine, batch): def test_with_terminate_on_inf(): - torch.manual_seed(12) data = [ @@ -84,7 +81,6 @@ def update_fn(engine, batch): def test_without_terminate_on_nan_inf(): - data = [1.0, 0.8, torch.rand(4, 4), (torch.rand(5), torch.rand(5, 4)), 0.0, 1.0] def update_fn(engine, batch): diff --git a/tests/ignite/handlers/test_time_limit.py b/tests/ignite/handlers/test_time_limit.py index d0d9c1889b84..d82a426d259f 100644 --- a/tests/ignite/handlers/test_time_limit.py +++ b/tests/ignite/handlers/test_time_limit.py @@ -7,7 +7,6 @@ def test_arg_validation(): - with pytest.raises(ValueError, match=r"Argument limit_sec should be a positive integer."): TimeLimit(limit_sec=-5) diff --git a/tests/ignite/handlers/test_time_profilers.py b/tests/ignite/handlers/test_time_profilers.py index 
978f193ccaae..7029f7e99b52 100644 --- a/tests/ignite/handlers/test_time_profilers.py +++ b/tests/ignite/handlers/test_time_profilers.py @@ -830,7 +830,6 @@ def test_write_results_handlers_profiler(dirname): def test_print_results_basic_profiler(capsys): - true_max_epochs = 1 true_num_iters = 5 @@ -848,7 +847,6 @@ def test_print_results_basic_profiler(capsys): def test_print_results_handlers_profiler_handlers_profiler(capsys): - true_max_epochs = 1 true_num_iters = 5 diff --git a/tests/ignite/metrics/gan/test_fid.py b/tests/ignite/metrics/gan/test_fid.py index de0b379d24cd..0da5b574340e 100644 --- a/tests/ignite/metrics/gan/test_fid.py +++ b/tests/ignite/metrics/gan/test_fid.py @@ -19,7 +19,6 @@ def mock_no_scipy(): def test_no_scipy(mock_no_scipy): - with pytest.raises(ModuleNotFoundError, match=r"This module requires scipy to be installed."): FID() @@ -34,7 +33,6 @@ def mock_no_numpy(): def test_no_numpy(mock_no_numpy): - with pytest.raises(ModuleNotFoundError, match=r"This module requires numpy to be installed."): FID() @@ -105,7 +103,6 @@ def test_compute_fid_sqrtm(): def test_wrong_inputs(): - with pytest.raises(ValueError, match=r"Argument num_features must be greater to zero"): FID(num_features=-1, feature_extractor=torch.nn.Identity()) @@ -156,7 +153,6 @@ def test_statistics(): def _test_distrib_integration(device): - from ignite.engine import Engine rank = idist.get_rank() @@ -218,7 +214,6 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() diff --git a/tests/ignite/metrics/gan/test_inception_score.py b/tests/ignite/metrics/gan/test_inception_score.py index 98e2f7ecd871..cb25ae3608d2 
100644 --- a/tests/ignite/metrics/gan/test_inception_score.py +++ b/tests/ignite/metrics/gan/test_inception_score.py @@ -9,7 +9,6 @@ def calculate_inception_score(p_yx): - p_y = torch.unsqueeze(p_yx.mean(axis=0), 0) kl_d = torch.kl_div(torch.log(p_y), p_yx) @@ -22,7 +21,6 @@ def calculate_inception_score(p_yx): def test_inception_score(): - p_yx = torch.rand(20, 10) m = InceptionScore(num_features=10, feature_extractor=torch.nn.Identity()) m.update(p_yx) @@ -43,7 +41,6 @@ def test_device_mismatch_cuda(): def test_wrong_inputs(): - with pytest.raises(ValueError, match=r"Argument num_features must be greater to zero, got:"): InceptionScore(num_features=-1, feature_extractor=torch.nn.Identity()).update(torch.rand(2, 0)) @@ -66,7 +63,6 @@ def test_wrong_inputs(): def _test_distrib_integration(device): - from ignite.engine import Engine rank = idist.get_rank() @@ -119,7 +115,6 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() diff --git a/tests/ignite/metrics/gan/test_utils.py b/tests/ignite/metrics/gan/test_utils.py index 50871acb6a79..534865fdc094 100644 --- a/tests/ignite/metrics/gan/test_utils.py +++ b/tests/ignite/metrics/gan/test_utils.py @@ -34,7 +34,6 @@ def update(self, output): def test_dummy_metric(): - with pytest.raises(ValueError, match=r"Argument num_features must be greater to zero, got:"): DummyInceptionMetric(num_features=-1, feature_extractor=torch.nn.Identity()).update(torch.rand(2, 0)) @@ -57,7 +56,6 @@ def test_dummy_metric(): def test_inception_extractor_wrong_inputs(): - with pytest.raises(ValueError, match=r"Inputs should be a tensor of dim 4"): 
InceptionModel(return_features=True)(torch.rand(2)) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index ff173ed66d0b..9de9c6de78c5 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -16,7 +16,6 @@ def test_wrong_inputs(): - with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): Bleu(ngram=0) @@ -200,7 +199,6 @@ def test_n_gram_counter(candidates, references): def _test_macro_distrib_integration(device): - from ignite.engine import Engine rank = idist.get_rank() @@ -243,7 +241,6 @@ def _test(metric_device): def _test_micro_distrib_integration(device): - from ignite.engine import Engine rank = idist.get_rank() @@ -293,7 +290,6 @@ def _test(metric_device): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_macro_distrib_integration(device) _test_micro_distrib_integration(device) @@ -302,7 +298,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_macro_distrib_integration(device) _test_micro_distrib_integration(device) @@ -312,7 +307,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -324,7 +318,6 @@ def 
test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_macro_distrib_integration(device) _test_micro_distrib_integration(device) @@ -334,7 +327,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_macro_distrib_integration(device) _test_micro_distrib_integration(device) diff --git a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py index b798c5e3bbe5..c2fb75051829 100644 --- a/tests/ignite/metrics/nlp/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -38,7 +38,6 @@ def test_compute_ngram_scores(candidate, reference, n, expected_precision, expec def test_wrong_inputs(): - with pytest.raises(ValueError, match=r"ngram order must be greater than zero"): RougeN(ngram=0) @@ -120,7 +119,6 @@ def test_rouge_metrics(candidates, references): def _test_distrib_integration(device): - from ignite.engine import Engine rank = idist.get_rank() @@ -175,7 +173,6 @@ def _test(metric_device): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration(device) @@ -183,7 +180,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not 
idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration(device) @@ -192,7 +188,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -203,7 +198,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration(device) @@ -212,7 +206,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_accumulation.py b/tests/ignite/metrics/test_accumulation.py index ce26cd3dc6ec..d4551721ee0e 100644 --- a/tests/ignite/metrics/test_accumulation.py +++ b/tests/ignite/metrics/test_accumulation.py @@ -15,7 +15,6 @@ def test_variable_accumulation_wrong_inputs(): - with pytest.raises(TypeError, match=r"Argument op should be a callable"): VariableAccumulation(1) @@ -29,7 +28,6 @@ def test_variable_accumulation_wrong_inputs(): def 
test_variable_accumulation_mean_variable(): - mean_var = VariableAccumulation(lambda a, x: a + x) y_true = torch.rand(100) @@ -61,7 +59,6 @@ def test_variable_accumulation_mean_variable(): def test_average(): - with pytest.raises(NotComputableError): v = Average() v.compute() @@ -102,7 +99,6 @@ def _mean(y_true): def test_geom_average(): - with pytest.raises(NotComputableError): v = GeometricAverage() v.compute() @@ -136,13 +132,11 @@ def test_geom_average(): @pytest.mark.parametrize("metric_cls, true_result_fn", [(Average, _mean), (GeometricAverage, _geom_mean)]) @pytest.mark.parametrize("shape", [[100, 12], [100]]) def test_integration(metric_cls, true_result_fn, shape): - assert len(shape) > 0 and len(shape) < 3 custom_variable = 10.0 + 5.0 * torch.rand(shape) def update_fn(engine, batch): - output = custom_variable[engine.state.iteration - 1] output = output.item() if output.ndimension() < 1 else output return 0, output @@ -158,6 +152,16 @@ def update_fn(engine, batch): np.array(state.metrics["agg_custom_var"]), true_result_fn(custom_variable), decimal=5 ) + metric_state = custom_var_mean.state_dict() + saved_num_examples = custom_var_mean.num_examples + saved_accumulator = custom_var_mean.accumulator + custom_var_mean.reset() + assert custom_var_mean.num_examples == 0 + assert custom_var_mean.accumulator == 0 + custom_var_mean.load_state_dict(metric_state) + assert custom_var_mean.num_examples == saved_num_examples + assert (custom_var_mean.accumulator == saved_accumulator).all() + def test_compute_mean_std(): n = 8 @@ -174,7 +178,7 @@ def compute_mean_std(engine, batch): _b, _c = batch.shape[:2] data = batch.reshape(_b, _c, -1).to(dtype=torch.float64) _mean = torch.mean(data, dim=-1) - _mean2 = torch.mean(data ** 2, dim=-1) + _mean2 = torch.mean(data**2, dim=-1) return {"mean": _mean, "mean^2": _mean2} compute_engine = Engine(compute_mean_std) @@ -327,7 +331,6 @@ def _dist_geom_mean(y_true): def _test_distrib_integration(device): def _test(metric_cls, shape, 
true_result_fn, metric_device, tol=1e-5): - size = 100 custom_variable = 10.0 + 5.0 * torch.rand(size, *shape, dtype=torch.float64) custom_variable = custom_variable.to(device) @@ -373,12 +376,10 @@ def update_fn(engine, batch): def _test_distrib_accumulator_device(device): - metric_devices = [torch.device("cpu")] if device.type != "xla": metric_devices.append(idist.device()) for metric_device in metric_devices: - m = VariableAccumulation(lambda a, x: x, device=metric_device) assert m._device == metric_device assert ( @@ -423,7 +424,6 @@ def _test_apex_average(device, amp_mode, opt_level): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) @@ -435,7 +435,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) @@ -448,7 +447,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = idist.device() nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -463,7 +461,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def 
test_distrib_single_device_xla(): - device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) @@ -505,7 +502,6 @@ def test_apex_average_on_cuda(): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) @@ -518,7 +514,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) diff --git a/tests/ignite/metrics/test_accuracy.py b/tests/ignite/metrics/test_accuracy.py index de827b4b0733..0b7fee3bf605 100644 --- a/tests/ignite/metrics/test_accuracy.py +++ b/tests/ignite/metrics/test_accuracy.py @@ -362,7 +362,6 @@ def _test(metric_device): def _test_distrib_integration_multiclass(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -406,6 +405,14 @@ def update(engine, i): assert pytest.approx(res) == true_res + metric_state = acc.state_dict() + saved__num_correct = acc._num_correct + saved__num_examples = acc._num_examples + acc.reset() + acc.load_state_dict(metric_state) + assert acc._num_examples == saved__num_examples + assert (acc._num_correct == saved__num_correct).all() + metric_devices = ["cpu"] if device.type != "xla": metric_devices.append(idist.device()) @@ -416,7 +423,6 @@ def update(engine, i): def _test_distrib_integration_multilabel(device): - rank = idist.get_rank() def 
_test(n_epochs, metric_device): @@ -470,12 +476,10 @@ def update(engine, i): def _test_distrib_accumulator_device(device): - metric_devices = [torch.device("cpu")] if device.type != "xla": metric_devices.append(idist.device()) for metric_device in metric_devices: - acc = Accuracy(device=metric_device) assert acc._device == metric_device assert ( @@ -492,7 +496,6 @@ def _test_distrib_accumulator_device(device): def _test_distrib_integration_list_of_tensors_or_numbers(device): - rank = idist.get_rank() def _test(n_epochs, metric_device): @@ -548,7 +551,6 @@ def update(_, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) @@ -560,7 +562,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) @@ -573,7 +574,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -617,7 +617,6 @@ def test_distrib_xla_nprocs(xmp_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in 
os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) @@ -630,7 +629,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) diff --git a/tests/ignite/metrics/test_classification_report.py b/tests/ignite/metrics/test_classification_report.py index aa2238824bc4..b132daf13304 100644 --- a/tests/ignite/metrics/test_classification_report.py +++ b/tests/ignite/metrics/test_classification_report.py @@ -10,11 +10,9 @@ def _test_integration_multiclass(device, output_dict): - rank = idist.get_rank() def _test(metric_device, n_classes, labels=None): - classification_report = ClassificationReport(device=metric_device, output_dict=output_dict, labels=labels) n_iters = 80 batch_size = 16 @@ -62,6 +60,22 @@ def update(engine, i): assert sklearn_result["macro avg"]["recall"] == pytest.approx(res["macro avg"]["recall"]) assert sklearn_result["macro avg"]["f1-score"] == pytest.approx(res["macro avg"]["f1-score"]) + metric_state = classification_report.state_dict() + classification_report.reset() + classification_report.load_state_dict(metric_state) + res2 = classification_report.compute() + if not output_dict: + res2 = json.loads(res2) + + for i in range(n_classes): + label_i = labels[i] if labels else str(i) + assert res2[label_i]["precision"] == res[label_i]["precision"] + assert res2[label_i]["f1-score"] == res[label_i]["f1-score"] + assert res2[label_i]["recall"] == 
res[label_i]["recall"] + assert res2["macro avg"]["precision"] == res["macro avg"]["precision"] + assert res2["macro avg"]["recall"] == res["macro avg"]["recall"] + assert res2["macro avg"]["f1-score"] == res["macro avg"]["f1-score"] + for i in range(5): torch.manual_seed(12 + rank + i) # check multiple random inputs as random exact occurencies are rare @@ -78,11 +92,9 @@ def update(engine, i): def _test_integration_multilabel(device, output_dict): - rank = idist.get_rank() def _test(metric_device, n_epochs, labels=None): - classification_report = ClassificationReport(device=metric_device, output_dict=output_dict, is_multilabel=True) n_iters = 10 @@ -125,7 +137,6 @@ def update(engine, i): sklearn_result = sklearn_classification_report(np_y_true, np_y_preds, output_dict=True, zero_division=1) for i in range(n_classes): - torch.manual_seed(12 + rank + i) label_i = labels[i] if labels else str(i) assert sklearn_result[str(i)]["precision"] == pytest.approx(res[label_i]["precision"]) assert sklearn_result[str(i)]["f1-score"] == pytest.approx(res[label_i]["f1-score"]) @@ -134,7 +145,8 @@ def update(engine, i): assert sklearn_result["macro avg"]["recall"] == pytest.approx(res["macro avg"]["recall"]) assert sklearn_result["macro avg"]["f1-score"] == pytest.approx(res["macro avg"]["f1-score"]) - for _ in range(3): + for i in range(3): + torch.manual_seed(12 + rank + i) # check multiple random inputs as random exact occurencies are rare metric_devices = ["cpu"] if device.type != "xla": @@ -150,7 +162,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) @@ -161,7 +172,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): 
@pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(local_rank, distributed_context_single_node_gloo): - device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) @@ -173,7 +183,6 @@ def test_distrib_gloo_cpu_or_gpu(local_rank, distributed_context_single_node_glo @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -184,7 +193,6 @@ def test_distrib_hvd(gloo_hvd_executor): def _test_distrib_xla_nprocs(index): - device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) @@ -212,7 +220,6 @@ def to_numpy_multilabel(y): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) @@ -224,7 +231,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) diff --git a/tests/ignite/metrics/test_confusion_matrix.py 
b/tests/ignite/metrics/test_confusion_matrix.py index 2b777126fdab..ddb4509567d7 100644 --- a/tests/ignite/metrics/test_confusion_matrix.py +++ b/tests/ignite/metrics/test_confusion_matrix.py @@ -182,7 +182,6 @@ def test_multiclass_images(): def test_iou_wrong_input(): - with pytest.raises(TypeError, match="Argument cm should be instance of ConfusionMatrix"): IoU(None) @@ -202,7 +201,6 @@ def test_iou_wrong_input(): @pytest.mark.parametrize("average", [None, "samples"]) def test_iou(average): - y_true, y_pred = get_y_true_y_pred() th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred) @@ -241,7 +239,6 @@ def test_iou(average): def test_miou(): - y_true, y_pred = get_y_true_y_pred() th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred) @@ -278,7 +275,6 @@ def test_miou(): def test_cm_accuracy(): - y_true, y_pred = get_y_true_y_pred() th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred) @@ -297,7 +293,6 @@ def test_cm_accuracy(): def test_cm_precision(): - y_true, y_pred = np.random.randint(0, 10, size=(1000,)), np.random.randint(0, 10, size=(1000,)) th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred) @@ -328,7 +323,6 @@ def test_cm_precision(): def test_cm_recall(): - y_true, y_pred = np.random.randint(0, 10, size=(1000,)), np.random.randint(0, 10, size=(1000,)) th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred) @@ -393,7 +387,6 @@ def test_cm_with_average(): def test_dice_coefficient_wrong_input(): - with pytest.raises(TypeError, match="Argument cm should be instance of ConfusionMatrix"): DiceCoefficient(None) @@ -412,7 +405,6 @@ def test_dice_coefficient_wrong_input(): def test_dice_coefficient(): - y_true, y_pred = get_y_true_y_pred() th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred) @@ -517,12 +509,10 @@ def _test(metric_device): def _test_distrib_accumulator_device(device): - metric_devices = [torch.device("cpu")] if device.type != "xla": 
metric_devices.append(idist.device()) for metric_device in metric_devices: - cm = ConfusionMatrix(num_classes=3, device=metric_device) assert cm._device == metric_device assert ( @@ -540,7 +530,6 @@ def _test_distrib_accumulator_device(device): @pytest.mark.parametrize("average", [None, "samples"]) def test_jaccard_index(average): - y_true, y_pred = get_y_true_y_pred() th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred) @@ -582,7 +571,6 @@ def test_jaccard_index(average): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_multiclass_images(device) _test_distrib_accumulator_device(device) @@ -591,7 +579,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_multiclass_images(device) _test_distrib_accumulator_device(device) @@ -601,7 +588,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -636,7 +622,6 @@ def test_distrib_xla_nprocs(xmp_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def 
test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_multiclass_images(device) _test_distrib_accumulator_device(device) @@ -646,7 +631,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_multiclass_images(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_epoch_metric.py b/tests/ignite/metrics/test_epoch_metric.py index dc4d6b4cd6c0..d82168266b1d 100644 --- a/tests/ignite/metrics/test_epoch_metric.py +++ b/tests/ignite/metrics/test_epoch_metric.py @@ -8,7 +8,6 @@ def test_epoch_metric_wrong_setup_or_input(): - # Wrong compute function with pytest.raises(TypeError, match=r"Argument compute_fn should be callable."): EpochMetric(12345) @@ -152,7 +151,6 @@ def compute_fn(y_preds, y_targets): def test_distrib_integration(distributed): - device = idist.device() if idist.device().type != "xla" else "cpu" rank = idist.get_rank() torch.manual_seed(40 + rank) diff --git a/tests/ignite/metrics/test_fbeta.py b/tests/ignite/metrics/test_fbeta.py index c6301470d53c..62a793f4f53c 100644 --- a/tests/ignite/metrics/test_fbeta.py +++ b/tests/ignite/metrics/test_fbeta.py @@ -13,7 +13,6 @@ def test_wrong_inputs(): - with pytest.raises(ValueError, match=r"Beta should be a positive integer"): Fbeta(0.0) @@ -50,7 +49,6 @@ def _output_transform(output): ], ) def test_integration(p, r, average, output_transform): - np.random.seed(1) n_iters = 10 @@ -89,7 +87,6 @@ def update_fn(engine, batch): def _test_distrib_integration(device): - rank = idist.get_rank() def _test(p, r, average, n_epochs, metric_device): @@ -149,7 +146,6 @@ def update(engine, 
i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration(device) @@ -157,7 +153,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration(device) @@ -166,7 +161,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -198,7 +192,6 @@ def test_distrib_xla_nprocs(xmp_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration(device) @@ -207,6 +200,5 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_frequency.py 
b/tests/ignite/metrics/test_frequency.py index f1d0d2666bf2..ee053997ebf6 100644 --- a/tests/ignite/metrics/test_frequency.py +++ b/tests/ignite/metrics/test_frequency.py @@ -28,7 +28,6 @@ def test_nondistributed_average(): def _test_frequency_with_engine(workers=None, lower_bound_factor=0.8, upper_bound_factor=1.1, every=1): - if workers is None: workers = idist.get_world_size() @@ -90,7 +89,6 @@ def test_frequency_with_engine_distributed_with_every(distributed_context_single @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_frequency_with_engine, (None, 0.8, 1), np=nproc, do_init=True) diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py index cc369371c10c..19cc68cd45cc 100644 --- a/tests/ignite/metrics/test_loss.py +++ b/tests/ignite/metrics/test_loss.py @@ -26,7 +26,6 @@ def compute(self): pass def update(self, output): - assert output == self.true_output @@ -176,7 +175,6 @@ def _test(metric_device, y_test_1, y_test_2): def _test_distrib_accumulator_device(device, y_test_1): - metric_devices = [torch.device("cpu")] if device.type != "xla": metric_devices.append(idist.device()) @@ -208,7 +206,6 @@ def test_sum_detached(): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) @@ -217,7 +214,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, 
reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) @@ -227,7 +223,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -239,14 +234,12 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): - device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) def _test_distrib_xla_nprocs(index): - device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) @@ -264,7 +257,6 @@ def test_distrib_xla_nprocs(xmp_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2(), tol=1e-6) _test_distrib_accumulator_device(device, y_test_1()) @@ -274,7 +266,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, 
reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) diff --git a/tests/ignite/metrics/test_mean_absolute_error.py b/tests/ignite/metrics/test_mean_absolute_error.py index 6ab3c1b9080c..ab9b0da8810f 100644 --- a/tests/ignite/metrics/test_mean_absolute_error.py +++ b/tests/ignite/metrics/test_mean_absolute_error.py @@ -19,7 +19,6 @@ def test_no_update(): @pytest.fixture(params=[item for item in range(4)]) def test_case(request): - return [ (torch.randint(0, 10, size=(100, 1)), torch.randint(0, 10, size=(100, 1)), 1), (torch.randint(-10, 10, size=(100, 5)), torch.randint(-10, 10, size=(100, 5)), 1), @@ -31,7 +30,6 @@ def test_case(request): @pytest.mark.parametrize("n_times", range(5)) def test_compute(n_times, test_case): - mae = MeanAbsoluteError() y_pred, y, batch_size = test_case @@ -61,7 +59,6 @@ def _test_distrib_integration(device): rank = idist.get_rank() def _test(metric_device): - n_iters = 80 batch_size = 50 torch.manual_seed(12 + rank) @@ -99,7 +96,6 @@ def update(engine, i): def _test_distrib_accumulator_device(device): - metric_devices = [torch.device("cpu")] if device.type != "xla": metric_devices.append(idist.device()) @@ -131,7 +127,6 @@ def test_accumulator_detached(): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -140,7 +135,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not 
idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -150,7 +144,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -162,7 +155,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -172,7 +164,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -182,7 +173,6 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): - device = idist.device() 
_test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_mean_pairwise_distance.py b/tests/ignite/metrics/test_mean_pairwise_distance.py index 5aeeb537609f..0a53f48193ea 100644 --- a/tests/ignite/metrics/test_mean_pairwise_distance.py +++ b/tests/ignite/metrics/test_mean_pairwise_distance.py @@ -19,7 +19,6 @@ def test_zero_sample(): @pytest.fixture(params=[item for item in range(4)]) def test_case(request): - return [ (torch.randint(0, 10, size=(100, 1)), torch.randint(0, 10, size=(100, 1)), 1), (torch.randint(-20, 20, size=(100, 5)), torch.randint(-20, 20, size=(100, 5)), 1), @@ -31,7 +30,6 @@ def test_case(request): @pytest.mark.parametrize("n_times", range(5)) def test_compute(n_times, test_case): - mpd = MeanPairwiseDistance() y_pred, y, batch_size = test_case @@ -52,14 +50,12 @@ def test_compute(n_times, test_case): def _test_distrib_integration(device): - from ignite.engine import Engine rank = idist.get_rank() torch.manual_seed(12 + rank) def _test(metric_device): - n_iters = 100 batch_size = 50 @@ -109,12 +105,10 @@ def update(engine, i): def _test_distrib_accumulator_device(device): - metric_devices = [torch.device("cpu")] if device.type != "xla": metric_devices.append(idist.device()) for metric_device in metric_devices: - mpd = MeanPairwiseDistance(device=metric_device) for dev in [mpd._device, mpd._sum_of_distances.device]: assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" @@ -141,7 +135,6 @@ def test_accumulator_detached(): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -150,7 +143,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): 
@pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -160,7 +152,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -172,7 +163,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -182,7 +172,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py index d1dd17625584..7bf60889436c 100644 --- a/tests/ignite/metrics/test_mean_squared_error.py +++ b/tests/ignite/metrics/test_mean_squared_error.py @@ -30,7 +30,6 @@ def test_case(request): 
@pytest.mark.parametrize("n_times", range(5)) def test_compute(n_times, test_case): - mse = MeanSquaredError() y_pred, y, batch_size = test_case @@ -54,7 +53,6 @@ def test_compute(n_times, test_case): def _test_distrib_integration(device, tol=1e-6): - from ignite.engine import Engine rank = idist.get_rank() @@ -97,12 +95,10 @@ def update(engine, i): def _test_distrib_accumulator_device(device): - metric_devices = [torch.device("cpu")] if device.type != "xla": metric_devices.append(idist.device()) for metric_device in metric_devices: - device = torch.device(device) mse = MeanSquaredError(device=metric_device) @@ -131,7 +127,6 @@ def test_accumulator_detached(): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -140,7 +135,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -150,7 +144,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -162,7 +155,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist 
support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -172,7 +164,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_metric.py b/tests/ignite/metrics/test_metric.py index 10d7a715bdd8..b0ffc1df3a2d 100644 --- a/tests/ignite/metrics/test_metric.py +++ b/tests/ignite/metrics/test_metric.py @@ -1,5 +1,6 @@ import numbers import os +from typing import Dict, List from unittest.mock import MagicMock import numpy as np @@ -10,8 +11,19 @@ import ignite.distributed as idist from ignite.engine import Engine, Events, State -from ignite.metrics import ConfusionMatrix, Precision, Recall -from ignite.metrics.metric import BatchFiltered, BatchWise, EpochWise, Metric, reinit__is_reduced, sync_all_reduce +from ignite.metrics import Accuracy, ConfusionMatrix, Precision, Recall +from ignite.metrics.metric import ( + BatchFiltered, + BatchWise, + EpochWise, + Metric, + reinit__is_reduced, + RunningBatchWise, + RunningEpochWise, + SingleEpochRunningBatchWise, + sync_all_reduce, +) +from ignite.utils import _tree_map class DummyMetric1(Metric): @@ -190,19 +202,19 @@ def compute(self): assert m2_times_2.compute() == 200 # __pow__ - m0_pow_m1 = m0 ** m1 + m0_pow_m1 = m0**m1 m0.update([1, 10, 100]) m1.update([1, 10, 100]) assert m0_pow_m1.compute() == 1 m0.update([2, 20, 200]) m1.update([2, 20, 200]) - 
assert m0_pow_m1.compute() == 2 ** 20 + assert m0_pow_m1.compute() == 2**20 - m2_pow_2 = m2 ** 2 + m2_pow_2 = m2**2 m2.update([1, 10, 100]) assert m2_pow_2.compute() == 10000 - m2_pow_2 = 0.99 ** m2 + m2_pow_2 = 0.99**m2 m2.update([1, 10, 100]) assert m2_pow_2.compute() == 0.3660323412732292 @@ -404,7 +416,6 @@ def test_abstract_class(): def test_pytorch_operators(): def _test(composed_metric, metric_name, compute_true_value_fn): - metrics = { metric_name: composed_metric, } @@ -700,28 +711,30 @@ def _test_creating_on_xla_fails(device): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_sync_all_reduce_decorator(device) _test_invalid_sync_all_reduce(device) _test_compute_with_sync_all_reduce_doesnt_change_attributes(device) + test_state_dict() + test_load_state_dict() + @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_sync_all_reduce_decorator(device) _test_invalid_sync_all_reduce(device) _test_compute_with_sync_all_reduce_doesnt_change_attributes(device) + test_state_dict() + test_load_state_dict() @pytest.mark.distributed @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = "cpu" if not torch.cuda.is_available() else "cuda" nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -734,7 +747,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") 
@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_sync_all_reduce_decorator(device) _test_invalid_sync_all_reduce(device) @@ -745,7 +757,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_sync_all_reduce_decorator(device) _test_invalid_sync_all_reduce(device) @@ -818,7 +829,6 @@ def update(self, output): @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU") def test_completed_on_cuda(): - # Checks https://github.com/pytorch/ignite/issues/1635#issuecomment-863026919 class DummyMetric(Metric): @@ -846,80 +856,133 @@ def test_usage_exception(): m = DummyMetric2() with pytest.raises(TypeError, match=r"Unhandled usage type"): m.attach(engine, "dummy", usage=1) - with pytest.raises(ValueError, match=r"usage should be 'EpochWise.usage_name' or 'BatchWise.usage_name'"): + with pytest.raises( + ValueError, + match=r"usage should be '\(Running\)EpochWise.usage_name' or '\(\(SingleEpoch\)Running\)BatchWise.usage_name'", + ): m.attach(engine, "dummy", usage="fake") -def test_epochwise_usage(): - class MyMetric(Metric): - def __init__(self): - super(MyMetric, self).__init__() - self.value = [] +class DummyAccumulateInListMetric(Metric): + def __init__(self): + super(DummyAccumulateInListMetric, self).__init__() + self.value = [] - def reset(self): - self.value = [] + def reset(self): + self.value = [] - def compute(self): - return self.value + def compute(self): + return self.value - def update(self, output): - self.value.append(output) + 
def update(self, output): + self.value.append(output) - def test(usage): - engine = Engine(lambda e, b: b) - m = MyMetric() +@pytest.mark.parametrize("usage", ["epoch_wise", EpochWise.usage_name, EpochWise()]) +def test_epochwise_usage(usage): + engine = Engine(lambda e, b: b) - m.attach(engine, "ewm", usage=usage) + m = DummyAccumulateInListMetric() - @engine.on(Events.EPOCH_COMPLETED) - def _(): - ewm = engine.state.metrics["ewm"] - assert len(ewm) == 3 - assert ewm == [0, 1, 2] + m.attach(engine, "ewm", usage=usage) - engine.run([0, 1, 2], max_epochs=10) - m.detach(engine, usage=usage) + @engine.on(Events.EPOCH_COMPLETED) + def _(): + ewm = engine.state.metrics["ewm"] + assert len(ewm) == 3 + assert ewm == [0, 1, 2] - test("epoch_wise") - test(EpochWise.usage_name) - test(EpochWise()) + engine.run([0, 1, 2], max_epochs=10) + m.detach(engine, usage=usage) -def test_batchwise_usage(): - class MyMetric(Metric): - def __init__(self): - super(MyMetric, self).__init__() - self.value = [] +class DummyAccumulateMetric(Metric): + def __init__(self): + super(DummyAccumulateMetric, self).__init__() + self.value = 0 - def reset(self): - self.value = [] + def reset(self): + self.value = 0 - def compute(self): - return self.value + def compute(self): + return self.value - def update(self, output): - self.value.append(output) + def update(self, output): + self.value += output + + +@pytest.mark.parametrize("usage", ["running_epoch_wise", RunningEpochWise.usage_name, RunningEpochWise()]) +def test_running_epochwise_usage(usage): + engine = Engine(lambda e, b: e.state.metrics["ewm"]) + + engine.state.metrics["ewm"] = 0 + + @engine.on(Events.EPOCH_STARTED) + def _(): + engine.state.metrics["ewm"] += 1 - def test(usage): - engine = Engine(lambda e, b: b) + m = DummyAccumulateMetric() + m.attach(engine, "rewm", usage=usage) - m = MyMetric() + @engine.on(Events.EPOCH_COMPLETED) + def _(): + assert engine.state.metrics["rewm"] == sum(range(engine.state.epoch + 1)) - m.attach(engine, 
"bwm", usage=usage) + engine.run([0, 1, 2], max_epochs=10) - @engine.on(Events.ITERATION_COMPLETED) - def _(): - bwm = engine.state.metrics["bwm"] - assert len(bwm) == 1 - assert bwm[0] == (engine.state.iteration - 1) % 3 + m.detach(engine, usage=usage) - engine.run([0, 1, 2], max_epochs=10) - m.detach(engine, usage=usage) - test("batch_wise") - test(BatchWise.usage_name) - test(BatchWise()) +@pytest.mark.parametrize("usage", ["batch_wise", BatchWise.usage_name, BatchWise()]) +def test_batchwise_usage(usage): + engine = Engine(lambda e, b: b) + + m = DummyAccumulateInListMetric() + + m.attach(engine, "bwm", usage=usage) + + @engine.on(Events.ITERATION_COMPLETED) + def _(): + bwm = engine.state.metrics["bwm"] + assert len(bwm) == 1 + assert bwm[0] == (engine.state.iteration - 1) % 3 + + engine.run([0, 1, 2], max_epochs=10) + m.detach(engine, usage=usage) + + +@pytest.mark.parametrize("usage", ["running_batch_wise", RunningBatchWise.usage_name, RunningBatchWise()]) +def test_running_batchwise_usage(usage): + engine = Engine(lambda e, b: b) + + m = DummyAccumulateMetric() + m.attach(engine, "rbwm", usage=usage) + + @engine.on(Events.EPOCH_COMPLETED) + def _(): + assert engine.state.metrics["rbwm"] == 6 * engine.state.epoch + + engine.run([0, 1, 2, 3], max_epochs=10) + + m.detach(engine, usage=usage) + + +@pytest.mark.parametrize( + "usage", ["single_epoch_running_batch_wise", SingleEpochRunningBatchWise.usage_name, SingleEpochRunningBatchWise()] +) +def test_single_epoch_running_batchwise_usage(usage): + engine = Engine(lambda e, b: b) + + m = DummyAccumulateMetric() + m.attach(engine, "rbwm", usage=usage) + + @engine.on(Events.EPOCH_COMPLETED) + def _(): + assert engine.state.metrics["rbwm"] == 6 + + engine.run([0, 1, 2, 3], max_epochs=10) + + m.detach(engine, usage=usage) def test_batchfiltered_usage(): @@ -1069,3 +1132,284 @@ def update(self, output): with pytest.raises(ValueError, match=r"Output should have 2 items of the same length"): engine.run([0] * 10) + + 
+class DummyMetric4(Metric): + _state_dict_all_req_keys = ( + "dnumber", + "fnumber", + "tensor", + "tensor2", + "metric", + "metric_dict", + "metric_list", + "initially_none", + ) + + @staticmethod + def gen_expected_state(value): + expected_state = { + "dnumber": value + 1, + "fnumber": value + 2.234, + "tensor": torch.tensor(value + 2.5), + "tensor2": torch.tensor(value + 3.5), + "metric": { + "_num_correct": torch.tensor(value + 3), + "_num_examples": value + 4, + }, + "metric_dict": { + "m1": { + "_num_correct": torch.tensor(value + 5), + "_num_examples": value + 6, + }, + "m2": { + "_numerator": torch.tensor([value + 7, value + 8]), + "_denominator": torch.tensor([value + 9, value + 10]), + "_weight": value, + "_updated": True, + }, + "n": value + 12, + }, + "metric_list": [ + { + "_numerator": torch.tensor([value + 11, value + 12]), + "_denominator": torch.tensor([value + 13, value + 14]), + "_weight": value, + "_updated": True, + }, + { + "_numerator": torch.tensor([value + 15, value + 16]), + "_denominator": torch.tensor([value + 17, value + 18]), + "_weight": value, + "_updated": True, + }, + value + 234, + ], + "initially_none": None, + } + return expected_state + + def __init__(self, value): + super().reset() + + self.expected_state = DummyMetric4.gen_expected_state(value) + + self.dnumber = self.expected_state["dnumber"] + self.fnumber = self.expected_state["fnumber"] + self.tensor = self.expected_state["tensor"] + self.tensor2 = self.expected_state["tensor2"] + + self.metric = Accuracy() + self.metric._num_correct = self.expected_state["metric"]["_num_correct"] + self.metric._num_examples = self.expected_state["metric"]["_num_examples"] + + self.metric_dict: Dict[str, Metric] = { + "m1": Accuracy(), + "m2": Precision(), + "n": self.expected_state["metric_dict"]["n"], + } + self.metric_dict["m1"]._num_correct = self.expected_state["metric_dict"]["m1"]["_num_correct"] + self.metric_dict["m1"]._num_examples = 
self.expected_state["metric_dict"]["m1"]["_num_examples"] + self.metric_dict["m2"]._numerator = self.expected_state["metric_dict"]["m2"]["_numerator"] + self.metric_dict["m2"]._denominator = self.expected_state["metric_dict"]["m2"]["_denominator"] + self.metric_dict["m2"]._weight = self.expected_state["metric_dict"]["m2"]["_weight"] + self.metric_dict["m2"]._updated = self.expected_state["metric_dict"]["m2"]["_updated"] + + self.metric_list: List[Metric] = [ + Recall(), + Precision(), + self.expected_state["metric_list"][2], + ] + self.metric_list[0]._numerator = self.expected_state["metric_list"][0]["_numerator"] + self.metric_list[0]._denominator = self.expected_state["metric_list"][0]["_denominator"] + self.metric_list[0]._weight = self.expected_state["metric_list"][0]["_weight"] + self.metric_list[0]._updated = self.expected_state["metric_list"][0]["_updated"] + + self.metric_list[1]._numerator = self.expected_state["metric_list"][1]["_numerator"] + self.metric_list[1]._denominator = self.expected_state["metric_list"][1]["_denominator"] + self.metric_list[1]._weight = self.expected_state["metric_list"][1]["_weight"] + self.metric_list[1]._updated = self.expected_state["metric_list"][1]["_updated"] + + self.initially_none = None + + def reset(self): + self.dnumber = -1 + self.fnumber = -2.0 + self.tensor = torch.tensor([-3]) + self.tensor2 = 0 + self.metric.reset() + for m in self.metric_dict.values(): + if isinstance(m, Metric): + m.reset() + for m in self.metric_list: + if isinstance(m, Metric): + m.reset() + self.initially_none = None + + def update(self, output): + pass + + def compute(self): + pass + + +def test_wrong_state_dict(): + class WrongMetric(Metric): + _state_dict_all_req_keys = ("object",) + + def __init__(self, value): + super().__init__() + self.object = value + + def reset(self): + pass + + def update(self, output): + pass + + def compute(self): + pass + + metric = WrongMetric(object()) + with pytest.raises(TypeError, match="Found attribute of 
unsupported type. Currently, supported types include"): + metric.state_dict() + + delattr(metric, "object") + with pytest.raises(ValueError, match="Found a value in _state_dict_all_req_keys that is not among"): + metric.state_dict() + + +def test_wrong_load_state_dict(): + metric = DummyMetric4(1) + + with pytest.raises(TypeError, match="Argument state_dict should be a dictionary"): + metric.load_state_dict(123) + + with pytest.raises(ValueError, match="Incorrect state_dict object. Argument state_dict should be a dictionary"): + metric.load_state_dict({"abc": 123}) + + with pytest.raises(ValueError, match="Expected a list of state_dicts of size equal world_size"): + metric.load_state_dict({Metric._Metric__state_dict_key_per_rank: []}) + + +# @pytest.mark.distributed +# @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +# @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") +# def test_distrib_state_dict_metric_in_metric(distributed_context_single_node_nccl): +# class _TestMetric(Metric): +# _state_dict_all_req_keys = ("metric", ) +# def __init__(self): +# self.metric = Accuracy() + +# def reset(self): +# self.metric.reset() + +# def update(self, output): +# self.metric.update(output) + +# def compute(self): +# return self.metric.compute() + +# m = _TestMetric() +# m.update(( +# torch.rand(4, 10), +# torch.randint(0, 10, size=(4, )), +# )) + +# rank = idist.get_rank() + +# import time +# time.sleep(rank * 0.1) + +# print("m: ", m.state_dict()) +# assert False + + +def test_state_dict(): + metric = DummyMetric4(1) + state = metric.state_dict() + + assert isinstance(state, dict) and len(state) == 1 and Metric._Metric__state_dict_key_per_rank in state + + rank = idist.get_rank() + ws = idist.get_world_size() + + list_state_dicts = state[Metric._Metric__state_dict_key_per_rank] + assert len(list_state_dicts) == ws + + state = list_state_dicts[rank] + expected_state = metric.expected_state + assert 
state.keys() == expected_state.keys() + + # Flatten expected state and output state and compare values + output_flatten = [] + expected_flatten = [] + + def get_func(flatten): + def wrapper(x, key): + if isinstance(x, Metric): + flatten.extend([(key, getattr(x, k)) for k in x._state_dict_all_req_keys]) + else: + flatten.append((key, x)) + + return wrapper + + _tree_map(get_func(expected_flatten), expected_state) + _tree_map(get_func(output_flatten), state) + + assert len(output_flatten) == len(expected_flatten) and len(expected_flatten) > 0, ( + expected_flatten, + output_flatten, + ) + + for key_output, key_expected in zip(output_flatten, expected_flatten): + key1, output = key_output + key2, expected = key_expected + assert key1 == key2, (key1, key2) + if isinstance(output, torch.Tensor): + assert isinstance(expected, torch.Tensor) + assert (output == expected).all(), (output, expected) + else: + assert output == expected, (output, expected) + + +def test_load_state_dict(): + metric = DummyMetric4(1) + state = metric.state_dict() + + metric.reset() + metric.initially_none = 1 + metric.load_state_dict(state) + + rank = idist.get_rank() + world_size = idist.get_world_size() + assert len(state[Metric._Metric__state_dict_key_per_rank]) == world_size + expected_state = state[Metric._Metric__state_dict_key_per_rank][rank] + + # Flatten expected state and output state and compare values + output_flatten = [] + expected_flatten = [] + + def get_func(flatten): + def wrapper(x, **kwargs): + if isinstance(x, Metric): + flatten.extend([getattr(x, k) for k in x._state_dict_all_req_keys]) + else: + flatten.append(x) + + return wrapper + + _tree_map(get_func(expected_flatten), expected_state) + _tree_map(get_func(output_flatten), {key: getattr(metric, key) for key in metric._state_dict_all_req_keys}) + + assert len(output_flatten) == len(expected_flatten) and len(expected_flatten) > 0, ( + expected_flatten, + output_flatten, + ) + + for output, expected in zip(output_flatten, 
expected_flatten): + if isinstance(output, torch.Tensor): + assert isinstance(expected, torch.Tensor) + assert (output == expected).all(), (output, expected) + else: + assert output == expected, (output, expected) diff --git a/tests/ignite/metrics/test_metrics_lambda.py b/tests/ignite/metrics/test_metrics_lambda.py index b7a73158c4c3..af142d6ec82a 100644 --- a/tests/ignite/metrics/test_metrics_lambda.py +++ b/tests/ignite/metrics/test_metrics_lambda.py @@ -8,7 +8,7 @@ import ignite.distributed as idist from ignite.engine import Engine -from ignite.metrics import Metric, MetricsLambda, Precision, Recall +from ignite.metrics import Accuracy, Metric, MetricsLambda, Precision, Recall class ListGatherMetric(Metric): @@ -100,7 +100,6 @@ def fn(x, y, z, t): def test_metrics_lambda_update_and_attach_together(): - y_pred = torch.randint(0, 2, size=(15, 10, 4)).float() y = torch.randint(0, 2, size=(15, 10, 4)).long() @@ -114,7 +113,7 @@ def update_fn(engine, batch): recall = Recall(average=False) def Fbeta(r, p, beta): - return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item() + return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item() F1 = MetricsLambda(Fbeta, recall, precision, 1) @@ -138,7 +137,6 @@ def Fbeta(r, p, beta): def test_metrics_lambda_update(): - """ Test if the underlying metrics are updated """ @@ -149,7 +147,7 @@ def test_metrics_lambda_update(): recall = Recall(average=False) def Fbeta(r, p, beta): - return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item() + return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item() F1 = MetricsLambda(Fbeta, recall, precision, 1) @@ -248,7 +246,7 @@ def update_fn(engine, batch): recall = Recall(average=False) def Fbeta(r, p, beta): - return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item() + return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item() F1 = MetricsLambda(Fbeta, recall, precision, 1) @@ -272,9 +270,36 @@ def Fbeta(r, p, beta): assert 
precision_true == approx(precision), f"{precision_true} vs {precision}" assert recall_true == approx(recall), f"{recall_true} vs {recall}" + metric_state = F1.state_dict() + F1.reset() + F1.load_state_dict(metric_state) + f1_value = F1.compute() + assert f1_value == state.metrics["f1"] + + +def test_load_state_dict(): + acc = Accuracy() + error = 1.0 - acc + + acc.update( + ( + torch.randint(0, 2, size=(8,)), + torch.randint(0, 2, size=(8,)), + ) + ) + + e = error.compute() + a = acc.compute() + assert 1.0 - a == e + + metric_state = error.state_dict() + error.reset() + error.load_state_dict(metric_state) + e2 = error.compute() + assert e2 == e -def test_state_metrics(): +def test_state_metrics(): y_pred = torch.randint(0, 2, size=(15, 10, 4)).float() y = torch.randint(0, 2, size=(15, 10, 4)).long() @@ -304,7 +329,6 @@ def data(y_pred, y): def test_state_metrics_ingredients_not_attached(): - y_pred = torch.randint(0, 2, size=(15, 10, 4)).float() y = torch.randint(0, 2, size=(15, 10, 4)).long() @@ -333,7 +357,6 @@ def data(y_pred, y): def test_recursive_attachment(): def _test(composed_metric, metric_name, compute_true_value_fn): - metrics = { metric_name: composed_metric, } @@ -397,7 +420,6 @@ def compute_true_somemetric(y_pred, y): def _test_distrib_integration(device): - rank = idist.get_rank() n_iters = 10 @@ -425,7 +447,7 @@ def update_fn(engine, i): recall = Recall(average=False, device=metric_device) def Fbeta(r, p, beta): - return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item() + return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item() F1 = MetricsLambda(Fbeta, recall, precision, 1) F1.attach(evaluator, "f1") @@ -474,7 +496,7 @@ def update(engine, i): recall = Recall(average=False, device=device) def Fbeta(r, p, beta): - return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item() + return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item() F1 = MetricsLambda(Fbeta, recall, precision, 1) F1.attach(evaluator, 
"f1") @@ -499,7 +521,6 @@ def Fbeta(r, p, beta): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_metrics_on_diff_devices(device) @@ -508,7 +529,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration(device) @@ -517,7 +537,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -529,7 +548,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration(device) @@ -538,7 +556,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() 
_test_distrib_integration(device) _test_distrib_metrics_on_diff_devices(device) diff --git a/tests/ignite/metrics/test_multilabel_confusion_matrix.py b/tests/ignite/metrics/test_multilabel_confusion_matrix.py index 01c959332fb8..64893768cc4d 100644 --- a/tests/ignite/metrics/test_multilabel_confusion_matrix.py +++ b/tests/ignite/metrics/test_multilabel_confusion_matrix.py @@ -190,12 +190,10 @@ def _test(metric_device): def _test_distrib_accumulator_device(device): - metric_devices = [torch.device("cpu")] if device.type != "xla": metric_devices.append(idist.device()) for metric_device in metric_devices: - cm = MultiLabelConfusionMatrix(num_classes=3, device=metric_device) assert cm._device == metric_device assert ( @@ -233,7 +231,6 @@ def test_simple_2D_input(): def test_simple_ND_input(): - num_iters = 5 num_samples = 100 num_classes = 10 @@ -279,7 +276,6 @@ def test_simple_ND_input(): def test_simple_batched(): - num_iters = 5 num_samples = 100 num_classes = 10 diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py index f031ff16b1c7..bde62649e4ef 100644 --- a/tests/ignite/metrics/test_precision.py +++ b/tests/ignite/metrics/test_precision.py @@ -102,7 +102,6 @@ def ignite_average_to_scikit_average(average, data_type: str): @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted"]) def test_binary_input(average): - pr = Precision(average=average) assert pr._updated is False @@ -131,7 +130,6 @@ def _test(y_pred, y, batch_size): ) == pytest.approx(pr_compute) def get_test_cases(): - test_cases = [ # Binary accuracy on input of shape (N, 1) or (N, ) (torch.randint(0, 2, size=(10,)), torch.randint(0, 2, size=(10,)), 1), @@ -224,7 +222,6 @@ def test_multiclass_wrong_inputs(): @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted"]) def test_multiclass_input(average): - pr = Precision(average=average) assert pr._updated is False @@ -255,7 +252,6 @@ def _test(y_pred, y, batch_size): 
assert sk_compute == pytest.approx(pr_compute) def get_test_cases(): - test_cases = [ # Multiclass input data of shape (N, ) and (N, C) (torch.rand(10, 6), torch.randint(0, 6, size=(10,)), 1), @@ -325,7 +321,6 @@ def to_numpy_multilabel(y): @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted", "samples"]) def test_multilabel_input(average): - pr = Precision(average=average, is_multilabel=True) assert pr._updated is False @@ -353,7 +348,6 @@ def _test(y_pred, y, batch_size): assert precision_score(np_y, np_y_pred, average=sk_average_parameter) == pytest.approx(pr_compute) def get_test_cases(): - test_cases = [ # Multilabel input data of shape (N, C) (torch.randint(0, 2, size=(10, 5)), torch.randint(0, 2, size=(10, 5)), 1), @@ -423,212 +417,210 @@ def test_incorrect_y_classes(average): assert pr._updated is False -def test_distrib_integration_multiclass(distributed): - from ignite.engine import Engine +@pytest.mark.usefixtures("distributed") +class TestDistributed: + def test_integration_multiclass(self): + from ignite.engine import Engine - rank = idist.get_rank() - torch.manual_seed(12) + rank = idist.get_rank() + torch.manual_seed(12) - def _test(average, n_epochs, metric_device): - n_iters = 60 - s = 16 - n_classes = 7 + def _test(average, n_epochs, metric_device): + n_iters = 60 + s = 16 + n_classes = 7 - offset = n_iters * s - y_true = torch.randint(0, n_classes, size=(offset * idist.get_world_size(),)).to(device) - y_preds = torch.rand(offset * idist.get_world_size(), n_classes).to(device) + offset = n_iters * s + y_true = torch.randint(0, n_classes, size=(offset * idist.get_world_size(),)).to(device) + y_preds = torch.rand(offset * idist.get_world_size(), n_classes).to(device) - def update(engine, i): - return ( - y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, :], - y_true[i * s + rank * offset : (i + 1) * s + rank * offset], - ) + def update(engine, i): + return ( + y_preds[i * s + rank * offset : (i + 1) * s + rank * 
offset, :], + y_true[i * s + rank * offset : (i + 1) * s + rank * offset], + ) - engine = Engine(update) + engine = Engine(update) - pr = Precision(average=average, device=metric_device) - pr.attach(engine, "pr") - assert pr._updated is False - - data = list(range(n_iters)) - engine.run(data=data, max_epochs=n_epochs) - - assert "pr" in engine.state.metrics - assert pr._updated is True - res = engine.state.metrics["pr"] - if isinstance(res, torch.Tensor): - # Fixes https://github.com/pytorch/ignite/issues/1635#issuecomment-863026919 - assert res.device.type == "cpu" - res = res.cpu().numpy() + pr = Precision(average=average, device=metric_device) + pr.attach(engine, "pr") + assert pr._updated is False - sk_average_parameter = ignite_average_to_scikit_average(average, "multiclass") - true_res = precision_score( - y_true.cpu().numpy(), torch.argmax(y_preds, dim=1).cpu().numpy(), average=sk_average_parameter - ) + data = list(range(n_iters)) + engine.run(data=data, max_epochs=n_epochs) - assert pytest.approx(res) == true_res + assert "pr" in engine.state.metrics + assert pr._updated is True + res = engine.state.metrics["pr"] + if isinstance(res, torch.Tensor): + # Fixes https://github.com/pytorch/ignite/issues/1635#issuecomment-863026919 + assert res.device.type == "cpu" + res = res.cpu().numpy() - metric_devices = [torch.device("cpu")] - device = idist.device() - if device.type != "xla": - metric_devices.append(idist.device()) - for _ in range(2): - for metric_device in metric_devices: - _test(average=False, n_epochs=1, metric_device=metric_device) - _test(average=False, n_epochs=2, metric_device=metric_device) - _test(average="macro", n_epochs=1, metric_device=metric_device) - _test(average="macro", n_epochs=2, metric_device=metric_device) - _test(average="weighted", n_epochs=1, metric_device=metric_device) - _test(average="weighted", n_epochs=2, metric_device=metric_device) - _test(average="micro", n_epochs=1, metric_device=metric_device) - _test(average="micro", 
n_epochs=2, metric_device=metric_device) - - -def test_distrib_integration_multilabel(distributed): - - from ignite.engine import Engine - - rank = idist.get_rank() - torch.manual_seed(12) - - def _test(average, n_epochs, metric_device): - n_iters = 60 - s = 16 - n_classes = 7 - - offset = n_iters * s - y_true = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device) - y_preds = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device) - - def update(engine, i): - return ( - y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, ...], - y_true[i * s + rank * offset : (i + 1) * s + rank * offset, ...], + sk_average_parameter = ignite_average_to_scikit_average(average, "multiclass") + true_res = precision_score( + y_true.cpu().numpy(), torch.argmax(y_preds, dim=1).cpu().numpy(), average=sk_average_parameter ) - engine = Engine(update) - - pr = Precision(average=average, is_multilabel=True, device=metric_device) - pr.attach(engine, "pr") - assert pr._updated is False - - data = list(range(n_iters)) - engine.run(data=data, max_epochs=n_epochs) - - assert "pr" in engine.state.metrics - assert pr._updated is True - res = engine.state.metrics["pr"] - res2 = pr.compute() - if isinstance(res, torch.Tensor): - res = res.cpu().numpy() - res2 = res2.cpu().numpy() - assert (res == res2).all() - else: - assert res == res2 - - np_y_preds = to_numpy_multilabel(y_preds) - np_y_true = to_numpy_multilabel(y_true) - assert pr._type == "multilabel" - sk_average_parameter = ignite_average_to_scikit_average(average, "multilabel") - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=UndefinedMetricWarning) - assert precision_score(np_y_true, np_y_preds, average=sk_average_parameter) == pytest.approx(res) - - metric_devices = ["cpu"] - device = idist.device() - if device.type != "xla": - metric_devices.append(idist.device()) - for _ in range(2): - for metric_device in metric_devices: - 
_test(average=False, n_epochs=1, metric_device=metric_device) - _test(average=False, n_epochs=2, metric_device=metric_device) - _test(average="macro", n_epochs=1, metric_device=metric_device) - _test(average="macro", n_epochs=2, metric_device=metric_device) - _test(average="micro", n_epochs=1, metric_device=metric_device) - _test(average="micro", n_epochs=2, metric_device=metric_device) - _test(average="weighted", n_epochs=1, metric_device=metric_device) - _test(average="weighted", n_epochs=2, metric_device=metric_device) - _test(average="samples", n_epochs=1, metric_device=metric_device) - _test(average="samples", n_epochs=2, metric_device=metric_device) - - -def test_distrib_accumulator_device(distributed): - # Binary accuracy on input of shape (N, 1) or (N, ) - - def _test(average, metric_device): - pr = Precision(average=average, device=metric_device) - assert pr._device == metric_device - assert pr._updated is False - # Since the shape of the accumulated amount isn't known before the first update - # call, the internal variables aren't tensors on the right device yet. 
- - y_pred = torch.randint(0, 2, size=(10,)) - y = torch.randint(0, 2, size=(10,)).long() - pr.update((y_pred, y)) - - assert pr._updated is True + assert pytest.approx(res) == true_res + + metric_devices = [torch.device("cpu")] + device = idist.device() + if device.type != "xla": + metric_devices.append(idist.device()) + for _ in range(2): + for metric_device in metric_devices: + _test(average=False, n_epochs=1, metric_device=metric_device) + _test(average=False, n_epochs=2, metric_device=metric_device) + _test(average="macro", n_epochs=1, metric_device=metric_device) + _test(average="macro", n_epochs=2, metric_device=metric_device) + _test(average="weighted", n_epochs=1, metric_device=metric_device) + _test(average="weighted", n_epochs=2, metric_device=metric_device) + _test(average="micro", n_epochs=1, metric_device=metric_device) + _test(average="micro", n_epochs=2, metric_device=metric_device) + + def test_integration_multilabel(self): + from ignite.engine import Engine + + rank = idist.get_rank() + torch.manual_seed(12) + + def _test(average, n_epochs, metric_device): + n_iters = 60 + s = 16 + n_classes = 7 + + offset = n_iters * s + y_true = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device) + y_preds = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device) + + def update(engine, i): + return ( + y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, ...], + y_true[i * s + rank * offset : (i + 1) * s + rank * offset, ...], + ) + + engine = Engine(update) + + pr = Precision(average=average, is_multilabel=True, device=metric_device) + pr.attach(engine, "pr") + assert pr._updated is False + + data = list(range(n_iters)) + engine.run(data=data, max_epochs=n_epochs) + + assert "pr" in engine.state.metrics + assert pr._updated is True + res = engine.state.metrics["pr"] + res2 = pr.compute() + if isinstance(res, torch.Tensor): + res = res.cpu().numpy() + res2 = res2.cpu().numpy() + assert 
(res == res2).all() + else: + assert res == res2 + + np_y_preds = to_numpy_multilabel(y_preds) + np_y_true = to_numpy_multilabel(y_true) + assert pr._type == "multilabel" + sk_average_parameter = ignite_average_to_scikit_average(average, "multilabel") + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UndefinedMetricWarning) + assert precision_score(np_y_true, np_y_preds, average=sk_average_parameter) == pytest.approx(res) + + metric_devices = ["cpu"] + device = idist.device() + if device.type != "xla": + metric_devices.append(idist.device()) + for _ in range(2): + for metric_device in metric_devices: + _test(average=False, n_epochs=1, metric_device=metric_device) + _test(average=False, n_epochs=2, metric_device=metric_device) + _test(average="macro", n_epochs=1, metric_device=metric_device) + _test(average="macro", n_epochs=2, metric_device=metric_device) + _test(average="micro", n_epochs=1, metric_device=metric_device) + _test(average="micro", n_epochs=2, metric_device=metric_device) + _test(average="weighted", n_epochs=1, metric_device=metric_device) + _test(average="weighted", n_epochs=2, metric_device=metric_device) + _test(average="samples", n_epochs=1, metric_device=metric_device) + _test(average="samples", n_epochs=2, metric_device=metric_device) + + def test_accumulator_device(self): + # Binary accuracy on input of shape (N, 1) or (N, ) + + def _test(average, metric_device): + pr = Precision(average=average, device=metric_device) + assert pr._device == metric_device + assert pr._updated is False + # Since the shape of the accumulated amount isn't known before the first update + # call, the internal variables aren't tensors on the right device yet. 
+ + y_pred = torch.randint(0, 2, size=(10,)) + y = torch.randint(0, 2, size=(10,)).long() + pr.update((y_pred, y)) - assert ( - pr._numerator.device == metric_device - ), f"{type(pr._numerator.device)}:{pr._numerator.device} vs {type(metric_device)}:{metric_device}" + assert pr._updated is True - if average != "samples": - # For average='samples', `_denominator` is of type `int` so it has not `device` member. assert ( - pr._denominator.device == metric_device - ), f"{type(pr._denominator.device)}:{pr._denominator.device} vs {type(metric_device)}:{metric_device}" - - if average == "weighted": - assert pr._weight.device == metric_device, f"{type(pr._weight.device)}:{pr._weight.device} vs " - f"{type(metric_device)}:{metric_device}" - - metric_devices = [torch.device("cpu")] - device = idist.device() - if device.type != "xla": - metric_devices.append(idist.device()) - for metric_device in metric_devices: - _test(False, metric_device=metric_device) - _test("macro", metric_device=metric_device) - _test("micro", metric_device=metric_device) - _test("weighted", metric_device=metric_device) - - -def test_distrib_multilabel_accumulator_device(distributed): - # Multiclass input data of shape (N, ) and (N, C) + pr._numerator.device == metric_device + ), f"{type(pr._numerator.device)}:{pr._numerator.device} vs {type(metric_device)}:{metric_device}" + + if average != "samples": + # For average='samples', `_denominator` is of type `int` so it has not `device` member. 
+ assert ( + pr._denominator.device == metric_device + ), f"{type(pr._denominator.device)}:{pr._denominator.device} vs {type(metric_device)}:{metric_device}" + + if average == "weighted": + assert pr._weight.device == metric_device, f"{type(pr._weight.device)}:{pr._weight.device} vs " + f"{type(metric_device)}:{metric_device}" + + metric_devices = [torch.device("cpu")] + device = idist.device() + if device.type != "xla": + metric_devices.append(idist.device()) + for metric_device in metric_devices: + _test(False, metric_device=metric_device) + _test("macro", metric_device=metric_device) + _test("micro", metric_device=metric_device) + _test("weighted", metric_device=metric_device) - def _test(average, metric_device): - pr = Precision(is_multilabel=True, average=average, device=metric_device) + def test_multilabel_accumulator_device(self): + # Multiclass input data of shape (N, ) and (N, C) - assert pr._updated is False - assert pr._device == metric_device + def _test(average, metric_device): + pr = Precision(is_multilabel=True, average=average, device=metric_device) - y_pred = torch.randint(0, 2, size=(10, 4, 20, 23)) - y = torch.randint(0, 2, size=(10, 4, 20, 23)).long() - pr.update((y_pred, y)) + assert pr._updated is False + assert pr._device == metric_device - assert pr._updated is True + y_pred = torch.randint(0, 2, size=(10, 4, 20, 23)) + y = torch.randint(0, 2, size=(10, 4, 20, 23)).long() + pr.update((y_pred, y)) - assert ( - pr._numerator.device == metric_device - ), f"{type(pr._numerator.device)}:{pr._numerator.device} vs {type(metric_device)}:{metric_device}" + assert pr._updated is True - if average != "samples": - # For average='samples', `_denominator` is of type `int` so it has not `device` member. 
assert ( - pr._denominator.device == metric_device - ), f"{type(pr._denominator.device)}:{pr._denominator.device} vs {type(metric_device)}:{metric_device}" - - if average == "weighted": - assert pr._weight.device == metric_device, f"{type(pr._weight.device)}:{pr._weight.device} vs " - f"{type(metric_device)}:{metric_device}" - - metric_devices = [torch.device("cpu")] - device = idist.device() - if device.type != "xla": - metric_devices.append(idist.device()) - for metric_device in metric_devices: - _test(False, metric_device=metric_device) - _test("macro", metric_device=metric_device) - _test("micro", metric_device=metric_device) - _test("weighted", metric_device=metric_device) - _test("samples", metric_device=metric_device) + pr._numerator.device == metric_device + ), f"{type(pr._numerator.device)}:{pr._numerator.device} vs {type(metric_device)}:{metric_device}" + + if average != "samples": + # For average='samples', `_denominator` is of type `int` so it has not `device` member. + assert ( + pr._denominator.device == metric_device + ), f"{type(pr._denominator.device)}:{pr._denominator.device} vs {type(metric_device)}:{metric_device}" + + if average == "weighted": + assert pr._weight.device == metric_device, f"{type(pr._weight.device)}:{pr._weight.device} vs " + f"{type(metric_device)}:{metric_device}" + + metric_devices = [torch.device("cpu")] + device = idist.device() + if device.type != "xla": + metric_devices.append(idist.device()) + for metric_device in metric_devices: + _test(False, metric_device=metric_device) + _test("macro", metric_device=metric_device) + _test("micro", metric_device=metric_device) + _test("weighted", metric_device=metric_device) + _test("samples", metric_device=metric_device) diff --git a/tests/ignite/metrics/test_psnr.py b/tests/ignite/metrics/test_psnr.py index ec85111da3e9..1bd06e3e2cb7 100644 --- a/tests/ignite/metrics/test_psnr.py +++ b/tests/ignite/metrics/test_psnr.py @@ -112,119 +112,117 @@ def update(engine, i): assert 
np.allclose(result, np_psnr / np_y.shape[0], atol=atol) -def test_distrib_input_float(distributed): - device = idist.device() - - def get_test_cases(): - - y_pred = torch.rand(n_iters * batch_size, 2, 2, device=device) - y = y_pred * 0.65 - - return y_pred, y - - n_iters = 100 - batch_size = 10 - - rank = idist.get_rank() - for i in range(3): - # check multiple random inputs as random exact occurencies are rare - torch.manual_seed(42 + rank + i) - y_pred, y = get_test_cases() - _test(y_pred, y, 1, "cpu", n_iters, batch_size, atol=1e-8) - if device.type != "xla": - _test(y_pred, y, 1, idist.device(), n_iters, batch_size, atol=1e-8) - - -def test_distrib_multilabel_input_YCbCr(distributed): - device = idist.device() - - def get_test_cases(): - - y_pred = torch.randint(16, 236, (n_iters * batch_size, 1, 12, 12), dtype=torch.uint8, device=device) - cbcr_pred = torch.randint(16, 241, (n_iters * batch_size, 2, 12, 12), dtype=torch.uint8, device=device) - y = torch.randint(16, 236, (n_iters * batch_size, 1, 12, 12), dtype=torch.uint8, device=device) - cbcr = torch.randint(16, 241, (n_iters * batch_size, 2, 12, 12), dtype=torch.uint8, device=device) - - y_pred, y = torch.cat((y_pred, cbcr_pred), dim=1), torch.cat((y, cbcr), dim=1) - - return y_pred, y - - n_iters = 100 - batch_size = 10 - - def out_fn(x): - return x[0][:, 0, ...], x[1][:, 0, ...] 
- - rank = idist.get_rank() - for i in range(3): - # check multiple random inputs as random exact occurencies are rare - torch.manual_seed(42 + rank + i) - y_pred, y = get_test_cases() - _test(y_pred, y, 220, "cpu", n_iters, batch_size, atol=1e-8, output_transform=out_fn, compute_y_channel=True) - if device.type != "xla": - dev = idist.device() - _test(y_pred, y, 220, dev, n_iters, batch_size, atol=1e-8, output_transform=out_fn, compute_y_channel=True) - - -def test_distrib_multilabel_input_uint8(distributed): - device = idist.device() - - def get_test_cases(): - - y_pred = torch.randint(0, 256, (n_iters * batch_size, 3, 16, 16), device=device, dtype=torch.uint8) - y = (y_pred * 0.65).to(torch.uint8) - - return y_pred, y - - n_iters = 100 - batch_size = 10 - - rank = idist.get_rank() - for i in range(3): - # check multiple random inputs as random exact occurencies are rare - torch.manual_seed(42 + rank + i) - y_pred, y = get_test_cases() - _test(y_pred, y, 100, "cpu", n_iters, batch_size, atol=1e-8) - if device.type != "xla": - _test(y_pred, y, 100, idist.device(), n_iters, batch_size, atol=1e-8) - - -def test_distrib_multilabel_input_NHW(distributed): - device = idist.device() - - def get_test_cases(): - - y_pred = torch.rand(n_iters * batch_size, 28, 28, device=device) - y = y_pred * 0.8 - - return y_pred, y - - n_iters = 100 - batch_size = 10 - - rank = idist.get_rank() - for i in range(3): - # check multiple random inputs as random exact occurencies are rare - torch.manual_seed(42 + rank + i) - y_pred, y = get_test_cases() - _test(y_pred, y, 10, "cpu", n_iters, batch_size, atol=1e-8) - if device.type != "xla": - _test(y_pred, y, 10, idist.device(), n_iters, batch_size, atol=1e-8) - - -def test_distrib_accumulator_device(distributed): - device = idist.device() - metric_devices = [torch.device("cpu")] - if torch.device(device).type != "xla": - metric_devices.append(idist.device()) - - for metric_device in metric_devices: - psnr = PSNR(data_range=1.0, 
device=metric_device) - dev = psnr._device - assert dev == metric_device, f"{dev} vs {metric_device}" - - y_pred = torch.rand(2, 3, 28, 28, dtype=torch.float, device=device) - y = y_pred * 0.65 - psnr.update((y_pred, y)) - dev = psnr._sum_of_batchwise_psnr.device - assert dev == metric_device, f"{dev} vs {metric_device}" +@pytest.mark.usefixtures("distributed") +class TestDistributed: + def test_input_float(self): + device = idist.device() + + def get_test_cases(): + y_pred = torch.rand(n_iters * batch_size, 2, 2, device=device) + y = y_pred * 0.65 + + return y_pred, y + + n_iters = 100 + batch_size = 10 + + rank = idist.get_rank() + for i in range(3): + # check multiple random inputs as random exact occurencies are rare + torch.manual_seed(42 + rank + i) + y_pred, y = get_test_cases() + _test(y_pred, y, 1, "cpu", n_iters, batch_size, atol=1e-8) + if device.type != "xla": + _test(y_pred, y, 1, idist.device(), n_iters, batch_size, atol=1e-8) + + def test_multilabel_input_YCbCr(self): + device = idist.device() + + def get_test_cases(): + y_pred = torch.randint(16, 236, (n_iters * batch_size, 1, 12, 12), dtype=torch.uint8, device=device) + cbcr_pred = torch.randint(16, 241, (n_iters * batch_size, 2, 12, 12), dtype=torch.uint8, device=device) + y = torch.randint(16, 236, (n_iters * batch_size, 1, 12, 12), dtype=torch.uint8, device=device) + cbcr = torch.randint(16, 241, (n_iters * batch_size, 2, 12, 12), dtype=torch.uint8, device=device) + + y_pred, y = torch.cat((y_pred, cbcr_pred), dim=1), torch.cat((y, cbcr), dim=1) + + return y_pred, y + + n_iters = 100 + batch_size = 10 + + def out_fn(x): + return x[0][:, 0, ...], x[1][:, 0, ...] 
+ + rank = idist.get_rank() + for i in range(3): + # check multiple random inputs as random exact occurencies are rare + torch.manual_seed(42 + rank + i) + y_pred, y = get_test_cases() + _test( + y_pred, y, 220, "cpu", n_iters, batch_size, atol=1e-8, output_transform=out_fn, compute_y_channel=True + ) + if device.type != "xla": + dev = idist.device() + _test( + y_pred, y, 220, dev, n_iters, batch_size, atol=1e-8, output_transform=out_fn, compute_y_channel=True + ) + + def test_multilabel_input_uint8(self): + device = idist.device() + + def get_test_cases(): + y_pred = torch.randint(0, 256, (n_iters * batch_size, 3, 16, 16), device=device, dtype=torch.uint8) + y = (y_pred * 0.65).to(torch.uint8) + + return y_pred, y + + n_iters = 100 + batch_size = 10 + + rank = idist.get_rank() + for i in range(3): + # check multiple random inputs as random exact occurencies are rare + torch.manual_seed(42 + rank + i) + y_pred, y = get_test_cases() + _test(y_pred, y, 100, "cpu", n_iters, batch_size, atol=1e-8) + if device.type != "xla": + _test(y_pred, y, 100, idist.device(), n_iters, batch_size, atol=1e-8) + + def test_multilabel_input_NHW(self): + device = idist.device() + + def get_test_cases(): + y_pred = torch.rand(n_iters * batch_size, 28, 28, device=device) + y = y_pred * 0.8 + + return y_pred, y + + n_iters = 100 + batch_size = 10 + + rank = idist.get_rank() + for i in range(3): + # check multiple random inputs as random exact occurencies are rare + torch.manual_seed(42 + rank + i) + y_pred, y = get_test_cases() + _test(y_pred, y, 10, "cpu", n_iters, batch_size, atol=1e-8) + if device.type != "xla": + _test(y_pred, y, 10, idist.device(), n_iters, batch_size, atol=1e-8) + + def test_accumulator_device(self): + device = idist.device() + metric_devices = [torch.device("cpu")] + if torch.device(device).type != "xla": + metric_devices.append(idist.device()) + + for metric_device in metric_devices: + psnr = PSNR(data_range=1.0, device=metric_device) + dev = psnr._device + assert 
dev == metric_device, f"{dev} vs {metric_device}" + + y_pred = torch.rand(2, 3, 28, 28, dtype=torch.float, device=device) + y = y_pred * 0.65 + psnr.update((y_pred, y)) + dev = psnr._sum_of_batchwise_psnr.device + assert dev == metric_device, f"{dev} vs {metric_device}" diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py index 8aae0df95ec7..de6717d00760 100644 --- a/tests/ignite/metrics/test_recall.py +++ b/tests/ignite/metrics/test_recall.py @@ -28,7 +28,6 @@ def test_no_update(): def test_average_parameter(): - re = Recall(average="samples") with pytest.raises( ValueError, match=r"Argument average='samples' is incompatible with binary and multiclass input data." @@ -107,7 +106,6 @@ def ignite_average_to_scikit_average(average, data_type: str): @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted"]) def test_binary_input(average): - re = Recall(average=average) assert re._updated is False @@ -134,7 +132,6 @@ def _test(y_pred, y, batch_size): assert recall_score(np_y, np_y_pred, average=sk_average_parameter, labels=[0, 1]) == pytest.approx(re_compute) def get_test_cases(): - test_cases = [ # Binary accuracy on input of shape (N, 1) or (N, ) (torch.randint(0, 2, size=(10,)), torch.randint(0, 2, size=(10,)), 1), @@ -227,7 +224,6 @@ def test_multiclass_wrong_inputs(): @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted"]) def test_multiclass_input(average): - re = Recall(average=average) assert re._updated is False @@ -258,7 +254,6 @@ def _test(y_pred, y, batch_size): assert sk_compute == pytest.approx(re_compute) def get_test_cases(): - test_cases = [ # Multiclass input data of shape (N, ) and (N, C) (torch.rand(10, 6), torch.randint(0, 6, size=(10,)), 1), @@ -328,7 +323,6 @@ def to_numpy_multilabel(y): @pytest.mark.parametrize("average", [None, False, "macro", "micro", "samples"]) def test_multilabel_input(average): - re = Recall(average=average, is_multilabel=True) assert 
re._updated is False @@ -356,7 +350,6 @@ def _test(y_pred, y, batch_size): assert recall_score(np_y, np_y_pred, average=sk_average_parameter) == pytest.approx(re_compute) def get_test_cases(): - test_cases = [ # Multilabel input data of shape (N, C) (torch.randint(0, 2, size=(10, 5)), torch.randint(0, 2, size=(10, 5)), 1), @@ -427,7 +420,6 @@ def test_incorrect_y_classes(average): def _test_distrib_integration_multiclass(device): - from ignite.engine import Engine def _test(average, n_epochs, metric_device): @@ -489,7 +481,6 @@ def update(engine, i): def _test_distrib_integration_multilabel(device): - from ignite.engine import Engine torch.manual_seed(12) @@ -642,7 +633,6 @@ def _test(average, metric_device): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) @@ -653,7 +643,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) @@ -665,7 +654,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -679,7 +667,6 @@ def test_distrib_hvd(gloo_hvd_executor): 
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) @@ -691,7 +678,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) diff --git a/tests/ignite/metrics/test_root_mean_squared_error.py b/tests/ignite/metrics/test_root_mean_squared_error.py index ed6cfa5bc5bc..ebdd84aa08d4 100644 --- a/tests/ignite/metrics/test_root_mean_squared_error.py +++ b/tests/ignite/metrics/test_root_mean_squared_error.py @@ -30,7 +30,6 @@ def test_data(request): @pytest.mark.parametrize("n_times", range(3)) def test_compute(n_times, test_data): - rmse = RootMeanSquaredError() y_pred, y, batch_size = test_data @@ -54,7 +53,6 @@ def test_compute(n_times, test_data): def _test_distrib_integration(device, tol=1e-6): - from ignite.engine import Engine rank = idist.get_rank() @@ -98,7 +96,6 @@ def update(engine, i): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration(device) @@ -106,7 +103,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not 
idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration(device) @@ -115,7 +111,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -126,7 +121,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration(device) @@ -135,7 +129,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_running_average.py b/tests/ignite/metrics/test_running_average.py index 9d034c1c781b..dc434bda636d 100644 --- a/tests/ignite/metrics/test_running_average.py +++ b/tests/ignite/metrics/test_running_average.py @@ -1,5 +1,6 @@ -import os +import warnings from functools import partial +from itertools import accumulate import numpy as np import pytest @@ -8,6 +9,7 @@ import ignite.distributed as idist from ignite.engine import Engine, Events from 
ignite.metrics import Accuracy, RunningAverage +from ignite.metrics.metric import RunningBatchWise, RunningEpochWise, SingleEpochRunningBatchWise def test_wrong_input_args(): @@ -26,171 +28,156 @@ def test_wrong_input_args(): with pytest.raises(ValueError, match=r"Argument device should be None if src is a Metric"): RunningAverage(Accuracy(), device="cpu") + with pytest.warns(UserWarning, match=r"`epoch_bound` is deprecated and will be removed in the future."): + m = RunningAverage(Accuracy(), epoch_bound=True) -def test_integration(): - n_iters = 100 +@pytest.mark.filterwarnings("ignore") +@pytest.mark.parametrize("epoch_bound, usage", [(False, RunningBatchWise()), (True, SingleEpochRunningBatchWise())]) +def test_epoch_bound(epoch_bound, usage): + with warnings.catch_warnings(): + metric = RunningAverage(output_transform=lambda _: _, epoch_bound=epoch_bound) + e1 = Engine(lambda _, __: None) + e2 = Engine(lambda _, __: None) + metric.attach(e1, "") + metric.epoch_bound = None + metric.attach(e2, "", usage) + e1._event_handlers == e2._event_handlers + + +@pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise()]) +def test_integration_batchwise(usage): + torch.manual_seed(10) + alpha = 0.98 + n_iters = 10 batch_size = 10 n_classes = 10 - y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size))) - y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes)) - loss_values = iter(range(n_iters)) + max_epochs = 3 + data = list(range(n_iters)) + loss = torch.arange(n_iters, dtype=torch.float) + y_true = torch.randint(0, n_classes, size=(n_iters, batch_size)) + y_pred = torch.rand(n_iters, batch_size, n_classes) + + accuracy_running_averages = torch.tensor( + list( + accumulate( + map( + lambda y_yp: torch.sum(y_yp[1].argmax(dim=-1) == y_yp[0]).item() / y_yp[0].size(0), + zip( + y_true if isinstance(usage, SingleEpochRunningBatchWise) else y_true.repeat(max_epochs, 1), + y_pred if isinstance(usage, 
SingleEpochRunningBatchWise) else y_pred.repeat(max_epochs, 1, 1), + ), + ), + lambda ra, acc: ra * alpha + (1 - alpha) * acc, + ) + ) + ) + if isinstance(usage, SingleEpochRunningBatchWise): + accuracy_running_averages = accuracy_running_averages.repeat(max_epochs) + + loss_running_averages = torch.tensor( + list( + accumulate( + loss if isinstance(usage, SingleEpochRunningBatchWise) else loss.repeat(max_epochs), + lambda ra, loss_item: ra * alpha + (1 - alpha) * loss_item, + ) + ) + ) + if isinstance(usage, SingleEpochRunningBatchWise): + loss_running_averages = loss_running_averages.repeat(max_epochs) - def update_fn(engine, batch): - loss_value = next(loss_values) - y_true_batch = next(y_true_batch_values) - y_pred_batch = next(y_pred_batch_values) - return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) + def update_fn(_, i): + loss_value = loss[i] + y_true_batch = y_true[i] + y_pred_batch = y_pred[i] + return loss_value, y_pred_batch, y_true_batch trainer = Engine(update_fn) - alpha = 0.98 acc_metric = RunningAverage(Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha) - acc_metric.attach(trainer, "running_avg_accuracy") + acc_metric.attach(trainer, "running_avg_accuracy", usage) avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha) - avg_output.attach(trainer, "running_avg_output") - - running_avg_acc = [ - None, - ] - - @trainer.on(Events.ITERATION_COMPLETED) - def manual_running_avg_acc(engine): - _, y_pred, y = engine.state.output - indices = torch.max(y_pred, 1)[1] - correct = torch.eq(indices, y).view(-1) - num_correct = torch.sum(correct).item() - num_examples = correct.shape[0] - batch_acc = num_correct * 1.0 / num_examples - if running_avg_acc[0] is None: - running_avg_acc[0] = batch_acc - else: - running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc - engine.state.running_avg_acc = running_avg_acc[0] - - @trainer.on(Events.EPOCH_STARTED) - def 
running_avg_output_init(engine): - engine.state.running_avg_output = None + avg_output.attach(trainer, "running_avg_loss", usage) - @trainer.on(Events.ITERATION_COMPLETED) - def running_avg_output_update(engine): - if engine.state.running_avg_output is None: - engine.state.running_avg_output = engine.state.output[0] - else: - engine.state.running_avg_output = ( - engine.state.running_avg_output * alpha + (1.0 - alpha) * engine.state.output[0] - ) + metric_acc_running_averages = [] + metric_loss_running_averages = [] @trainer.on(Events.ITERATION_COMPLETED) - def assert_equal_running_avg_acc_values(engine): - assert ( - engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"] - ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}" - - @trainer.on(Events.ITERATION_COMPLETED) - def assert_equal_running_avg_output_values(engine): - assert ( - engine.state.running_avg_output == engine.state.metrics["running_avg_output"] - ), f"{engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}" - - np.random.seed(10) - running_avg_acc = [ - None, - ] - n_iters = 10 - batch_size = 10 - n_classes = 10 - data = list(range(n_iters)) - loss_values = iter(range(n_iters)) - y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size))) - y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes)) - trainer.run(data, max_epochs=1) - - running_avg_acc = [ - None, - ] - n_iters = 10 - batch_size = 10 - n_classes = 10 - data = list(range(n_iters)) - loss_values = iter(range(n_iters)) - y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size))) - y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes)) - trainer.run(data, max_epochs=1) - + def _(engine): + metric_acc_running_averages.append(engine.state.metrics["running_avg_accuracy"]) + metric_loss_running_averages.append(engine.state.metrics["running_avg_loss"]) -def 
test_epoch_unbound(): + trainer.run(data, max_epochs=3) + assert (torch.tensor(metric_acc_running_averages) == accuracy_running_averages).all() + assert (torch.tensor(metric_loss_running_averages) == loss_running_averages).all() + + metric_state = acc_metric.state_dict() + saved__value = acc_metric._value + saved_src__num_correct = acc_metric.src._num_correct + saved_src__num_examples = acc_metric.src._num_examples + acc_metric.reset() + acc_metric.load_state_dict(metric_state) + assert acc_metric._value == saved__value + assert acc_metric.src._num_examples == saved_src__num_examples + assert (acc_metric.src._num_correct == saved_src__num_correct).all() + + metric_state = avg_output.state_dict() + saved__value = avg_output._value + assert avg_output.src is None + avg_output.reset() + avg_output.load_state_dict(metric_state) + assert avg_output._value == saved__value + assert avg_output.src is None + + +def test_integration_epochwise(): + torch.manual_seed(10) + alpha = 0.98 n_iters = 10 - n_epochs = 3 batch_size = 10 n_classes = 10 + max_epochs = 3 data = list(range(n_iters)) - loss_values = iter(range(n_epochs * n_iters)) - y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_epochs * n_iters, batch_size))) - y_pred_batch_values = iter(np.random.rand(n_epochs * n_iters, batch_size, n_classes)) + y_true = torch.randint(0, n_classes, size=(n_iters, batch_size)) + y_pred = torch.rand(max_epochs, n_iters, batch_size, n_classes) + + accuracy_running_averages = torch.tensor( + list( + accumulate( + map( + lambda y_pred_epoch: torch.sum(y_pred_epoch.argmax(dim=-1) == y_true).item() / y_true.numel(), + y_pred, + ), + lambda ra, acc: ra * alpha + (1 - alpha) * acc, + ) + ) + ) - def update_fn(engine, batch): - loss_value = next(loss_values) - y_true_batch = next(y_true_batch_values) - y_pred_batch = next(y_pred_batch_values) - return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) + def update_fn(engine, i): + y_true_batch = 
y_true[i] + y_pred_batch = y_pred[engine.state.epoch - 1, i] + return y_pred_batch, y_true_batch trainer = Engine(update_fn) - alpha = 0.98 - - acc_metric = RunningAverage(Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha, epoch_bound=False) - acc_metric.attach(trainer, "running_avg_accuracy") - avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha, epoch_bound=False) - avg_output.attach(trainer, "running_avg_output") + acc_metric = RunningAverage(Accuracy(), alpha=alpha) + acc_metric.attach(trainer, "running_avg_accuracy", RunningEpochWise()) - running_avg_acc = [None] + metric_acc_running_averages = [] - @trainer.on(Events.STARTED) - def running_avg_output_init(engine): - engine.state.running_avg_output = None - - @trainer.on(Events.ITERATION_COMPLETED, running_avg_acc) - def manual_running_avg_acc(engine, running_avg_acc): - _, y_pred, y = engine.state.output - indices = torch.max(y_pred, 1)[1] - correct = torch.eq(indices, y).view(-1) - num_correct = torch.sum(correct).item() - num_examples = correct.shape[0] - batch_acc = num_correct * 1.0 / num_examples - if running_avg_acc[0] is None: - running_avg_acc[0] = batch_acc - else: - running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc - engine.state.running_avg_acc = running_avg_acc[0] - - @trainer.on(Events.ITERATION_COMPLETED) - def running_avg_output_update(engine): - if engine.state.running_avg_output is None: - engine.state.running_avg_output = engine.state.output[0] - else: - engine.state.running_avg_output = ( - engine.state.running_avg_output * alpha + (1.0 - alpha) * engine.state.output[0] - ) - - @trainer.on(Events.ITERATION_COMPLETED) - def assert_equal_running_avg_acc_values(engine): - assert ( - engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"] - ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}" - - @trainer.on(Events.ITERATION_COMPLETED) - def 
assert_equal_running_avg_output_values(engine): - assert ( - engine.state.running_avg_output == engine.state.metrics["running_avg_output"] - ), f"{engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}" + @trainer.on(Events.EPOCH_COMPLETED) + def _(engine): + metric_acc_running_averages.append(engine.state.metrics["running_avg_accuracy"]) trainer.run(data, max_epochs=3) + assert (torch.tensor(metric_acc_running_averages) == accuracy_running_averages).all() + -def test_multiple_attach(): +@pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise(), RunningEpochWise()]) +def test_multiple_attach(usage): n_iters = 100 errD_values = iter(np.random.rand(n_iters)) errG_values = iter(np.random.rand(n_iters)) @@ -214,11 +201,10 @@ def update_fn(engine, batch): monitoring_metrics = ["errD", "errG", "D_x", "D_G_z1", "D_G_z2"] for metric in monitoring_metrics: foo = partial(lambda x, metric: x[metric], metric=metric) - RunningAverage(alpha=alpha, output_transform=foo).attach(trainer, metric) + RunningAverage(alpha=alpha, output_transform=foo).attach(trainer, metric, usage) - @trainer.on(Events.ITERATION_COMPLETED) + @trainer.on(usage.COMPLETED) def check_values(engine): - values = [] for metric in monitoring_metrics: values.append(engine.state.metrics[metric]) @@ -230,8 +216,23 @@ def check_values(engine): trainer.run(data) -def test_output_is_tensor(): +@pytest.mark.filterwarnings("ignore") +@pytest.mark.parametrize("epoch_bound", [True, False, None]) +@pytest.mark.parametrize("src", [Accuracy(), None]) +@pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise(), RunningEpochWise()]) +def test_detach(epoch_bound, src, usage): + with warnings.catch_warnings(): + m = RunningAverage(src, output_transform=(lambda _: _) if src is None else None, epoch_bound=epoch_bound) + e = Engine(lambda _, __: None) + m.attach(e, "m", usage) + for event_handlers in e._event_handlers.values(): + assert 
len(event_handlers) != 0 + m.detach(e, usage) + for event_handlers in e._event_handlers.values(): + assert len(event_handlers) == 0 + +def test_output_is_tensor(): m = RunningAverage(output_transform=lambda x: x) m.update(torch.rand(10, requires_grad=True).mean()) v = m.compute() @@ -249,222 +250,147 @@ def test_output_is_tensor(): assert not v.requires_grad -def _test_distrib_on_output(device): +@pytest.mark.usefixtures("distributed") +class TestDistributed: + @pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise()]) + def test_src_is_output(self, usage): + device = idist.device() + rank = idist.get_rank() + n_iters = 10 + n_epochs = 3 - rank = idist.get_rank() - n_iters = 10 - n_epochs = 3 - batch_size = 10 - - # Data per rank - data = list(range(n_iters)) - k = n_epochs * batch_size * n_iters - all_loss_values = torch.arange(0, k * idist.get_world_size(), dtype=torch.float64).to(device) - loss_values = iter(all_loss_values[k * rank : k * (rank + 1)]) - - def update_fn(engine, batch): - loss_value = next(loss_values) - return loss_value.item() - - trainer = Engine(update_fn) - alpha = 0.98 - - metric_device = idist.device() if torch.device(device).type != "xla" else "cpu" - avg_output = RunningAverage(output_transform=lambda x: x, alpha=alpha, epoch_bound=False, device=metric_device) - avg_output.attach(trainer, "running_avg_output") - - @trainer.on(Events.STARTED) - def running_avg_output_init(engine): - engine.state.running_avg_output = None - - @trainer.on(Events.ITERATION_COMPLETED) - def running_avg_output_update(engine): - i = engine.state.iteration - 1 - o = sum([all_loss_values[i + j * k] for j in range(idist.get_world_size())]).item() - o /= idist.get_world_size() - if engine.state.running_avg_output is None: - engine.state.running_avg_output = o - else: - engine.state.running_avg_output = engine.state.running_avg_output * alpha + (1.0 - alpha) * o - - @trainer.on(Events.ITERATION_COMPLETED) - def 
assert_equal_running_avg_output_values(engine): - it = engine.state.iteration - assert engine.state.running_avg_output == pytest.approx( - engine.state.metrics["running_avg_output"] - ), f"{it}: {engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}" - - trainer.run(data, max_epochs=3) - - -def _test_distrib_on_metric(device): - - rank = idist.get_rank() - n_iters = 10 - n_epochs = 3 - batch_size = 10 - n_classes = 10 - - def _test(metric_device): + # Data per rank data = list(range(n_iters)) - np.random.seed(12) - all_y_true_batch_values = np.random.randint( - 0, n_classes, size=(idist.get_world_size(), n_epochs * n_iters, batch_size) - ) - all_y_pred_batch_values = np.random.rand(idist.get_world_size(), n_epochs * n_iters, batch_size, n_classes) - - y_true_batch_values = iter(all_y_true_batch_values[rank, ...]) - y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...]) + rank_loss_count = n_epochs * n_iters + all_loss_values = torch.arange(0, rank_loss_count * idist.get_world_size(), dtype=torch.float64).to(device) + loss_values = iter(all_loss_values[rank_loss_count * rank : rank_loss_count * (rank + 1)]) def update_fn(engine, batch): - y_true_batch = next(y_true_batch_values) - y_pred_batch = next(y_pred_batch_values) - return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) + loss_value = next(loss_values) + return loss_value.item() trainer = Engine(update_fn) alpha = 0.98 - acc_metric = RunningAverage( - Accuracy(output_transform=lambda x: [x[0], x[1]], device=metric_device), alpha=alpha, epoch_bound=False - ) - acc_metric.attach(trainer, "running_avg_accuracy") + metric_device = device if device.type != "xla" else "cpu" + avg_output = RunningAverage(output_transform=lambda x: x, alpha=alpha, device=metric_device) + avg_output.attach(trainer, "running_avg_output", usage) - running_avg_acc = [ - None, - ] - true_acc_metric = Accuracy(device=metric_device) + @trainer.on(usage.STARTED) + def 
reset_running_avg_output(engine): + engine.state.running_avg_output = None - @trainer.on(Events.ITERATION_COMPLETED) - def manual_running_avg_acc(engine): + @trainer.on(usage.ITERATION_COMPLETED) + def running_avg_output_update(engine): i = engine.state.iteration - 1 - - true_acc_metric.reset() - for j in range(idist.get_world_size()): - output = ( - torch.from_numpy(all_y_pred_batch_values[j, i, :, :]), - torch.from_numpy(all_y_true_batch_values[j, i, :]), - ) - true_acc_metric.update(output) - - batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples - - if running_avg_acc[0] is None: - running_avg_acc[0] = batch_acc + o = sum([all_loss_values[i + r * rank_loss_count] for r in range(idist.get_world_size())]).item() + o /= idist.get_world_size() + if engine.state.running_avg_output is None: + engine.state.running_avg_output = o else: - running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc - engine.state.running_avg_acc = running_avg_acc[0] + engine.state.running_avg_output = engine.state.running_avg_output * alpha + (1.0 - alpha) * o - @trainer.on(Events.ITERATION_COMPLETED) - def assert_equal_running_avg_acc_values(engine): + @trainer.on(usage.COMPLETED) + def assert_equal_running_avg_output_values(engine): + it = engine.state.iteration assert ( - engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"] - ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}" + engine.state.running_avg_output == engine.state.metrics["running_avg_output"] + ), f"{it}: {engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}" trainer.run(data, max_epochs=3) - _test("cpu") - if device.type != "xla": - _test(idist.device()) - - -def _test_distrib_accumulator_device(device): - - metric_devices = [torch.device("cpu")] - if device.type != "xla": - metric_devices.append(idist.device()) - for metric_device in metric_devices: - - # Don't test the src=Metric case 
because compute() returns a scalar, - # so the metric doesn't accumulate on the device specified - avg = RunningAverage(output_transform=lambda x: x, device=metric_device) - assert avg._device == metric_device - # Value is None until the first update then compute call - - for _ in range(3): - avg.update(torch.tensor(1.0, device=device)) - avg.compute() - - assert ( - avg._value.device == metric_device - ), f"{type(avg._value.device)}:{avg._value.device} vs {type(metric_device)}:{metric_device}" - - -@pytest.mark.distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - - device = idist.device() - _test_distrib_on_output(device) - _test_distrib_on_metric(device) - _test_distrib_accumulator_device(device) - - -@pytest.mark.distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - - device = idist.device() - _test_distrib_on_output(device) - _test_distrib_on_metric(device) - _test_distrib_accumulator_device(device) - - -@pytest.mark.distributed -@pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") -@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") -def test_distrib_hvd(gloo_hvd_executor): - - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") - nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() - - gloo_hvd_executor(_test_distrib_on_output, (device,), np=nproc, do_init=True) - gloo_hvd_executor(_test_distrib_on_metric, (device,), np=nproc, do_init=True) - gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, 
reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - - device = idist.device() - _test_distrib_on_output(device) - _test_distrib_on_metric(device) - _test_distrib_accumulator_device(device) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - - device = idist.device() - _test_distrib_on_output(device) - _test_distrib_on_metric(device) - _test_distrib_accumulator_device(device) - - -@pytest.mark.tpu -@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") -@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") -def test_distrib_single_device_xla(): - device = idist.device() - _test_distrib_on_output(device) - _test_distrib_on_metric(device) - _test_distrib_accumulator_device(device) - - -def _test_distrib_xla_nprocs(index): - device = idist.device() - _test_distrib_on_output(device) - _test_distrib_on_metric(device) - _test_distrib_accumulator_device(device) - - -@pytest.mark.tpu -@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars") -@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") -def test_distrib_xla_nprocs(xmp_executor): - n = int(os.environ["NUM_TPU_WORKERS"]) - xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + @pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise(), RunningEpochWise()]) + def test_src_is_metric(self, usage): + device = idist.device() + rank = idist.get_rank() + n_iters = 10 + n_epochs = 3 + 
batch_size = 10 + n_classes = 10 + + def _test(metric_device): + data = list(range(n_iters)) + np.random.seed(12) + all_y_true_batch_values = np.random.randint( + 0, n_classes, size=(idist.get_world_size(), n_epochs * n_iters, batch_size) + ) + all_y_pred_batch_values = np.random.rand(idist.get_world_size(), n_epochs * n_iters, batch_size, n_classes) + + y_true_batch_values = iter(all_y_true_batch_values[rank, ...]) + y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...]) + + def update_fn(engine, batch): + y_true_batch = next(y_true_batch_values) + y_pred_batch = next(y_pred_batch_values) + return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch) + + trainer = Engine(update_fn) + alpha = 0.98 + + acc_metric = RunningAverage(Accuracy(device=metric_device), alpha=alpha) + acc_metric.attach(trainer, "running_avg_accuracy", usage) + + running_avg_acc = [ + None, + ] + true_acc_metric = Accuracy(device=metric_device) + + @trainer.on(Events.ITERATION_COMPLETED) + def manual_running_avg_acc(engine): + iteration = engine.state.iteration + + if not isinstance(usage, RunningEpochWise) or ((iteration - 1) % n_iters) == 0: + true_acc_metric.reset() + if ((iteration - 1) % n_iters) == 0 and isinstance(usage, SingleEpochRunningBatchWise): + running_avg_acc[0] = None + for j in range(idist.get_world_size()): + output = ( + torch.from_numpy(all_y_pred_batch_values[j, iteration - 1, :, :]), + torch.from_numpy(all_y_true_batch_values[j, iteration - 1, :]), + ) + true_acc_metric.update(output) + + if not isinstance(usage, RunningEpochWise) or (iteration % n_iters) == 0: + batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples + + if running_avg_acc[0] is None: + running_avg_acc[0] = batch_acc + else: + running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc + engine.state.running_avg_acc = running_avg_acc[0] + + @trainer.on(Events.ITERATION_COMPLETED) + def assert_equal_running_avg_acc_values(engine): + 
print(engine.state.iteration) + if not isinstance(usage, RunningEpochWise) or ( + (engine.state.iteration > 1) and ((engine.state.iteration % n_iters) == 1) + ): + assert ( + engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"] + ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}" + + trainer.run(data, max_epochs=3) + + _test("cpu") + if device.type != "xla": + _test(idist.device()) + + def test_accumulator_device(self): + device = idist.device() + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(idist.device()) + for metric_device in metric_devices: + # Don't test the src=Metric case because compute() returns a scalar, + # so the metric doesn't accumulate on the device specified + avg = RunningAverage(output_transform=lambda x: x, device=metric_device) + assert avg._device == metric_device + # Value is None until the first update then compute call + + for _ in range(3): + avg.update(torch.tensor(1.0, device=device)) + avg.compute() + + assert ( + avg._value.device == metric_device + ), f"{type(avg._value.device)}:{avg._value.device} vs {type(metric_device)}:{metric_device}" diff --git a/tests/ignite/metrics/test_ssim.py b/tests/ignite/metrics/test_ssim.py index 0c27f119a3be..33f6f3aa3d55 100644 --- a/tests/ignite/metrics/test_ssim.py +++ b/tests/ignite/metrics/test_ssim.py @@ -1,3 +1,5 @@ +from typing import Sequence, Union + import numpy as np import pytest import torch @@ -74,16 +76,45 @@ def test_ssim(available_device, shape, kernel_size, gaussian, use_sample_covaria y_pred = torch.rand(shape, device=available_device) y = y_pred * 0.8 + compare_ssim_ignite_skiimg( + y_pred, + y, + available_device, + kernel_size=kernel_size, + gaussian=gaussian, + use_sample_covariance=use_sample_covariance, + ) + + +def compare_ssim_ignite_skiimg( + y_pred: torch.Tensor, + y: torch.Tensor, + device: torch.device, + precision: float = 2e-5, # default to float32 expected precision 
+ *, + skimg_y_pred: Union[np.ndarray, None] = None, + skimg_y: Union[np.ndarray, None] = None, + data_range: float = 1.0, + kernel_size: Union[int, Sequence[int]] = 11, + gaussian: bool = True, + use_sample_covariance: bool = False, +): sigma = 1.5 - data_range = 1.0 - ssim = SSIM(data_range=data_range, sigma=sigma, device=available_device) + + ssim = SSIM(data_range=data_range, sigma=sigma, device=device) ssim.update((y_pred, y)) ignite_ssim = ssim.compute() - skimg_pred = y_pred.cpu().numpy() - skimg_y = skimg_pred * 0.8 + if y_pred.dtype == torch.bfloat16: + y_pred = y_pred.to(dtype=torch.float16) + + if skimg_y_pred is None: + skimg_y_pred = y_pred.cpu().numpy() + if skimg_y is None: + skimg_y = skimg_y_pred * 0.8 + skimg_ssim = ski_ssim( - skimg_pred, + skimg_y_pred, skimg_y, win_size=kernel_size, sigma=sigma, @@ -94,7 +125,44 @@ def test_ssim(available_device, shape, kernel_size, gaussian, use_sample_covaria ) assert isinstance(ignite_ssim, float) - assert np.allclose(ignite_ssim, skimg_ssim, atol=7e-5) + assert np.allclose(ignite_ssim, skimg_ssim, atol=precision) + + +@pytest.mark.parametrize( + "metric_device, y_pred_device", + [ + [torch.device("cpu"), torch.device("cpu")], + [torch.device("cpu"), torch.device("cuda")], + [torch.device("cuda"), torch.device("cpu")], + [torch.device("cuda"), torch.device("cuda")], + ], +) +def test_ssim_device(available_device, metric_device, y_pred_device): + if available_device == "cpu": + pytest.skip("This test requires a cuda device.") + + data_range = 1.0 + sigma = 1.5 + shape = (12, 5, 256, 256) + + ssim = SSIM(data_range=data_range, sigma=sigma, device=metric_device) + + y_pred = torch.rand(shape, device=y_pred_device) + y = y_pred * 0.8 + + if metric_device == torch.device("cuda") and y_pred_device == torch.device("cpu"): + with pytest.warns(UserWarning): + ssim.update((y_pred, y)) + else: + ssim.update((y_pred, y)) + + if metric_device == torch.device("cuda") or y_pred_device == torch.device("cuda"): + # A tensor 
will always have the device index set + excepted_device = torch.device("cuda:0") + else: + excepted_device = torch.device("cpu") + + assert ssim._kernel.device == excepted_device def test_ssim_variable_batchsize(available_device): @@ -123,87 +191,152 @@ def test_ssim_variable_batchsize(available_device): assert np.allclose(out, expected) -@pytest.mark.parametrize("metric_device", ["cpu", "process_device"]) -def test_distrib_integration(distributed, metric_device): - from ignite.engine import Engine +def test_ssim_variable_channel(available_device): + y_preds = [ + torch.rand(12, 5, 28, 28, device=available_device), + torch.rand(12, 4, 28, 28, device=available_device), + torch.rand(12, 7, 28, 28, device=available_device), + torch.rand(12, 3, 28, 28, device=available_device), + torch.rand(12, 11, 28, 28, device=available_device), + torch.rand(12, 6, 28, 28, device=available_device), + ] + y_true = [v * 0.8 for v in y_preds] + + for y_pred, y in zip(y_preds, y_true): + compare_ssim_ignite_skiimg(y_pred, y, available_device) + - rank = idist.get_rank() - torch.manual_seed(12 + rank) - n_iters = 100 - batch_size = 10 - device = idist.device() - if metric_device == "process_device": - metric_device = device if device.type != "xla" else "cpu" +@pytest.mark.parametrize( + "dtype, precision", [(torch.bfloat16, 2e-3), (torch.float16, 4e-4), (torch.float32, 2e-5), (torch.float64, 2e-5)] +) +def test_cuda_ssim_dtypes(available_device, dtype, precision): + # Checks https://github.com/pytorch/ignite/pull/3034 + if available_device == "cpu" and dtype in [torch.float16, torch.bfloat16]: + pytest.skip(reason=f"Unsupported dtype {dtype} on CPU device") - y_pred = torch.rand(n_iters * batch_size, 3, 28, 28, dtype=torch.float, device=device) - y = y_pred * 0.65 + shape = (12, 3, 28, 28) - def update(engine, i): - return ( - y_pred[i * batch_size : (i + 1) * batch_size, ...], - y[i * batch_size : (i + 1) * batch_size, ...], - ) + y_pred = torch.rand(shape, device=available_device, 
dtype=dtype) + y = y_pred * 0.8 - engine = Engine(update) - SSIM(data_range=1.0, device=metric_device).attach(engine, "ssim") + compare_ssim_ignite_skiimg(y_pred, y, available_device, precision) - data = list(range(n_iters)) - engine.run(data=data, max_epochs=1) - y_pred = idist.all_gather(y_pred) - y = idist.all_gather(y) +@pytest.mark.parametrize( + "shape, kernel_size, gaussian, use_sample_covariance", + [[(8, 3, 224, 224), 7, False, True], [(12, 3, 28, 28), 11, True, False]], +) +def test_ssim_uint8(available_device, shape, kernel_size, gaussian, use_sample_covariance): + y_pred = torch.randint(0, 255, shape, device=available_device, dtype=torch.uint8) + y = (y_pred * 0.8).to(dtype=torch.uint8) - assert "ssim" in engine.state.metrics - res = engine.state.metrics["ssim"] + sigma = 1.5 + data_range = 255 + ssim = SSIM(data_range=data_range, sigma=sigma, device=available_device) + ssim.update((y_pred, y)) + ignite_ssim = ssim.compute() - np_pred = y_pred.cpu().numpy() - np_true = np_pred * 0.65 - true_res = ski_ssim( - np_pred, - np_true, - win_size=11, - sigma=1.5, + skimg_pred = y_pred.cpu().numpy() + skimg_y = (skimg_pred * 0.8).astype(np.uint8) + skimg_ssim = ski_ssim( + skimg_pred, + skimg_y, + win_size=kernel_size, + sigma=sigma, channel_axis=1, - gaussian_weights=True, - data_range=1.0, - use_sample_covariance=False, + gaussian_weights=gaussian, + data_range=data_range, + use_sample_covariance=use_sample_covariance, ) - tol = 1e-3 if device.type == "xla" else 1e-4 # Isn't better to ask `distributed` about backend info? 
+ assert isinstance(ignite_ssim, float) + assert np.allclose(ignite_ssim, skimg_ssim, atol=1e-5) + + +@pytest.mark.usefixtures("distributed") +class TestDistributed: + @pytest.mark.parametrize("metric_device", ["cpu", "process_device"]) + def test_integration(self, metric_device): + from ignite.engine import Engine + + rank = idist.get_rank() + torch.manual_seed(12 + rank) + n_iters = 100 + batch_size = 10 + device = idist.device() + if metric_device == "process_device": + metric_device = device if device.type != "xla" else "cpu" + + y_pred = torch.rand(n_iters * batch_size, 3, 28, 28, dtype=torch.float, device=device) + y = y_pred * 0.65 + + def update(engine, i): + return ( + y_pred[i * batch_size : (i + 1) * batch_size, ...], + y[i * batch_size : (i + 1) * batch_size, ...], + ) + + engine = Engine(update) + SSIM(data_range=1.0, device=metric_device).attach(engine, "ssim") + + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) + + y_pred = idist.all_gather(y_pred) + y = idist.all_gather(y) + + assert "ssim" in engine.state.metrics + res = engine.state.metrics["ssim"] + + np_pred = y_pred.cpu().numpy() + np_true = np_pred * 0.65 + true_res = ski_ssim( + np_pred, + np_true, + win_size=11, + sigma=1.5, + channel_axis=1, + gaussian_weights=True, + data_range=1.0, + use_sample_covariance=False, + ) - assert pytest.approx(res, abs=tol) == true_res + tol = 1e-3 if device.type == "xla" else 1e-4 # Isn't better to ask `distributed` about backend info? 
- engine = Engine(update) - SSIM(data_range=1.0, gaussian=False, kernel_size=7, device=metric_device).attach(engine, "ssim") + assert pytest.approx(res, abs=tol) == true_res - data = list(range(n_iters)) - engine.run(data=data, max_epochs=1) + engine = Engine(update) + SSIM(data_range=1.0, gaussian=False, kernel_size=7, device=metric_device).attach(engine, "ssim") - assert "ssim" in engine.state.metrics - res = engine.state.metrics["ssim"] + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) - np_pred = y_pred.cpu().numpy() - np_true = np_pred * 0.65 - true_res = ski_ssim(np_pred, np_true, win_size=7, channel_axis=1, gaussian_weights=False, data_range=1.0) + assert "ssim" in engine.state.metrics + res = engine.state.metrics["ssim"] - assert pytest.approx(res, abs=tol) == true_res + np_pred = y_pred.cpu().numpy() + np_true = np_pred * 0.65 + true_res = ski_ssim(np_pred, np_true, win_size=7, channel_axis=1, gaussian_weights=False, data_range=1.0) + assert pytest.approx(res, abs=tol) == true_res -@pytest.mark.parametrize("metric_device", [torch.device("cpu"), "process_device"]) -def test_distrib_accumulator_device(distributed, metric_device): + @pytest.mark.parametrize("metric_device", [torch.device("cpu"), "process_device"]) + def test_accumulator_device(self, metric_device): + device = idist.device() + if metric_device == "process_device": + metric_device = torch.device(device if device.type != "xla" else "cpu") - device = idist.device() - if metric_device == "process_device": - metric_device = torch.device(device if device.type != "xla" else "cpu") + ssim = SSIM(data_range=1.0, device=metric_device) - ssim = SSIM(data_range=1.0, device=metric_device) + assert ssim._kernel is None + assert isinstance(ssim._kernel_2d, torch.Tensor) - for dev in [ssim._device, ssim._kernel.device]: - assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" + for dev in [ssim._device, ssim._kernel_2d.device]: + assert dev == 
metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" - y_pred = torch.rand(2, 3, 28, 28, dtype=torch.float, device=device) - y = y_pred * 0.65 - ssim.update((y_pred, y)) + y_pred = torch.rand(2, 3, 28, 28, dtype=torch.float, device=device) + y = y_pred * 0.65 + ssim.update((y_pred, y)) - dev = ssim._sum_of_ssim.device - assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" + dev = ssim._sum_of_ssim.device + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py index ed34994c6b73..bea4eba418b9 100644 --- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py +++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py @@ -105,12 +105,10 @@ def update(engine, i): def _test_distrib_accumulator_device(device): - metric_devices = [torch.device("cpu")] if device.type != "xla": metric_devices.append(idist.device()) for metric_device in metric_devices: - acc = TopKCategoricalAccuracy(2, device=metric_device) assert acc._device == metric_device assert ( @@ -130,7 +128,6 @@ def _test_distrib_accumulator_device(device): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -139,7 +136,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -149,7 +145,6 @@ def 
test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() @@ -161,7 +156,6 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -171,7 +165,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -181,14 +174,12 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): - device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/test_utils.py b/tests/ignite/test_utils.py index 
93037553d054..c4c65a29d696 100644 --- a/tests/ignite/test_utils.py +++ b/tests/ignite/test_utils.py @@ -100,13 +100,11 @@ def forward(self, x): def test_dist_setup_logger(): - logger = setup_logger("trainer", level=logging.CRITICAL, distributed_rank=1) assert logger.level != logging.CRITICAL def test_setup_logger(capsys, dirname): - trainer = Engine(lambda e, b: None) evaluator = Engine(lambda e, b: None) @@ -118,7 +116,6 @@ def test_setup_logger(capsys, dirname): fp = dirname / "log" def _test(stream): - trainer.logger = setup_logger("trainer", stream=stream, filepath=fp, reset=True) evaluator.logger = setup_logger("evaluator", stream=stream, filepath=fp, reset=True) @@ -158,7 +155,6 @@ def _setup_a_logger_and_dump(name, message): def test_override_setup_logger(capsys): - _setup_a_logger_and_dump(__name__, "test_override_setup_logger") source = capsys.readouterr().err.split("\n") @@ -179,7 +175,6 @@ def test_override_setup_logger(capsys): def test_deprecated(): - # Test on function without docs, @deprecated without reasons @deprecated("0.4.2", "0.6.0") def func_no_docs(): diff --git a/tests/run_code_style.bat b/tests/run_code_style.bat index 8f54943f1c8e..f8ebab0d0a38 100644 --- a/tests/run_code_style.bat +++ b/tests/run_code_style.bat @@ -20,7 +20,7 @@ mypy --config-file mypy.ini goto end :install -pip install --upgrade flake8 "black==21.12b0" "usort==1.0.5" "ufmt==2.0.1" "mypy" +pip install --upgrade flake8 "black==23.9.1" "usort==1.0.7" "ufmt==2.2.0" "mypy" goto end :end diff --git a/tests/run_code_style.sh b/tests/run_code_style.sh index cd28d9c751be..7f6f06546a05 100755 --- a/tests/run_code_style.sh +++ b/tests/run_code_style.sh @@ -10,5 +10,5 @@ elif [ $1 = "fmt" ]; then elif [ $1 = "mypy" ]; then mypy --config-file mypy.ini elif [ $1 = "install" ]; then - pip install --upgrade flake8 "black==21.12b0" "usort==1.0.5" "ufmt==2.0.1" "mypy" + pip install --upgrade flake8 "black==23.9.1" "usort==1.0.7" "ufmt==2.2.0" "mypy" fi diff --git a/tests/run_cpu_tests.sh 
b/tests/run_cpu_tests.sh index 2e4b57c9628c..2297be94219d 100644 --- a/tests/run_cpu_tests.sh +++ b/tests/run_cpu_tests.sh @@ -18,5 +18,5 @@ if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then fi export WORLD_SIZE=2 -CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv +CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" unset WORLD_SIZE diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh index 6a51995cba34..3146443a531d 100644 --- a/tests/run_gpu_tests.sh +++ b/tests/run_gpu_tests.sh @@ -6,27 +6,30 @@ else ngpus=$1 fi -pattern="" -if [ -n "$2" ]; then - pattern="-k $2" +MATCH_TESTS_EXPRESSION=${2:-""} + +if [ -z "$MATCH_TESTS_EXPRESSION" ]; then + cuda_pattern="cuda" +else + cuda_pattern="cuda and $MATCH_TESTS_EXPRESSION" fi set -xeu -pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k 'cuda' +pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k "$cuda_pattern" # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then exit 0 fi -pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed ${pattern} +pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k "$MATCH_TESTS_EXPRESSION" if [ ${ngpus} -gt 1 ]; then export WORLD_SIZE=${ngpus} - pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv + pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" 
unset WORLD_SIZE fi