diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index df787f80659d..000000000000
--- a/.circleci/config.yml
+++ /dev/null
@@ -1,459 +0,0 @@
-version: 2.1
-
-parameters:
-  pytorch_stable_image:
-    type: string
-    # https://hub.docker.com/r/pytorch/pytorch/tags
-    default: "pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime"
-  pytorch_stable_image_devel:
-    type: string
-    # https://hub.docker.com/r/pytorch/pytorch/tags
-    default: "pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel"
-  workingdir:
-    type: string
-    default: "/tmp/ignite"
-  should_build_docker_images:
-    type: boolean
-    default: false
-  should_publish_docker_images:
-    type: boolean
-    default: false
-
-# -------------------------------------------------------------------------------------
-# Environments to run the jobs in
-# -------------------------------------------------------------------------------------
-
-one_gpu: &one_gpu
-  machine:
-    # https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
-    image: ubuntu-2004-cuda-11.4:202110-01 # CUDA v11.4.2, Docker v20.10.7, nvidia-container-toolkit v1.5.1-1
-    docker_layer_caching: true
-  # https://circleci.com/product/features/resource-classes/#linux-vm
-  resource_class: gpu.nvidia.small
-
-one_gpu_windows: &one_gpu_windows
-  machine:
-    resource_class: windows.gpu.nvidia.medium
-    image: windows-server-2019-nvidia:stable
-    shell: bash.exe
-
-two_gpus: &two_gpus
-  machine:
-    # https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
-    image: ubuntu-2004-cuda-11.4:202110-01 # CUDA v11.4.2, Docker v20.10.7, nvidia-container-toolkit v1.5.1-1
-    docker_layer_caching: true
-  # https://circleci.com/product/features/resource-classes/#linux-vm
-  resource_class: gpu.nvidia.medium
-
-# -------------------------------------------------------------------------------------
-# Re-usable commands
-# -------------------------------------------------------------------------------------
-
-install_latest_nvidia: &install_latest_nvidia
-  - run:
-      name: Install latest NVidia-driver and CUDA
-      command: |
-        sudo apt-get purge nvidia* && sudo apt-get autoremove
-        sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-driver-470
-        # Install nvidia-container-runtime
-        sudo apt-get install -y nvidia-container-runtime
-        # Reload driver : https://stackoverflow.com/a/45319156/6309199
-        # lsof | grep nvidia -> kill Xvfb
-        sudo lsof | grep "/usr/bin/Xvfb" | head -1 | awk '{print $2}' | xargs -I {} sudo kill -9 {} || echo "Command 'sudo lsof ...' is failed"
-        # lsmod | grep nvidia
-        sudo rmmod nvidia_uvm && sudo rmmod nvidia_drm && sudo rmmod nvidia_modeset && sudo rmmod nvidia
-        # reload driver
-        nvidia-smi
-
-pull_pytorch_stable_image: &pull_pytorch_stable_image
-  - run:
-      name: Pull PyTorch Stable Image
-      command: |
-        docker pull << pipeline.parameters.pytorch_stable_image >>
-
-pull_pytorch_stable_devel_image: &pull_pytorch_stable_devel_image
-  - run:
-      name: Pull PyTorch Stable Develop Image
-      command: |
-        docker pull << pipeline.parameters.pytorch_stable_image_devel >>
-
-run_pytorch_container: &run_pytorch_container
-  - run:
-      name: Start Pytorch container
-      environment:
-        wd: << pipeline.parameters.workingdir >>
-      command: |
-        docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
-        docker exec -it pthd nvidia-smi
-        docker exec -it pthd ls
-
-run_pytorch_devel_container: &run_pytorch_devel_container
-  - run:
-      name: Start Pytorch dev container
-      environment:
-        wd: << pipeline.parameters.workingdir >>
-      command: |
-        docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image_devel >>
-        docker exec -it pthd nvidia-smi
-        docker exec -it pthd ls
-
-install_dependencies: &install_dependencies
-  - run:
-      name: Install dependencies
-      command: |
-        docker exec -it pthd pip install -r requirements-dev.txt
-        # Commented APEX installation, https://github.com/pytorch/ignite/issues/2299
-        # export install_apex_cmd='pip install -v --disable-pip-version-check --no-cache-dir git+https://github.com/NVIDIA/apex'
-        # export install_git_apex_cmd="apt-get update && apt-get install -y --no-install-recommends git && ${install_apex_cmd}"
-        # docker exec -it pthd /bin/bash -c "$install_git_apex_cmd"
-        export install_ignite_cmd='python setup.py install'
-        docker exec -it pthd /bin/bash -c "$install_ignite_cmd"
-
-# https://github.com/pytorch/ignite/issues/1737
-download_mnist: &download_mnist
-  - run:
-      name: Download MNIST
-      command: |
-        export install_git_cmd="apt-get update && apt-get install -y --no-install-recommends git"
-        docker exec -it pthd /bin/bash -c "$install_git_cmd"
-
-        export tmp_mnist_dir='/tmp/mnist'
-        export tests_mnist_dir='/tmp'
-        export examples_mnist_dir='.'
-        export download_mnist_cmd="git clone https://github.com/pytorch-ignite/download-mnist-github-action.git $tmp_mnist_dir"
-        docker exec -it pthd /bin/bash -c "$download_mnist_cmd"
-        export get_mnist_cmd_tests="python $tmp_mnist_dir/cp.py $tmp_mnist_dir $tests_mnist_dir/MNIST/raw"
-        docker exec -it pthd /bin/bash -c "$get_mnist_cmd_tests"
-        export get_mnist_cmd_examples="python $tmp_mnist_dir/cp.py $tmp_mnist_dir $examples_mnist_dir/MNIST/raw"
-        docker exec -it pthd /bin/bash -c "$get_mnist_cmd_examples"
-
-# -------------------------------------------------------------------------------------
-# Jobs to run
-# -------------------------------------------------------------------------------------
-jobs:
-  one_gpu_tests:
-    <<: *one_gpu
-
-    working_directory: << pipeline.parameters.workingdir >>
-
-    steps:
-      - checkout
-      - run:
-          name: Trigger job if modified
-          command: |
-            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
-      - <<: *pull_pytorch_stable_image
-      - <<: *run_pytorch_container
-      - <<: *install_dependencies
-      - <<: *download_mnist
-      - run:
-          name: Run GPU Unit Tests and Examples
-          command: |
-
-            # pytest on cuda
-            export test_cmd='bash tests/run_gpu_tests.sh'
-            docker exec -it pthd /bin/bash -c "${test_cmd}"
-
-            # MNIST tests
-
-            # 1) mnist.py
-            export minst1_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist.py --epochs=1'
-            docker exec -it pthd /bin/bash -c "$minst1_cmd"
-
-            # 2) mnist_with_visdom.py
-            export visdom_script_cmd='python -c "from visdom.server.build import download_scripts; download_scripts()"'
-            export visdom_cmd='python -m visdom.server'
-            docker exec -d pthd /bin/bash -c "$visdom_script_cmd && $visdom_cmd"
-            export sleep_cmd='sleep 10'
-            export mnist2_cmd='python examples/mnist/mnist_with_visdom.py --epochs=1'
-            docker exec -it pthd /bin/bash -c "$sleep_cmd && $mnist2_cmd"
-
-            # 3.1) mnist_with_tensorboard.py with tbX
-            export mnist3_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_with_tensorboard.py --epochs=1'
-            docker exec -it pthd /bin/bash -c "$mnist3_cmd"
-
-            # uninstall tensorboardX
-            export pip_cmd='pip uninstall -y tensorboardX'
-            docker exec -it pthd /bin/bash -c "$pip_cmd"
-
-            # 3.2) mnist_with_tensorboard.py with native torch tb
-            docker exec -it pthd /bin/bash -c "$mnist3_cmd"
-
-            # 4) mnist_save_resume_engine.py
-            # save
-            export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --crash_iteration 1100'
-            docker exec -it pthd /bin/bash -c "$mnist4_cmd"
-            # resume
-            export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --resume_from=/tmp/mnist_save_resume/checkpoint_1.pt'
-            docker exec -it pthd /bin/bash -c "$mnist4_cmd"
-
-  one_gpu_windows_tests:
-    <<: *one_gpu_windows
-
-    working_directory: << pipeline.parameters.workingdir >>
-
-    steps:
-      - checkout
-      - run:
-          name: Trigger job if modified
-          command: |
-            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
-
-      # - run:
-      #     name: Update CUDA Driver for Windows
-      #     command: |
-      #       curl -O https://raw.githubusercontent.com/pytorch/pytorch/master/.circleci/scripts/windows_cuda_install.sh
-      #       mkdir -p "C:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/MSBuild/Microsoft/VC/v160/BuildCustomizations/"
-      #       JOB_EXECUTOR="windows-with-nvidia-gpu" CUDA_VERSION="11.3" VC_PRODUCT="BuildTools" VC_YEAR="2019" bash ./windows_cuda_install.sh
-      #       bash -c "'/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe'"
-
-      - run:
-          name: Install dependencies
-          command: |
-            conda --version
-            # We have to use cuda 10.2 on Windows:
-            # https://github.com/pytorch/ignite/issues/1843
-            conda install -y pytorch==1.9.1 torchvision cudatoolkit=10.2 -c pytorch
-            pip install -r requirements-dev.txt
-            pip install .
-            python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
-            python -c "import torch; torch.cuda.is_available()"
-
-      - run:
-          # https://github.com/pytorch/ignite/issues/1737
-          name: Download MNIST
-          command: |
-            git clone https://github.com/pytorch-ignite/download-mnist-github-action.git /tmp/mnist
-            python /tmp/mnist/cp.py /tmp/mnist /tmp/MNIST/raw
-
-      - run:
-          name: Run GPU Unit Tests
-          command: |
-            # pytest on cuda
-            SKIP_DISTRIB_TESTS=1 bash tests/run_gpu_tests.sh
-
-  two_gpus_tests:
-    <<: *two_gpus
-
-    working_directory: << pipeline.parameters.workingdir >>
-
-    steps:
-      - checkout
-      - run:
-          name: Trigger job if modified
-          command: |
-            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
-      - <<: *pull_pytorch_stable_image
-      - <<: *run_pytorch_container
-      - <<: *install_dependencies
-      - <<: *download_mnist
-      - run:
-          name: Run 1 Node 2 GPUs Unit Tests
-          command: |
-            export test_cmd='bash tests/run_gpu_tests.sh 2'
-            docker exec -it pthd /bin/bash -c "${test_cmd}"
-
-  two_gpus_check_dist_cifar10_example:
-    <<: *two_gpus
-
-    working_directory: << pipeline.parameters.workingdir >>
-
-    steps:
-      - checkout
-      - run:
-          name: Trigger job if modified
-          command: |
-            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
-      - <<: *pull_pytorch_stable_image
-      - <<: *run_pytorch_container
-      - <<: *install_dependencies
-      - run:
-          name: "Install additional example dependencies"
-          command: |
-            docker exec -it pthd pip install fire
-      - run:
-          name: "Run without backend"
-          command: |
-            export example_path="examples/contrib/cifar10"
-            # initial run
-            export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python ${example_path}/main.py run --checkpoint_every=200"
-            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
-            # resume
-            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
-            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
-
-      - run:
-          name: "Run with NCCL backend using torchrun"
-          command: |
-            export example_path="examples/contrib/cifar10"
-            # initial run
-            export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200"
-            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
-            # resume
-            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
-            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
-
-      - run:
-          name: "Run with NCCL backend using spawn"
-          command: |
-            export example_path="examples/contrib/cifar10"
-            # initial run
-            export stop_cmd="--stop_iteration=500"
-            export test_cmd="CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200"
-            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
-            # resume
-            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
-            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
-
-  two_gpus_hvd_tests:
-    <<: *two_gpus
-
-    working_directory: << pipeline.parameters.workingdir >>
-
-    steps:
-      - checkout
-      - run:
-          name: Trigger job if modified
-          command: |
-            bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
-      - <<: *pull_pytorch_stable_devel_image
-      - <<: *run_pytorch_devel_container
-      - <<: *install_dependencies
-      - <<: *download_mnist
-      - run:
-          name: "Install Horovod with NCCL GPU ops"
-          command: |
-
-            # Following https://github.com/horovod/horovod/blob/master/Dockerfile.test.gpu
-            # and https://github.com/horovod/horovod/issues/1944#issuecomment-628192778
-            docker exec -it pthd /bin/bash -c "apt-get update && apt-get install -y git"
-            docker exec -it pthd /bin/bash -c "git clone --recursive https://github.com/horovod/horovod.git -b v0.23.0 /horovod && cd /horovod && python setup.py sdist"
-            docker exec -it pthd /bin/bash -c "conda install -y cmake nccl=2.11 -c conda-forge"
-            docker exec -it pthd /bin/bash -c 'cd /horovod && HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_PYTORCH=1 pip install -v $(ls /horovod/dist/horovod-*.tar.gz) && ldconfig'
-            docker exec -it pthd horovodrun --check-build
-
-      - run:
-          name: Run 1 Node 2 GPUs Unit Tests
-          command: |
-            export test_cmd='bash tests/run_gpu_tests.sh 2 hvd'
-            docker exec -it pthd /bin/bash -c "${test_cmd}"
-            # no CUDA devices Horovod tests
-            export test_cmd='CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd'
-            docker exec -it pthd /bin/bash -c "${test_cmd}"
-
-      - run:
-          name: "Check CIFAR10 using horovodrun"
-          command: |
-            docker exec -it pthd pip install fire
-            export example_path="examples/contrib/cifar10"
-            # initial run
-            export stop_cmd="--stop_iteration=500"
-            export test_cmd="cd ${example_path} && CI=1 horovodrun -np 2 python -u main.py run --backend=horovod --checkpoint_every=200"
-            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
-            # resume
-            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
-            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
-
-      - run:
-          name: "Check CIFAR10 using spawn"
-          command: |
-            export example_path="examples/contrib/cifar10"
-            # initial run
-            export stop_cmd="--stop_iteration=500"
-            export test_cmd="cd ${example_path} && CI=1 python -u main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200"
-            docker exec -it pthd /bin/bash -c "${test_cmd} ${stop_cmd}"
-            # resume
-            export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt"
-            docker exec -it pthd /bin/bash -c "${test_cmd} --num_epochs=7 ${resume_opt}"
-
-  build_publish_docker_images:
-    # https://circleci.com/docs/2.0/building-docker-images/
-    docker:
-      - image: cimg/python:3.8.8
-
-    # https://circleci.com/docs/2.0/executor-types/#available-docker-resource-classes
-    resource_class: 2xlarge
-
-    working_directory: << pipeline.parameters.workingdir >>
-    steps:
-      - checkout
-      - setup_remote_docker:
-          version: 19.03.14
-          docker_layer_caching: true
-      - run:
-          name: Install deps
-          command: |
-            pip --version
-            pip install docker
-      - run:
-          name: Build all Horovod flavoured PyTorch-Ignite images
-          command: |
-            cd docker
-            export PTH_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"`
-            export HVD_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_hvd_version'))"`
-            bash build.sh hvd hvd-base
-            bash build.sh hvd hvd-vision
-            bash build.sh hvd hvd-nlp
-            bash build.sh hvd hvd-apex
-            bash build.sh hvd hvd-apex-vision
-            bash build.sh hvd hvd-apex-nlp
-
-      - run:
-          name: Build all PyTorch-Ignite images
-          command: |
-            cd docker
-            export PTH_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"`
-            bash build.sh main base
-            bash build.sh main vision
-            bash build.sh main nlp
-            bash build.sh main apex
-            bash build.sh main apex-vision
-            bash build.sh main apex-nlp
-
-      - run:
-          name: Build all MS DeepSpeed flavoured PyTorch-Ignite images
-          command: |
-            cd docker
-            export PTH_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_pytorch_version'))"`
-            export MSDP_VERSION=`python -c "import configparser; cfg=configparser.ConfigParser(); cfg.read('docker.cfg'); print(cfg.get('DEFAULT', 'build_docker_image_msdp_version'))"`
-            bash build.sh msdp msdp-apex
-            bash build.sh msdp msdp-apex-vision
-            bash build.sh msdp msdp-apex-nlp
-
-      - run:
-          name: List built images
-          command: docker images | grep pytorchignite
-
-      - when:
-          condition: << pipeline.parameters.should_publish_docker_images >>
-          steps:
-            - run:
-                name: Push all PyTorch-Ignite Docker images
-                command: |
-                  cd docker
-                  sh ./push_all.sh
-
-# -------------------------------------------------------------------------------------
-# Workflows
-# -------------------------------------------------------------------------------------
-workflows:
-  version: 2
-  gpu_tests:
-    unless: << pipeline.parameters.should_build_docker_images >>
-    jobs:
-      - one_gpu_tests
-      # Disabled windows tests as NVidia driver is too old
-      # > c:\tools\miniconda3\lib\site-packages\torch\cuda\__init__.py:52: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at  ..\c10\cuda\CUDAFunctions.cpp:115.)
-      # > return torch._C._cuda_getDeviceCount() > 0
-      # - one_gpu_windows_tests
-      # Can not run tests on 2 GPUs on Circle-CI
-      # Now, they are running on GHA self-hosted
-      # - two_gpus_tests
-      # - two_gpus_check_dist_cifar10_example
-      # - two_gpus_hvd_tests
-  docker_images:
-    when: << pipeline.parameters.should_build_docker_images >>
-    jobs:
-      - build_publish_docker_images
diff --git a/.circleci/trigger_if_modified.sh b/.circleci/trigger_if_modified.sh
deleted file mode 100644
index 311e00a5991e..000000000000
--- a/.circleci/trigger_if_modified.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/bash
-
-# Script is taken from https://circleci.com/developer/orbs/orb/roopakv/swissknife#commands-run_if_modified
-# Usage: sh trigger_if_modified.sh <pattern> [base-branch]
-# - for example: sh trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
-
-if [ -z "$1" ]; then
-  echo "Pattern should be provided. Usage: sh trigger_if_modified.sh <pattern>"
-  exit 1
-fi
-
-pattern=$1
-
-if [ -z "$2" ]; then
-  base_branch=master
-else
-  base_branch=$2
-fi
-
-echo "- Pattern: ${pattern}"
-echo "- Base branch: ${base_branch}"
-
-if [ -z "$BASH" ]; then
-  echo Bash not installed.
-  exit 1
-fi
-
-git status >/dev/null 2>&1 || { echo >&2 "Not in a git directory or no git"; exit 1; }
-
-circleci-agent >/dev/null 2>&1 || { echo >&2 "No Circle CI agent. These are in all Circle CI containers"; exit 1; }
-
-
-if [ "$CIRCLE_BRANCH" == "master" ]; then
-  echo "Skip checking modified files if on master"
-  exit 0
-fi
-
-FILES_MODIFIED=""
-
-setcommit () {
-  FILES_MODIFIED=$(git diff --name-only origin/${base_branch}..HEAD | grep -i -E ${pattern})
-}
-
-setcommit || true
-
-if [ -z "$FILES_MODIFIED" ]; then
-  echo "Files not modified. Halting job"
-  circleci-agent step halt
-else
-  echo "Files modified: ${FILES_MODIFIED}, continuing steps"
-fi
\ No newline at end of file
diff --git a/.github/pr-labeler-config.yml b/.github/pr-labeler-config.yml
index a02ec1183346..393adb66f756 100644
--- a/.github/pr-labeler-config.yml
+++ b/.github/pr-labeler-config.yml
@@ -6,9 +6,8 @@ docker:
 docs:
   - docs/**/*
 
-# Add 'ci' to any changes within 'circleci' and '.github' folder
+# Add 'ci' to any changes in '.github' folder
 ci:
-  - .circleci/**/*
   - .github/**/*
 
 # Add 'examples' to any changes within 'examples' folder
diff --git a/.github/workflows/binaries-nightly-release.yml b/.github/workflows/binaries-nightly-release.yml
index bd19857c345e..0c1bce941709 100644
--- a/.github/workflows/binaries-nightly-release.yml
+++ b/.github/workflows/binaries-nightly-release.yml
@@ -10,13 +10,13 @@ jobs:
   build-publish:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Setup Miniconda
         uses: conda-incubator/setup-miniconda@v2
         with:
           miniconda-version: "latest"
-          python-version: 3.8
+          python-version: "3.10"
 
       - name: Setup nightly version
         run: |
diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml
index a8dce7e63e24..8627c0ece64f 100644
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -10,7 +10,6 @@ on:
       - "tests/run_code_style.sh"
       - ".github/workflows/code-style.yml"
       - "!assets/**"
-      - "!.circleci/**"
       - "!docker/**"
       - "!docs/**"
       - "!conda.recipe"
@@ -21,7 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - if: github.event_name == 'push'
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: "3.8"
diff --git a/.github/workflows/discord_issues.yml b/.github/workflows/discord_issues.yml
new file mode 100644
index 000000000000..db79dd44acb7
--- /dev/null
+++ b/.github/workflows/discord_issues.yml
@@ -0,0 +1,30 @@
+name: Discuss "help-wanted" issue on Discord
+
+on:
+  issues:
+    types:
+      - labeled
+  workflow_dispatch:
+    inputs:
+      issue_number:
+        description: 'Issue number'
+        required: true
+
+permissions:
+  issues: write
+
+jobs:
+  discord:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Discuss on Discord-Issues"
+        # Manual dispatches carry no label payload, so let them through explicitly;
+        # 'labeled' events only proceed when the label is 'help wanted'.
+        if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'help wanted' }}
+        uses: EndBug/discuss-on-discord@v1.1.0
+        with:
+          discord_bot_token: ${{ secrets.DISCORD_BOT_TOKEN }}
+          destination: ${{ secrets.DISCORD_BOT_DESTINATION }}
+          issue_number: ${{ github.event.inputs.issue_number || github.event.issue.number }}
+          issue_comment: Hey 👋, I've just created a [thread]($THREAD_LINK$) for this issue on [PyTorch-Ignite Discord](https://pytorch-ignite.ai/chat) where you can quickly talk to the community on the topic.
+          discord_message: New issue created in `${{ github.repository }}`:<https://github.com/${{ github.repository }}/issues/${{ github.event.inputs.issue_number || github.event.issue.number }}>
diff --git a/.github/workflows/discord_pull_requests.yaml b/.github/workflows/discord_pull_requests.yaml
new file mode 100644
index 000000000000..121aa581aa48
--- /dev/null
+++ b/.github/workflows/discord_pull_requests.yaml
@@ -0,0 +1,29 @@
+name: Discuss "help-wanted" PR on Discord
+
+on:
+  pull_request:
+    types:
+      - labeled
+  workflow_dispatch:
+    inputs:
+      pull_request_number:
+        description: 'Pull request number'
+        required: true
+
+permissions:
+  pull-requests: write
+
+jobs:
+  discord:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Discuss on Discord-PR (Non-maintainer only)"
+        # Manual dispatches carry no label payload, so let them through explicitly.
+        if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'help wanted' }}
+        uses: EndBug/discuss-on-discord@v1.1.0
+        with:
+          discord_bot_token: ${{ secrets.DISCORD_BOT_TOKEN }}
+          destination: ${{ secrets.DISCORD_BOT_DESTINATION }}
+          issue_number: ${{ github.event.inputs.pull_request_number || github.event.pull_request.number }}
+          issue_comment: Hey 👋, I've just created a [thread]($THREAD_LINK$) for this pull request on [PyTorch-Ignite Discord](https://pytorch-ignite.ai/chat) where you can quickly talk to the community on the topic.
+          discord_message: New PR created in `${{ github.repository }}`:<https://github.com/${{ github.repository }}/pull/${{ github.event.inputs.pull_request_number || github.event.pull_request.number }}>
diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index 6344d49a751a..37bb2333c81c 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -16,7 +16,7 @@ jobs:
       hvd_version: ${{ steps.set-versions.outputs.hvd_version }}
       msdp_version: ${{ steps.set-versions.outputs.msdp_version }}
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Changed Files Exporter
         id: files
         uses: futuratrepadeira/changed-files@v3.3.0
@@ -41,14 +41,14 @@ jobs:
     if: contains(needs.setup.outputs.modified, 'hvd/') || contains(needs.setup.outputs.modified, 'docker.cfg')
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Remove cache
         run: |
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"
       - name: Build hvd hvd-base
         working-directory: docker
         run: |
@@ -79,14 +79,14 @@ jobs:
     if: contains(needs.setup.outputs.modified, 'hvd/') || contains(needs.setup.outputs.modified, 'docker.cfg')
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Remove cache
         run: |
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"
       - name: Build hvd hvd-apex
         working-directory: docker
         run: |
@@ -117,14 +117,14 @@ jobs:
     if: contains(needs.setup.outputs.modified, 'main/') || contains(needs.setup.outputs.modified, 'docker.cfg')
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Remove cache
         run: |
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"
       - name: Build main base
         working-directory: docker
         run: |
@@ -152,14 +152,14 @@ jobs:
     if: contains(needs.setup.outputs.modified, 'main/') || contains(needs.setup.outputs.modified, 'docker.cfg')
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Remove cache
         run: |
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"
       - name: Build main apex
         working-directory: docker
         run: |
@@ -187,14 +187,14 @@ jobs:
     if: contains(needs.setup.outputs.modified, 'msdp/') || contains(needs.setup.outputs.modified, 'docker.cfg')
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Remove cache
         run: |
           sudo rm -rf "/usr/local/share/boost"
           sudo rm -rf "$AGENT_TOOLSDIRECTORY"
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"
       - name: Build msdp msdp-apex
         working-directory: docker
         run: |
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
index e472dbbed1cf..fa5375c0ae29 100644
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -14,17 +14,8 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
-      - name: Trigger Circle-CI pipeline
-        env:
-          CIRCLE_TOKEN: ${{ secrets.CIRCLE_TOKEN }}
-        run: |
-          pip install requests
-
-          if [ $GITHUB_EVENT_NAME == 'pull_request' ]; then should_publish_docker_images=false; else should_publish_docker_images=true; fi
-          branch=$GITHUB_REF
-
-          python -u .github/workflows/trigger_circle_ci.py $should_publish_docker_images $branch
+          python-version: "3.10"
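+      # TODO: build and publish the Docker images from this workflow (the Circle-CI pipeline trigger that did this was removed)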
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index a12c4f72eef0..9fd907b5c158 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -17,10 +17,10 @@ jobs:
     if: (github.ref == 'refs/heads/master' && github.event_name == 'push') || github.event_name == 'release'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"
 
       - run: sudo npm install katex -g
       - uses: actions/cache@v3
@@ -48,10 +48,10 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 10
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"
 
       - uses: actions/cache@v3
         with:
@@ -69,10 +69,10 @@ jobs:
     if: github.event_name == 'pull_request' || github.event_name == 'push'
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"
 
       - run: sudo npm install katex -g
       - uses: actions/cache@v3
diff --git a/.github/workflows/gpu-hvd-tests.yml b/.github/workflows/gpu-hvd-tests.yml
new file mode 100644
index 000000000000..6661f46b501b
--- /dev/null
+++ b/.github/workflows/gpu-hvd-tests.yml
@@ -0,0 +1,198 @@
+name: Run HVD-specific unit tests on GPUs
+on:
+  push:
+    paths:
+      - "ignite/**"
+      - "tests/ignite/**"
+      - "tests/run_gpu_tests.sh"
+      - "tests/run_code_style.sh"
+      - "examples/**.py"
+      - "requirements-dev.txt"
+      - ".github/workflows/gpu-hvd-tests.yml"
+  workflow_dispatch:
+
+concurrency:
+  # <workflow_name>-<branch_name>-<true || commit_sha (if branch is protected)>
+  group: gpu-hvd-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
+  cancel-in-progress: true
+
+# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
+
+jobs:
+  gpu-hvd-tests:
+    strategy:
+      matrix:
+        pytorch-channel: [pytorch, ]
+      fail-fast: false
+    env:
+      DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
+      REPOSITORY: ${{ github.repository }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+    runs-on: linux.8xlarge.nvidia.gpu
+    timeout-minutes: 60
+
+    steps:
+      - name: Clean workspace
+        run: |
+          echo "::group::Cleanup debug output"
+          sudo rm -rfv "${GITHUB_WORKSPACE}"
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (pytorch/test-infra)
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: pytorch/test-infra
+          path: test-infra
+
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
+
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
+        with:
+          docker-image: ${{ env.DOCKER_IMAGE }}
+
+      - name: Checkout repository (${{ github.repository }})
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          path: ${{ github.repository }}
+          fetch-depth: 1
+
+      - name: Start Pytorch container
+        working-directory: ${{ github.repository }}
+        run: |
+          docker run --name pthd --gpus=all --rm \
+            --cap-add=SYS_PTRACE \
+            --detach \
+            --ipc=host \
+            --security-opt seccomp=unconfined \
+            --shm-size=2g \
+            --tty \
+            --ulimit stack=10485760:83886080 \
+            -v $PWD:/work \
+            -w /work \
+            ${DOCKER_IMAGE}
+
+          script=$(cat << EOF
+
+            set -xe
+
+            nvidia-smi
+            ls -alh
+
+            conda --version
+            python --version
+
+          EOF
+          )
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install PyTorch and dependencies
+        continue-on-error: false
+        run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
+          # Install PyTorch
+          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
+            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu121
+          else
+            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121
+          fi
+
+          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
+          pip list
+
+          # Install dependencies
+          pip install -r requirements-dev.txt
+          pip install -e .
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install Horovod with NCCL GPU ops
+        run: |
+          script=$(cat << EOF
+
+          set -xe
+
+          # Can't build Horovod with recent pytorch due to pytorch required C++17 standard
+          # and horovod is still using C++14
+          # HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch]
+          # Using a similar hack as described here: 
+          # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345 
+          git clone --recursive https://github.com/horovod/horovod.git /horovod
+          cd /horovod
+          sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
+          sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" horovod/torch/CMakeLists.txt
+          HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 python setup.py install
+
+          horovodrun --check-build
+          pip list
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Run GPU and CPU Unit HVD Tests
+        run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
+          bash tests/run_gpu_tests.sh 2 hvd
+          CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          file: ${{ github.repository }}/coverage.xml
+          flags: gpu-2
+          fail_ci_if_error: false
+
+      - name: Run examples in container
+        continue-on-error: false
+        run: |
+          script=$(cat << EOF
+
+          set -xe
+
+          # Install additional example dependencies
+          pip install fire
+
+          # Check training on CIFAR10, run with horovod backend using horovodrun
+          # initial run
+          CI=1 horovodrun -np 2 python -u examples/cifar10/main.py run --backend=horovod --checkpoint_every=200 --stop_iteration=500
+          # resume
+          CI=1 horovodrun -np 2 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on CIFAR10 using spawn
+          # initial run
+          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
+          # resume
+          CI=1 python -u examples/cifar10/main.py run --backend=horovod --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-horovod-2_stop-on-500/training_checkpoint_400.pt
+
+          EOF
+          )
+          # Shell variable names are case-sensitive: this must match the heredoc assignment above
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Teardown Linux
+        if: ${{ always() }}
+        uses: ./test-infra/.github/actions/teardown-linux
diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index d5a6e8b0bd57..92345b3baed3 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -16,95 +16,164 @@ concurrency:
   group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
   cancel-in-progress: true
 
+# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
+
 jobs:
   gpu-tests:
-    runs-on: [self-hosted, 2-gpus]
-    timeout-minutes: 45
-    defaults:
-      run:
-        shell: bash
     strategy:
-      max-parallel: 1
-      fail-fast: true
       matrix:
         pytorch-channel: [pytorch, pytorch-nightly]
+      fail-fast: false
     env:
-      AGENT_TOOLSDIRECTORY: /tmp/python
+      DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
+      REPOSITORY: ${{ github.repository }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+    runs-on: linux.8xlarge.nvidia.gpu
+    timeout-minutes: 45
 
     steps:
-      - uses: actions/checkout@v3
-
-      - name: Clean python tool path
+      - name: Clean workspace
         run: |
-          rm -rf ${AGENT_TOOLSDIRECTORY}
+          echo "::group::Cleanup debug output"
+          sudo rm -rfv "${GITHUB_WORKSPACE}"
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (pytorch/test-infra)
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: pytorch/test-infra
+          path: test-infra
+
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
 
-      - uses: actions/setup-python@v4
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
         with:
-          python-version: 3.9
+          docker-image: ${{ env.DOCKER_IMAGE }}
 
-      - name: Install PyTorch
-        # https://pytorch.org/get-started/locally/
-        if: ${{ matrix.pytorch-channel == 'pytorch' }}
+      - name: Checkout repository (${{ github.repository }})
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          path: ${{ github.repository }}
+          fetch-depth: 1
+
+      - name: Start Pytorch container
+        working-directory: ${{ github.repository }}
         run: |
-          pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
-          nvidia-smi
-          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
-          pip list
+          docker run --name pthd --gpus=all --rm \
+            --cap-add=SYS_PTRACE \
+            --detach \
+            --ipc=host \
+            --security-opt seccomp=unconfined \
+            --shm-size=2g \
+            --tty \
+            --ulimit stack=10485760:83886080 \
+            -v $PWD:/work \
+            -w /work \
+            ${DOCKER_IMAGE}
+
+          script=$(cat << EOF
+
+            set -xe
+
+            nvidia-smi
+            ls -alh
+
+            conda --version
+            python --version
 
-      - name: Install PyTorch (nightly)
-        # https://pytorch.org/get-started/locally/
-        if: ${{ matrix.pytorch-channel == 'pytorch-nightly' }}
+          EOF
+          )
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Install PyTorch and dependencies
+        continue-on-error: false
         run: |
-          pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
-          nvidia-smi
+
+          script=$(cat << EOF
+
+          set -xe
+
+          # Install PyTorch
+          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
+            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu121
+          else
+            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121
+          fi
+
           python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
           pip list
 
-      - name: Install dependencies
-        run: |
+          # Install dependencies
           pip install -r requirements-dev.txt
           pip install -e .
 
-      - name: Run 1 Node 2 GPUs Unit Tests
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Run GPU Unit Tests
+        continue-on-error: false
         run: |
+
+          script=$(cat << EOF
+
+          set -xe
+
           bash tests/run_gpu_tests.sh 2
 
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
         with:
-          file: ./coverage.xml
+          file: ${{ github.repository }}/coverage.xml
           flags: gpu-2
           fail_ci_if_error: false
 
-      - name: Install additional example dependencies
-        run: pip install fire
-
-      - name: Check training on cifar10, run without backend
-        run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 python ${example_path}/main.py run --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
-          CI=1 python ${example_path}/main.py run --checkpoint_every=200 --num_epochs=7 ${resume_opt}
-
-      - name: Check training on cifar10, run with NCCL backend using torchrun
+      - name: Run examples in container
+        continue-on-error: false
         run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
-          CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 ${resume_opt}
-
-      - name: Check training on cifar10, run with NCCL backend using spawn
-        run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
-          CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 ${resume_opt}
+          script=$(cat << EOF
+
+          set -xe
+
+          # Install additional example dependencies
+          pip install fire
+
+          # Check training on cifar10, run without backend
+          ## initial run
+          CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on cifar10, run with NCCL backend using torchrun
+          ## initial run
+          CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on cifar10, run with NCCL backend using spawn
+          ## initial run
+          CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
+
+          EOF
+          )
+          # Shell variable names are case-sensitive: this must match the heredoc assignment above
+          docker exec -t pthd /bin/bash -c "${script}"
+
+      - name: Teardown Linux
+        if: ${{ always() }}
+        uses: ./test-infra/.github/actions/teardown-linux
diff --git a/.github/workflows/hvd-tests.yml b/.github/workflows/hvd-tests.yml
index 628ccfce3230..f483d21f38ee 100644
--- a/.github/workflows/hvd-tests.yml
+++ b/.github/workflows/hvd-tests.yml
@@ -32,7 +32,7 @@ jobs:
         pytorch-channel: [pytorch]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Get year & week number
         id: get-date
diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml
index 1dbf6df7786f..1b8672c0a47e 100644
--- a/.github/workflows/pytorch-version-tests.yml
+++ b/.github/workflows/pytorch-version-tests.yml
@@ -10,52 +10,48 @@ on:
 jobs:
   build:
     runs-on: ubuntu-latest
+    timeout-minutes: 45
     strategy:
       max-parallel: 10
       fail-fast: false
       matrix:
         python-version: [3.8, 3.9, "3.10"]
         pytorch-version:
-          [1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.7.1, 1.6.0, 1.5.1, 1.4.0]
-        exclude:
-          - pytorch-version: 1.4.0
-            python-version: 3.9
-          - pytorch-version: 1.4.0
-            python-version: 3.10
-            
+          [2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.7.1, 1.6.0, 1.5.1]
+        exclude:            
           - pytorch-version: 1.5.1
             python-version: 3.9
           - pytorch-version: 1.5.1
-            python-version: 3.10
+            python-version: "3.10"
 
           - pytorch-version: 1.6.0
             python-version: 3.9
           - pytorch-version: 1.6.0
-            python-version: 3.10
+            python-version: "3.10"
 
           # disabling python 3.9 support with PyTorch 1.7.1 and 1.8.1, to stop repeated pytorch-version test fail.
           # https://github.com/pytorch/ignite/issues/2383
           - pytorch-version: 1.7.1
             python-version: 3.9
           - pytorch-version: 1.7.1
-            python-version: 3.10
+            python-version: "3.10"
 
           - pytorch-version: 1.8.1
             python-version: 3.9
           - pytorch-version: 1.8.1
-            python-version: 3.10
+            python-version: "3.10"
 
           - pytorch-version: 1.9.1
-            python-version: 3.10
+            python-version: "3.10"
 
           - pytorch-version: 1.10.0
-            python-version: 3.10
+            python-version: "3.10"
 
           - pytorch-version: 1.11.0
-            python-version: 3.10
+            python-version: "3.10"
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Get year & week number
         id: get-date
@@ -87,22 +83,18 @@ jobs:
 
       - name: Install dependencies
         shell: bash -l {0}
-        if: ${{ matrix.pytorch-version != '1.4.0' }}
         run: |
           conda install pytorch=${{ matrix.pytorch-version }} torchvision cpuonly python=${{ matrix.python-version }} -c pytorch
           pip install -r requirements-dev.txt
           python setup.py install
 
-      # There is no more torchvision 0.5.0 binaries in anaconda pytorch channel:
-      # https://anaconda.org/pytorch/torchvision/files 
-      - name: Install appropriate dependencies for PyTorch 1.4.0
-        shell: bash -l {0}
-        if: ${{ matrix.pytorch-version == '1.4.0' }}
-        run: |         
-          conda install pytorch=${{ matrix.pytorch-version }} cpuonly python=${{ matrix.python-version }} -c pytorch
-          pip install torchvision==0.5.0
-          pip install -r requirements-dev.txt
-          python setup.py install
+          # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern 
+          # which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59
+          bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])")
+          if [ "${bad_pth_version}" == "True" ]; then
+            pip install --upgrade "setuptools<59"
+            python -c "from setuptools import distutils; distutils.version.LooseVersion"
+          fi
 
       - name: Download MNIST
         uses: pytorch-ignite/download-mnist-github-action@master
@@ -120,7 +112,7 @@ jobs:
     needs: build
     if: always() && needs.build.result == 'failure'
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: JasonEtco/create-an-issue@v2
         name: Create issue if pytorch version tests failed
         with:
diff --git a/.github/workflows/stable-release-anaconda.yml b/.github/workflows/stable-release-anaconda.yml
index a3299acf9ca4..817c7f59cca1 100644
--- a/.github/workflows/stable-release-anaconda.yml
+++ b/.github/workflows/stable-release-anaconda.yml
@@ -8,13 +8,13 @@ jobs:
   conda-build-publish:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Setup Miniconda
         uses: conda-incubator/setup-miniconda@v2
         with:
           miniconda-version: "latest"
-          python-version: 3.8
+          python-version: "3.10"
 
       - name: Install dependencies
         shell: bash -l {0}
diff --git a/.github/workflows/stable-release-pypi.yml b/.github/workflows/stable-release-pypi.yml
index 974d4fac6cc8..6f37eca6bde5 100644
--- a/.github/workflows/stable-release-pypi.yml
+++ b/.github/workflows/stable-release-pypi.yml
@@ -8,13 +8,13 @@ jobs:
   build-publish:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - name: Setup Miniconda
         uses: conda-incubator/setup-miniconda@v2
         with:
           miniconda-version: "latest"
-          python-version: 3.8
+          python-version: "3.10"
 
       - name: Install dependencies
         shell: bash -l {0}
@@ -31,23 +31,13 @@ jobs:
           twine check dist/*
           TWINE_USERNAME="${{ secrets.PYPI_USER }}" TWINE_PASSWORD="${{ secrets.PYPI_TOKEN }}" twine upload --verbose dist/*
 
-  docker-build-publish:
-    name: Trigger Build and Push Docker images to Docker Hub
-    needs: build-publish
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        with:
-          python-version: 3.8
-      - name: Trigger Circle-CI pipeline
-        env:
-          CIRCLE_TOKEN: ${{ secrets.CIRCLE_TOKEN }}
-        run: |
-          pip install requests
-
-          if [ $GITHUB_EVENT_NAME == 'pull_request' ]; then should_publish_docker_images=false; else should_publish_docker_images=true; fi
-          branch=$GITHUB_REF
+  # docker-build-publish:
+  #   name: Trigger Build and Push Docker images to Docker Hub
+  #   needs: build-publish
+  #   runs-on: ubuntu-latest
 
-          python -u .github/workflows/trigger_circle_ci.py $should_publish_docker_images $branch
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: actions/setup-python@v4
+  #       with:
+  #         python-version: "3.10"
diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml
index 221f7f191cf0..6eb9397a772d 100644
--- a/.github/workflows/tpu-tests.yml
+++ b/.github/workflows/tpu-tests.yml
@@ -31,7 +31,7 @@ jobs:
         xla-version: [nightly]
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python 3.8
         uses: actions/setup-python@v4
         with:
diff --git a/.github/workflows/trigger_circle_ci.py b/.github/workflows/trigger_circle_ci.py
deleted file mode 100644
index ff8ce3dddd86..000000000000
--- a/.github/workflows/trigger_circle_ci.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import json
-import os
-import sys
-import time
-
-import requests
-
-
-def assert_result(result, expected_code):
-    if result.status_code != expected_code:
-        raise RuntimeError(f"{result.url}, {result.status_code}: {result.text}")
-
-
-def get_output(result_text, required_keys):
-    output = json.loads(result_text)
-
-    if not all([v in output for v in required_keys]):
-        raise RuntimeError(f"Output does not contain required fields: {required_keys}\n" f"Output is: {output}")
-    return output
-
-
-def trigger_new_pipeline(data, headers):
-    result = requests.post(
-        "https://circleci.com/api/v2/project/gh/pytorch/ignite/pipeline", data=json.dumps(data), headers=headers
-    )
-    assert_result(result, 201)
-    output = get_output(result.text, ["id"])
-    return output["id"]
-
-
-def assert_pipeline_created(pipeline_id, headers):
-    while True:
-        result = requests.get(f"https://circleci.com/api/v2/pipeline/{pipeline_id}", headers=headers)
-        assert_result(result, 200)
-        output = get_output(result.text, ["state", "errors"])
-
-        if output["state"] == "errored":
-            raise RuntimeError(f"Pipeline is errored: {output['errors']}")
-        if output["state"] == "created":
-            break
-        time.sleep(2)
-
-
-def get_workflow_id(pipeline_id, headers):
-
-    while True:
-        result = requests.get(f"https://circleci.com/api/v2/pipeline/{pipeline_id}/workflow", headers=headers)
-        assert_result(result, 200)
-        output = get_output(result.text, ["items"])
-        items = output["items"]
-        if len(items) > 1:
-            raise RuntimeError(f"Incorrect number of workflow ids: {len(items)} != 1\n" f"items: {items}")
-        if len(items) < 1:
-            continue
-        item_0 = items[0]
-        if "id" not in item_0:
-            raise RuntimeError("Workflow info does not contain 'id'\n" f"Info: {item_0}")
-        return item_0["id"]
-
-
-def assert_workflows_successful(pipeline_id, headers):
-
-    workflow_id = get_workflow_id(pipeline_id, headers)
-
-    base_url = "https://app.circleci.com/pipelines/github/pytorch/ignite"
-    url = None
-
-    while True:
-        result = requests.get(f"https://circleci.com/api/v2/workflow/{workflow_id}", headers=headers)
-        assert_result(result, 200)
-        output = get_output(result.text, ["name", "status", "pipeline_number"])
-
-        if url is None:
-            url = f"{base_url}/{output['pipeline_number']}/workflows/{workflow_id}"
-            print(f"Circle CI workflow: {url}")
-
-        if output["status"] in ["error", "failing", "canceled", "not_run", "failed"]:
-            raise RuntimeError(f"Workflow failed: {output['status']}\n" f"See {url}")
-        if output["status"] == "success":
-            print("\nWorkflow successful")
-            break
-        time.sleep(30)
-        print(".", end=" ")
-
-
-if __name__ == "__main__":
-
-    print("Trigger new pipeline on Circle-CI")
-
-    if "CIRCLE_TOKEN" not in os.environ:
-        raise RuntimeError(
-            "Can not find CIRCLE_TOKEN env variable.\nPlease, export CIRCLE_TOKEN=<token> before calling this script."
-            "This token should be a user token and not the project token."
-        )
-        # https://discuss.circleci.com/t/triggering-pipeline-via-v2-api-fails-with-404-project-not-found/39342/2
-
-    argv = sys.argv
-    if len(argv) != 3:
-        raise RuntimeError("Usage: python trigger_circle_ci.py <true or false> <branch-name>")
-
-    should_publish_docker_images = json.loads(argv[1])
-    branch = argv[2]
-
-    print(f"- should_publish_docker_images: {should_publish_docker_images}")
-    print(f"- Branch: {branch}")
-    if branch.startswith("refs/pull") and branch.endswith("/merge"):
-        branch = branch.replace("/merge", "/head")
-        print(f"Replaced /merge -> /head : {branch}")
-
-    headers = {"authorization": "Basic", "content-type": "application/json", "Circle-Token": os.environ["CIRCLE_TOKEN"]}
-
-    data = {
-        "branch": branch,
-        "parameters": {
-            "should_build_docker_images": True,
-            "should_publish_docker_images": should_publish_docker_images,
-        },
-    }
-
-    unique_pipeline_id = trigger_new_pipeline(data, headers)
-    assert_pipeline_created(unique_pipeline_id, headers)
-    assert_workflows_successful(unique_pipeline_id, headers)
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 2c409f7227a4..23ac6b42c9c8 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -55,7 +55,7 @@ jobs:
             skip-distrib-tests: 1
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 06a6073ad3d2..36bed32e9326 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,15 +15,15 @@ repos:
         exclude_types: ["python", "jupyter", "shell", "gitignore"]
 
   - repo: https://github.com/omnilib/ufmt
-    rev: v1.3.1
+    rev: v2.2.0
     hooks:
       - id: ufmt
         additional_dependencies:
-          - black == 21.12b0
-          - usort == 1.0.1
+          - black == 23.9.1
+          - usort == 1.0.7
 
   - repo: https://github.com/pycqa/flake8
-    rev: 6.0.0
+    rev: 6.1.0
     hooks:
       - id: flake8
         args: ["--config", "setup.cfg"]
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6d4bc78a5770..fd41e6abf86d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -57,7 +57,7 @@ into the following categories:
 
 <details>
 
-- Install [miniconda](https://docs.conda.io/projects/continuumio-conda/en/latest/user-guide/install/index.html) for your system.
+- Install [miniconda](https://docs.conda.io/en/latest/miniconda.html) for your system.
 - Create an isolated conda environment for pytorch-ignite:
 
 ```bash
@@ -244,13 +244,12 @@ If you are not familiar with creating a Pull Request, here are some guides:
 **NOTE : When sending a PR, please kindly check if the changes are required to run in the CI.**
 
 For example, typo changes in `CONTRIBUTING.md`, `README.md` are not required to run in the CI.
-So, please add `[skip ci]` in the PR title to save the resources. Ignite has setup 3 CIs.
+So, please add `[skip ci]` in the PR title to save the resources. Ignite has set up several CIs.
 
 - GitHub Actions
-- CircleCI
 - Netlify
 
-CircleCI is disabled on forked PR. So, please add
+So, please add
 
 - `[skip actions]` for the changes which are not required to run on GitHub Actions,
 - `[skip netlify]` for the changes which are not required to run on Netlify PR Preview build, or
diff --git a/README.md b/README.md
index ea791b225e7c..58c95cc8d092 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,11 @@
 
 <!-- [![image](https://travis-ci.com/pytorch/ignite.svg?branch=master)](https://travis-ci.com/pytorch/ignite) -->
 
-| ![image](https://img.shields.io/badge/-Tests:-black?style=flat-square) [![image](https://github.com/pytorch/ignite/workflows/Run%20unit%20tests/badge.svg)](https://github.com/pytorch/ignite/actions) [![image](https://img.shields.io/badge/-GPU%20tests-black?style=flat-square)](https://app.circleci.com/pipelines/github/pytorch/ignite?branch=master)[![image](https://circleci.com/gh/pytorch/ignite.svg?style=svg)](https://app.circleci.com/pipelines/github/pytorch/ignite?branch=master) [![image](https://codecov.io/gh/pytorch/ignite/branch/master/graph/badge.svg)](https://codecov.io/gh/pytorch/ignite) [![image](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/ignite/index.html) |
+| ![image](https://img.shields.io/badge/-Tests:-black?style=flat-square) [![image](https://github.com/pytorch/ignite/actions/workflows/unit-tests.yml/badge.svg?branch=master)](https://github.com/pytorch/ignite/actions/workflows/unit-tests.yml) [![image](https://github.com/pytorch/ignite/actions/workflows/gpu-tests.yml/badge.svg)](https://github.com/pytorch/ignite/actions/workflows/gpu-tests.yml) [![image](https://codecov.io/gh/pytorch/ignite/branch/master/graph/badge.svg)](https://codecov.io/gh/pytorch/ignite) [![image](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/ignite/index.html) |
 |:---
-| ![image](https://img.shields.io/badge/-Stable%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch/ignite/badges/version.svg)](https://anaconda.org/pytorch/ignite) [![image](https://anaconda.org/pytorch/ignite/badges/downloads.svg)](https://anaconda.org/pytorch/ignite) [![image](https://img.shields.io/badge/dynamic/json.svg?label=PyPI&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pypi.org/project/pytorch-ignite/) [![image](https://pepy.tech/badge/pytorch-ignite)](https://pepy.tech/project/pytorch-ignite) |
+| ![image](https://img.shields.io/badge/-Stable%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch/ignite/badges/version.svg)](https://anaconda.org/pytorch/ignite) ・ [![image](https://img.shields.io/badge/dynamic/json.svg?label=PyPI&url=https%3A%2F%2Fpypi.org%2Fpypi%2Fpytorch-ignite%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pypi.org/project/pytorch-ignite/) [![image](https://static.pepy.tech/badge/pytorch-ignite)](https://pepy.tech/project/pytorch-ignite) ・ [![image](https://img.shields.io/badge/docker-hub-blue)](https://hub.docker.com/u/pytorchignite) |
 | ![image](https://img.shields.io/badge/-Nightly%20Releases:-black?style=flat-square) [![image](https://anaconda.org/pytorch-nightly/ignite/badges/version.svg)](https://anaconda.org/pytorch-nightly/ignite) [![image](https://img.shields.io/badge/PyPI-pre%20releases-brightgreen)](https://pypi.org/project/pytorch-ignite/#history)|
-| ![image](https://img.shields.io/badge/-Features:-black?style=flat-square) [![image](https://img.shields.io/badge/docker-hub-blue)](https://hub.docker.com/u/pytorchignite) [![image](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org) [![image](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) |
-| ![image](https://img.shields.io/badge/-Community:-black?style=flat-square) [![Twitter](https://img.shields.io/badge/news-twitter-blue)](https://twitter.com/pytorch_ignite) [![facebook](https://img.shields.io/badge/news-facebook-blue)](https://www.facebook.com/PyTorch-Ignite-Community-105837321694508) [![numfocus](https://img.shields.io/badge/NumFOCUS-affiliated%20project-green)](https://numfocus.org/sponsored-projects/affiliated-projects) [![discord](https://img.shields.io/badge/chat-discord-blue?logo=discord)](https://discord.gg/djZtm3EmKj) |
+| ![image](https://img.shields.io/badge/-Community:-black?style=flat-square) [![Twitter](https://img.shields.io/badge/news-twitter-blue)](https://twitter.com/pytorch_ignite) [![discord](https://img.shields.io/badge/chat-discord-blue?logo=discord)](https://discord.gg/djZtm3EmKj) [![numfocus](https://img.shields.io/badge/NumFOCUS-affiliated%20project-green)](https://numfocus.org/sponsored-projects/affiliated-projects) |
 | ![image](https://img.shields.io/badge/-Supported_PyTorch/Python_versions:-black?style=flat-square) [![link](https://img.shields.io/badge/-check_here-blue)](https://github.com/pytorch/ignite/actions?query=workflow%3A%22PyTorch+version+tests%22)|
 
 </div>
@@ -346,8 +345,8 @@ For more details, see [here](docker).
 
 Few pointers to get you started:
 
-- [Quick Start Guide: Essentials of getting a project up and running](https://pytorch.org/ignite/quickstart.html)
-- [Concepts of the library: Engine, Events & Handlers, State, Metrics](https://pytorch.org/ignite/concepts.html)
+- [Quick Start Guide: Essentials of getting a project up and running](https://pytorch-ignite.ai/tutorials/beginner/01-getting-started/)
+- [Concepts of the library: Engine, Events & Handlers, State, Metrics](https://pytorch-ignite.ai/concepts/)
 - Full-featured template examples (coming soon)
 
 <!-- ############################################################################################################### -->
@@ -398,7 +397,7 @@ Few pointers to get you started:
   torch.cuda.amp vs nvidia/apex](https://github.com/pytorch/ignite/blob/master/examples/notebooks/Cifar100_bench_amp.ipynb)
 - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/ignite/blob/master/examples/notebooks/MNIST_on_TPU.ipynb) [MNIST training on a single
   TPU](https://github.com/pytorch/ignite/blob/master/examples/notebooks/MNIST_on_TPU.ipynb)
-- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1E9zJrptnLJ_PKhmaP5Vhb6DTVRvyrKHx) [CIFAR10 Training on multiple TPUs](https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10)
+- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1E9zJrptnLJ_PKhmaP5Vhb6DTVRvyrKHx) [CIFAR10 Training on multiple TPUs](https://github.com/pytorch/ignite/tree/master/examples/cifar10)
 - [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/ignite/blob/master/examples/notebooks/HandlersTimeProfiler_MNIST.ipynb) [Basic example of handlers
   time profiling on MNIST training example](https://github.com/pytorch/ignite/blob/master/examples/notebooks/HandlersTimeProfiler_MNIST.ipynb)
 
diff --git a/assets/tldr/teaser.ipynb b/assets/tldr/teaser.ipynb
index f96ba8311b41..a4ac1bfed6db 100644
--- a/assets/tldr/teaser.ipynb
+++ b/assets/tldr/teaser.ipynb
@@ -454,7 +454,7 @@
    "source": [
     "### Other links\n",
     "\n",
-    "- Full featured CIFAR10 example: https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10\n"
+    "- Full featured CIFAR10 example: https://github.com/pytorch/ignite/tree/master/examples/cifar10\n"
    ]
   },
   {
diff --git a/assets/tldr/teaser.py b/assets/tldr/teaser.py
index 8d8b59d45ee2..671f22c81af3 100644
--- a/assets/tldr/teaser.py
+++ b/assets/tldr/teaser.py
@@ -220,4 +220,4 @@ def evaluate_model():
 
 
 # Full featured CIFAR10 example:
-# https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10
+# https://github.com/pytorch/ignite/tree/master/examples/cifar10
diff --git a/docker/docker.cfg b/docker/docker.cfg
index a5fcf2f5127a..1c98e0b6aee6 100644
--- a/docker/docker.cfg
+++ b/docker/docker.cfg
@@ -1,4 +1,4 @@
 [DEFAULT]
-build_docker_image_pytorch_version = 1.13.1-cuda11.6-cudnn8
+build_docker_image_pytorch_version = 2.0.0-cuda11.7-cudnn8
 build_docker_image_hvd_version = v0.27.0
 build_docker_image_msdp_version = v0.8.1
diff --git a/docker/test_image.py b/docker/test_image.py
index 2be554079794..88033742f37a 100644
--- a/docker/test_image.py
+++ b/docker/test_image.py
@@ -21,7 +21,6 @@
 
 
 def run_python_cmd(cmd):
-
     try_except_cmd = f"""
 import warnings
 warnings.filterwarnings("ignore")
@@ -65,7 +64,6 @@ def main():
 
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser("Check docker image script")
     parser.add_argument("image", type=str, help="Docker image to check")
     args = parser.parse_args()
diff --git a/docs/Makefile b/docs/Makefile
index eedf03332d70..3d1f9ada6a8b 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -13,7 +13,7 @@ help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
 docset: html
-	doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url http://pytorch.org/ignite/ --force $(BUILDDIR)/html/
+	doc2dash --name $(SPHINXPROJ) --icon $(SOURCEDIR)/_static/img/pytorch-logo-flame.png --enable-js --online-redirect-url https://pytorch.org/ignite/ --force $(BUILDDIR)/html/
 
 	# Manually fix because Zeal doesn't deal well with `icon.png`-only at 2x resolution.
 	cp $(SPHINXPROJ).docset/icon.png $(SPHINXPROJ).docset/icon@2x.png
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 874e5773ba39..9a88587a0ee2 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,4 +1,4 @@
-sphinx==5.0.0
+sphinx<6
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
 sphinxcontrib-katex
 sphinx-copybutton==0.4.0
diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html
index 3378ba793509..750930d09fdc 100644
--- a/docs/source/_templates/layout.html
+++ b/docs/source/_templates/layout.html
@@ -516,7 +516,7 @@ <h2>Resources</h2>
       });
     })
   </script>
-  <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/@docsearch/js@3.3.0/dist/umd/index.min.js"></script>
+  <script src="https://cdn.jsdelivr.net/npm/@docsearch/js@3"></script>
   <script type="text/javascript">
   let VERSION
   if ('{{ version }}'.startsWith('v')) {
@@ -526,12 +526,12 @@ <h2>Resources</h2>
   }
   docsearch({
     container: '#docsearch',
-    appId: 'BH4D9OD16A',
-    apiKey: '19a7a7a75d87608d6f42c722ed1e293f',
-    indexName: 'ignite',
+    appId: '7EWYE1JCT3',
+    apiKey: '841e93e60c16975ba1bd8c7c716eed82',
+    indexName: 'pytorch-ignite',
     placeholder: 'Search PyTorch-Ignite docs',
     searchParameters: {
-      facetFilters: [[`version:${VERSION}`, 'tags:ignite-web']],
+      facetFilters: [`version:${VERSION}`, 'tags:API-reference'],
     }
   });
   </script>
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 74ae5fba71e8..2256d425becf 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -144,7 +144,7 @@
         # 'https://fonts.googleapis.com/css?family=Lato',
         # '_static/css/pytorch_theme.css'
         "_static/css/ignite_theme.css",
-        "https://cdn.jsdelivr.net/npm/@docsearch/css@3.3.0/dist/style.min.css",
+        "https://cdn.jsdelivr.net/npm/@docsearch/css@3",
     ],
 }
 
@@ -346,6 +346,12 @@ def run(self):
     ("py:class", "torch.utils.data.dataloader.DataLoader"),
 ]
 
+linkcheck_ignore = [
+    "https://github.com/fossasia/visdom#visdom-arguments-python-only",
+    "https://github.com/pytorch/ignite/tree/master/examples/cifar10#check-resume-training",
+    "https://github.com/pytorch/ignite/tree/master/examples/mnist#training-save--resume",
+]
+
 
 def setup(app):
     app.add_directive("autosummary", AutolistAutosummary, override=True)
diff --git a/docs/source/contrib/handlers.rst b/docs/source/contrib/handlers.rst
index 275b5f271093..1635b7d5bb99 100644
--- a/docs/source/contrib/handlers.rst
+++ b/docs/source/contrib/handlers.rst
@@ -50,15 +50,15 @@ Loggers
 
     Below are a comprehensive list of examples of various loggers.
 
-    * See `tensorboardX mnist example <https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_tensorboard_logger.py>`_
+    * See `tensorboardX mnist example <https://github.com/pytorch/ignite/blob/master/examples/mnist/mnist_with_tensorboard_logger.py>`_
       and `CycleGAN and EfficientNet notebooks <https://github.com/pytorch/ignite/tree/master/examples/notebooks>`_ for detailed usage.
 
-    * See `visdom mnist example <https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_visdom_logger.py>`_ for detailed usage.
+    * See `visdom mnist example <https://github.com/pytorch/ignite/blob/master/examples/mnist/mnist_with_visdom_logger.py>`_ for detailed usage.
 
-    * See `neptune mnist example <https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_neptune_logger.py>`_ for detailed usage.
+    * See `neptune mnist example <https://github.com/pytorch/ignite/blob/master/examples/mnist/mnist_with_neptune_logger.py>`_ for detailed usage.
 
-    * See `tqdm mnist example <https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_tqdm_logger.py>`_ for detailed usage.
+    * See `tqdm mnist example <https://github.com/pytorch/ignite/blob/master/examples/mnist/mnist_with_tqdm_logger.py>`_ for detailed usage.
 
-    * See `wandb mnist example <https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_wandb_logger.py>`_ for detailed usage.
+    * See `wandb mnist example <https://github.com/pytorch/ignite/blob/master/examples/mnist/mnist_with_wandb_logger.py>`_ for detailed usage.
 
-    * See `clearml mnist example <https://github.com/pytorch/ignite/blob/master/examples/contrib/mnist/mnist_with_clearml_logger.py>`_ for detailed usage.
+    * See `clearml mnist example <https://github.com/pytorch/ignite/blob/master/examples/mnist/mnist_with_clearml_logger.py>`_ for detailed usage.
diff --git a/docs/source/contrib/metrics.rst b/docs/source/contrib/metrics.rst
index a4264131b6bd..eccaf9e78084 100644
--- a/docs/source/contrib/metrics.rst
+++ b/docs/source/contrib/metrics.rst
@@ -29,7 +29,7 @@ Module :mod:`ignite.contrib.metrics.regression` provides implementations of
 metrics useful for regression tasks. Definitions of metrics are based on `Botchkarev 2018`_, page 30 "Appendix 2. Metrics mathematical definitions".
 
 .. _`Botchkarev 2018`:
-        https://arxiv.org/ftp/arxiv/papers/1809/1809.03006.pdf
+        https://arxiv.org/abs/1809.03006
 
 Complete list of metrics:
 
diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst
index a72a33caa373..43089924318c 100644
--- a/docs/source/distributed.rst
+++ b/docs/source/distributed.rst
@@ -56,7 +56,7 @@ in the code. For more details, please, see :class:`~ignite.distributed.launcher.
 :meth:`~ignite.distributed.auto.auto_dataloader`.
 
 Complete example of CIFAR10 training can be found
-`here <https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10>`_.
+`here <https://github.com/pytorch/ignite/tree/master/examples/cifar10>`_.
 
 
 .. _torch.distributed.launch: https://pytorch.org/docs/stable/distributed.html#launch-utility
diff --git a/docs/source/engine.rst b/docs/source/engine.rst
index 48fa9cc576c4..6b865d1849c4 100644
--- a/docs/source/engine.rst
+++ b/docs/source/engine.rst
@@ -69,7 +69,7 @@ Resuming the training
 It is possible to resume the training from a checkpoint and approximately reproduce original run's behaviour.
 Using Ignite, this can be easily done using :class:`~ignite.handlers.checkpoint.Checkpoint` handler. Engine provides two methods
 to serialize and deserialize its internal state :meth:`~ignite.engine.engine.Engine.state_dict` and
-:meth:`~ignite.engine.engine.Engine.load_state_dict`. In addition to serializing model, optimizer, lr scheduler etc user can
+:meth:`~ignite.engine.engine.Engine.load_state_dict`. In addition to serializing model, optimizer, lr scheduler, metrics, etc., the user can
 store the trainer and then resume the training. For example:
 
 .. code-block:: python
@@ -82,8 +82,9 @@ store the trainer and then resume the training. For example:
     optimizer = ...
     lr_scheduler = ...
     data_loader = ...
+    metric = ...
 
-    to_save = {'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler}
+    to_save = {'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler, 'metric': metric}
     handler = Checkpoint(to_save, DiskSaver('/tmp/training', create_dir=True))
     trainer.add_event_handler(Events.EPOCH_COMPLETED, handler)
     trainer.run(data_loader, max_epochs=100)
@@ -104,8 +105,9 @@ We can then restore the training from the last checkpoint.
     optimizer = ...
     lr_scheduler = ...
     data_loader = ...
+    metric = ...
 
-    to_load = {'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler}
+    to_load = {'trainer': trainer, 'model': model, 'optimizer': optimizer, 'lr_scheduler': lr_scheduler, 'metric': metric}
     checkpoint = torch.load(checkpoint_file)
     Checkpoint.load_objects(to_load=to_load, checkpoint=checkpoint)
 
@@ -117,8 +119,8 @@ from iteration.
 
 Complete examples that resumes the training from a checkpoint can be found here:
 
-- `save/resume MNIST <https://github.com/pytorch/ignite/tree/master/examples/mnist#user-content-training-save--resume>`_
-- `save/resume Distributed CIFAR10 <https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10#user-content-check-resume-training>`_
+- `save/resume MNIST <https://github.com/pytorch/ignite/tree/master/examples/mnist#training-save--resume>`_
+- `save/resume Distributed CIFAR10 <https://github.com/pytorch/ignite/tree/master/examples/cifar10#check-resume-training>`_
 
 Deterministic training
 ----------------------
@@ -213,8 +215,8 @@ We can see that the data samples are exactly the same between original and resum
 Complete examples that simulates a crash on a defined iteration and resumes the training from a checkpoint can be found
 here:
 
-- `save/resume MNIST <https://github.com/pytorch/ignite/tree/master/examples/mnist#user-content-training-save--resume>`_
-- `save/resume Distributed CIFAR10 <https://github.com/pytorch/ignite/tree/master/examples/contrib/cifar10#user-content-check-resume-training>`_
+- `save/resume MNIST <https://github.com/pytorch/ignite/tree/master/examples/mnist#training-save--resume>`_
+- `save/resume Distributed CIFAR10 <https://github.com/pytorch/ignite/tree/master/examples/cifar10#check-resume-training>`_
 
 
 .. Note ::
diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst
index afc477f457e1..bd5038f08140 100644
--- a/docs/source/metrics.rst
+++ b/docs/source/metrics.rst
@@ -290,7 +290,10 @@ Complete list of usages
 
     - :class:`~ignite.metrics.metric.MetricUsage`
     - :class:`~ignite.metrics.metric.EpochWise`
+    - :class:`~ignite.metrics.metric.RunningEpochWise`
     - :class:`~ignite.metrics.metric.BatchWise`
+    - :class:`~ignite.metrics.metric.RunningBatchWise`
+    - :class:`~ignite.metrics.metric.SingleEpochRunningBatchWise`
     - :class:`~ignite.metrics.metric.BatchFiltered`
 
 Metrics and distributed computations
@@ -359,10 +362,22 @@ EpochWise
 ~~~~~~~~~
 .. autoclass:: ignite.metrics.metric.EpochWise
 
+RunningEpochWise
+~~~~~~~~~~~~~~~~
+.. autoclass:: ignite.metrics.metric.RunningEpochWise
+
 BatchWise
 ~~~~~~~~~
 .. autoclass:: ignite.metrics.metric.BatchWise
 
+RunningBatchWise
+~~~~~~~~~~~~~~~~
+.. autoclass:: ignite.metrics.metric.RunningBatchWise
+
+SingleEpochRunningBatchWise
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: ignite.metrics.metric.SingleEpochRunningBatchWise
+
 BatchFiltered
 ~~~~~~~~~~~~~
 .. autoclass:: ignite.metrics.metric.BatchFiltered
diff --git a/examples/contrib/cifar10/.gitignore b/examples/cifar10/.gitignore
similarity index 100%
rename from examples/contrib/cifar10/.gitignore
rename to examples/cifar10/.gitignore
diff --git a/examples/contrib/cifar10/README.md b/examples/cifar10/README.md
similarity index 100%
rename from examples/contrib/cifar10/README.md
rename to examples/cifar10/README.md
diff --git a/examples/contrib/cifar10/main.py b/examples/cifar10/main.py
similarity index 95%
rename from examples/contrib/cifar10/main.py
rename to examples/cifar10/main.py
index e62d1b41d018..5696f37aa7cf 100644
--- a/examples/contrib/cifar10/main.py
+++ b/examples/cifar10/main.py
@@ -20,7 +20,6 @@
 
 
 def training(local_rank, config):
-
     rank = idist.get_rank()
     manual_seed(config["seed"] + rank)
     device = idist.device()
@@ -47,11 +46,7 @@ def training(local_rank, config):
             config["cuda device name"] = torch.cuda.get_device_name(local_rank)
 
         if config["with_clearml"]:
-            try:
-                from clearml import Task
-            except ImportError:
-                # Backwards-compatibility for legacy Trains SDK
-                from trains import Task
+            from clearml import Task
 
             task = Task.init("CIFAR10-Training", task_name=output_path.stem)
             task.connect_configuration(config)
@@ -205,22 +200,13 @@ def run(
         raise RuntimeError("The value of with_amp should be False if backend is xla")
 
     with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
-
         parallel.run(training, config)
 
 
 def get_dataflow(config):
     # - Get train/test datasets
-    if idist.get_local_rank() > 0:
-        # Ensure that only local rank 0 download the dataset
-        # Thus each node will download a copy of the dataset
-        idist.barrier()
-
-    train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"])
-
-    if idist.get_local_rank() == 0:
-        # Ensure that only local rank 0 download the dataset
-        idist.barrier()
+    with idist.one_rank_first(local=True):
+        train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"])
 
     # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
     train_loader = idist.auto_dataloader(
@@ -291,7 +277,6 @@ def log_basic_info(logger, config):
 
 
 def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, config, logger):
-
     device = idist.device()
 
     # Setup Ignite trainer:
@@ -307,7 +292,6 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     scaler = GradScaler(enabled=with_amp)
 
     def train_step(engine, batch):
-
         x, y = batch[0], batch[1]
 
         if x.device != device:
diff --git a/examples/contrib/cifar10/requirements.txt b/examples/cifar10/requirements.txt
similarity index 100%
rename from examples/contrib/cifar10/requirements.txt
rename to examples/cifar10/requirements.txt
diff --git a/examples/contrib/cifar10/utils.py b/examples/cifar10/utils.py
similarity index 100%
rename from examples/contrib/cifar10/utils.py
rename to examples/cifar10/utils.py
diff --git a/examples/contrib/cifar100_amp_benchmark/benchmark_fp32.py b/examples/cifar100_amp_benchmark/benchmark_fp32.py
similarity index 100%
rename from examples/contrib/cifar100_amp_benchmark/benchmark_fp32.py
rename to examples/cifar100_amp_benchmark/benchmark_fp32.py
diff --git a/examples/contrib/cifar100_amp_benchmark/benchmark_nvidia_apex.py b/examples/cifar100_amp_benchmark/benchmark_nvidia_apex.py
similarity index 100%
rename from examples/contrib/cifar100_amp_benchmark/benchmark_nvidia_apex.py
rename to examples/cifar100_amp_benchmark/benchmark_nvidia_apex.py
diff --git a/examples/contrib/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py b/examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py
similarity index 100%
rename from examples/contrib/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py
rename to examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py
diff --git a/examples/contrib/cifar100_amp_benchmark/utils.py b/examples/cifar100_amp_benchmark/utils.py
similarity index 100%
rename from examples/contrib/cifar100_amp_benchmark/utils.py
rename to examples/cifar100_amp_benchmark/utils.py
diff --git a/examples/contrib/cifar10_qat/.gitignore b/examples/cifar10_qat/.gitignore
similarity index 100%
rename from examples/contrib/cifar10_qat/.gitignore
rename to examples/cifar10_qat/.gitignore
diff --git a/examples/contrib/cifar10_qat/README.md b/examples/cifar10_qat/README.md
similarity index 100%
rename from examples/contrib/cifar10_qat/README.md
rename to examples/cifar10_qat/README.md
diff --git a/examples/contrib/cifar10_qat/main.py b/examples/cifar10_qat/main.py
similarity index 95%
rename from examples/contrib/cifar10_qat/main.py
rename to examples/cifar10_qat/main.py
index 98b0bb10850a..f70567969525 100644
--- a/examples/contrib/cifar10_qat/main.py
+++ b/examples/cifar10_qat/main.py
@@ -19,7 +19,6 @@
 
 
 def training(local_rank, config):
-
     rank = idist.get_rank()
     manual_seed(config["seed"] + rank)
     device = idist.device()
@@ -43,11 +42,7 @@ def training(local_rank, config):
             config["cuda device name"] = torch.cuda.get_device_name(local_rank)
 
         if config["with_clearml"]:
-            try:
-                from clearml import Task
-            except ImportError:
-                # Backwards-compatibility for legacy Trains SDK
-                from trains import Task
+            from clearml import Task
 
             task = Task.init("CIFAR10-Training", task_name=output_path.stem)
             task.connect_configuration(config)
@@ -189,22 +184,13 @@ def run(
     spawn_kwargs["nproc_per_node"] = nproc_per_node
 
     with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
-
         parallel.run(training, config)
 
 
 def get_dataflow(config):
     # - Get train/test datasets
-    if idist.get_local_rank() > 0:
-        # Ensure that only local rank 0 download the dataset
-        # Thus each node will download a copy of the dataset
-        idist.barrier()
-
-    train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"])
-
-    if idist.get_local_rank() == 0:
-        # Ensure that only local rank 0 download the dataset
-        idist.barrier()
+    with idist.one_rank_first(local=True):
+        train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"])
 
     # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
     train_loader = idist.auto_dataloader(
@@ -275,7 +261,6 @@ def log_basic_info(logger, config):
 
 
 def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, config, logger):
-
     device = idist.device()
 
     # Setup Ignite trainer:
@@ -291,7 +276,6 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     scaler = GradScaler(enabled=with_amp)
 
     def train_step(engine, batch):
-
         x, y = batch[0], batch[1]
 
         if x.device != device:
diff --git a/examples/contrib/cifar10_qat/pact.py b/examples/cifar10_qat/pact.py
similarity index 100%
rename from examples/contrib/cifar10_qat/pact.py
rename to examples/cifar10_qat/pact.py
diff --git a/examples/contrib/cifar10_qat/utils.py b/examples/cifar10_qat/utils.py
similarity index 100%
rename from examples/contrib/cifar10_qat/utils.py
rename to examples/cifar10_qat/utils.py
diff --git a/examples/contrib/mnist/README.md b/examples/contrib/mnist/README.md
deleted file mode 100644
index 5d5955d27a57..000000000000
--- a/examples/contrib/mnist/README.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# Basic MNIST Example with Ignite and `ignite.contrib` module
-
-ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/mnist)
-
-Basic neural network training with Ignite and various built-in loggers from `ignite.contrib`:
-
-- TQDM progress bar
-- Tensorboard
-- Visdom
-
-### Usage:
-
-#### Requirements:
-
-- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision`
-- [tqdm](https://github.com/tqdm/tqdm/): `pip install tqdm`
-
-#### Logging with TQDM progress bar
-
-Run the example:
-
-```
-python mnist_with_tqdm_logger.py
-```
-
-### Logging with TensorboardX or `torch.utils.tensorboard`
-
-Example with training and validation monitoring using Tensorboard.
-
-#### Requirements:
-
-- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision`
-- Optionally [TensorboardX](https://github.com/lanpa/tensorboard-pytorch): `pip install tensorboardX`
-- Tensorboard: `pip install tensorboard`
-
-Optionally, user can install `pynvml` package on Python 3 and log GPU information: used memory, utilization.
-
-#### Usage:
-
-Run the example:
-
-```bash
-python mnist_with_tensorboard_logger.py --log_dir=/tmp/tensorboard_logs
-```
-
-Start tensorboard:
-
-```bash
-tensorboard --logdir=/tmp/tensorboard_logs/
-```
-
-### Logging with Visdom
-
-Example with training and validation monitoring using Visdom
-
-#### Requirements:
-
-- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision`
-- [Visdom](https://github.com/facebookresearch/visdom): `pip install visdom`
-
-#### Usage:
-
-Run the example:
-
-```bash
-python mnist_with_visdom_logger.py
-```
diff --git a/examples/fast_neural_style/neural_style.py b/examples/fast_neural_style/neural_style.py
index 5aa072171e95..9af5f1888ef5 100644
--- a/examples/fast_neural_style/neural_style.py
+++ b/examples/fast_neural_style/neural_style.py
@@ -78,7 +78,6 @@ def train(args):
     running_avgs = OrderedDict()
 
     def step(engine, batch):
-
         x, _ = batch
         x = x.to(device)
 
diff --git a/examples/fast_neural_style/utils.py b/examples/fast_neural_style/utils.py
index e992fcc4fade..b3671d55f6c7 100644
--- a/examples/fast_neural_style/utils.py
+++ b/examples/fast_neural_style/utils.py
@@ -4,9 +4,9 @@
 def load_image(filename, size=None, scale=None):
     img = Image.open(filename)
     if size is not None:
-        img = img.resize((size, size), Image.ANTIALIAS)
+        img = img.resize((size, size), Image.LANCZOS)
     elif scale is not None:
-        img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), Image.ANTIALIAS)
+        img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), Image.LANCZOS)
     return img
 
 
diff --git a/examples/fast_neural_style/vgg.py b/examples/fast_neural_style/vgg.py
index 2e9ad1f2d135..05950246f0ff 100644
--- a/examples/fast_neural_style/vgg.py
+++ b/examples/fast_neural_style/vgg.py
@@ -2,12 +2,13 @@
 
 import torch
 from torchvision import models
+from torchvision.models.vgg import VGG16_Weights
 
 
 class Vgg16(torch.nn.Module):
     def __init__(self, requires_grad=False):
         super(Vgg16, self).__init__()
-        vgg_pretrained_features = models.vgg16(pretrained=True).features
+        vgg_pretrained_features = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1).features
         self.slice1 = torch.nn.Sequential()
         self.slice2 = torch.nn.Sequential()
         self.slice3 = torch.nn.Sequential()
diff --git a/examples/gan/dcgan.py b/examples/gan/dcgan.py
index cbf804c7c97c..ce9d3d325160 100644
--- a/examples/gan/dcgan.py
+++ b/examples/gan/dcgan.py
@@ -207,7 +207,6 @@ def main(
     alpha,
     output_dir,
 ):
-
     # seed
     check_manual_seed(seed)
 
@@ -243,7 +242,6 @@ def get_noise():
 
     # The main function, processing a batch of examples
     def step(engine, batch):
-
         # unpack the batch. It comes from a dataset, so we have <images, labels> pairs. Discard labels.
         real, _ = batch
         real = real.to(device)
diff --git a/examples/mnist/README.md b/examples/mnist/README.md
index cac50fc14195..3523cd37615a 100644
--- a/examples/mnist/README.md
+++ b/examples/mnist/README.md
@@ -2,7 +2,7 @@
 
 ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/mnist)
 
-#### Requirements:
+#### Minimal requirements:
 
 - [torchvision](https://github.com/pytorch/vision/): `pip install torchvision`
 - [tqdm](https://github.com/tqdm/tqdm/): `pip install tqdm`
@@ -11,22 +11,25 @@ ported from [pytorch-examples](https://github.com/pytorch/examples/tree/master/m
 
 Run the example:
 
-```
+```bash
 python mnist.py
 ```
 
+Same example with logging using the TQDM progress bar:
+
+
+```bash
+python mnist_with_tqdm_logger.py
+```
+
 ### Logging with Tensorboard
 
-MNIST example with training and validation monitoring using Tensorboard. Notice
-that if PyTorch version is less than 1.2, the module TensorboardX is required.
+MNIST example with training and validation monitoring using Tensorboard
 
-#### Requirements:
+#### Additional requirements:
 
-- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision`
-- [TensorboardX](https://github.com/lanpa/tensorboard-pytorch) (if and only if `PyTorch <= 1.2`): `pip install tensorboardX`
 - Tensorboard: `pip install tensorboard`
 
-#### Usage:
 
 Run the example:
 
@@ -44,9 +47,8 @@ tensorboard --logdir=/tmp/tensorboard_logs/
 
 MNIST example with training and validation monitoring using Visdom
 
-#### Requirements:
+#### Additional requirements:
 
-- [torchvision](https://github.com/pytorch/vision/): `pip install torchvision`
 - [Visdom](https://github.com/facebookresearch/visdom): `pip install visdom`
 
 #### Usage:
@@ -63,6 +65,18 @@ Run the example:
 python mnist_with_visdom.py
 ```
 
+### Logging with ClearML
+
+#### Additional requirements:
+
+- [ClearML python client](https://clear.ml/docs/latest/docs/): `pip install clearml`
+
+#### Usage:
+
+```bash
+python mnist_with_clearml_logger.py
+```
+
 ### Training save & resume
 
 Example shows how to save a checkpoint of the trainer, model, optimizer, lr scheduler.
diff --git a/examples/mnist/mnist_save_resume_engine.py b/examples/mnist/mnist_save_resume_engine.py
index 2fd61cb0c113..4bdacec24f72 100644
--- a/examples/mnist/mnist_save_resume_engine.py
+++ b/examples/mnist/mnist_save_resume_engine.py
@@ -13,7 +13,7 @@
 
 from ignite.engine import create_supervised_evaluator, create_supervised_trainer, Events
 from ignite.handlers import Checkpoint, DiskSaver
-from ignite.metrics import Accuracy, Loss
+from ignite.metrics import Accuracy, Loss, RunningAverage
 from ignite.utils import manual_seed
 
 try:
@@ -162,10 +162,11 @@ def run(
     if deterministic:
         tqdm.write("Setup deterministic trainer")
     trainer = create_supervised_trainer(model, optimizer, criterion, device=device, deterministic=deterministic)
+    running_loss = RunningAverage(output_transform=lambda x: x)
+    running_loss.attach(trainer, "rloss")
 
-    evaluator = create_supervised_evaluator(
-        model, metrics={"accuracy": Accuracy(), "nll": Loss(criterion)}, device=device
-    )
+    metrics = {"accuracy": Accuracy(), "nll": Loss(criterion)}
+    evaluator = create_supervised_evaluator(model, metrics, device)
 
     # Apply learning rate scheduling
     @trainer.on(Events.EPOCH_COMPLETED)
@@ -177,9 +178,10 @@ def lr_step(engine):
     @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
     def log_training_loss(engine):
         lr = optimizer.param_groups[0]["lr"]
-        pbar.desc = f"Epoch {engine.state.epoch} - loss: {engine.state.output:.4f} - lr: {lr:.4f}"
+        rloss = engine.state.metrics["rloss"]
+        pbar.desc = f"Epoch {engine.state.epoch} - loss: {rloss:.4f} - lr: {lr:.4f}"
         pbar.update(log_interval)
-        writer.add_scalar("training/loss", engine.state.output, engine.state.iteration)
+        writer.add_scalar("training/running_loss", rloss, engine.state.iteration)
         writer.add_scalar("lr", lr, engine.state.iteration)
 
     if crash_iteration > 0:
@@ -222,7 +224,14 @@ def log_validation_results(engine):
         writer.add_scalar("valdation/avg_accuracy", avg_accuracy, engine.state.epoch)
 
     # Setup object to checkpoint
-    objects_to_checkpoint = {"trainer": trainer, "model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler}
+    objects_to_checkpoint = {
+        "trainer": trainer,
+        "model": model,
+        "optimizer": optimizer,
+        "lr_scheduler": lr_scheduler,
+        "train_running_loss": running_loss,
+        "metrics": metrics,
+    }
     training_checkpoint = Checkpoint(
         to_save=objects_to_checkpoint,
         save_handler=DiskSaver(log_dir, require_empty=False),
diff --git a/examples/contrib/mnist/mnist_with_clearml_logger.py b/examples/mnist/mnist_with_clearml_logger.py
similarity index 100%
rename from examples/contrib/mnist/mnist_with_clearml_logger.py
rename to examples/mnist/mnist_with_clearml_logger.py
diff --git a/examples/contrib/mnist/mnist_with_neptune_logger.py b/examples/mnist/mnist_with_neptune_logger.py
similarity index 100%
rename from examples/contrib/mnist/mnist_with_neptune_logger.py
rename to examples/mnist/mnist_with_neptune_logger.py
diff --git a/examples/contrib/mnist/mnist_with_tensorboard_logger.py b/examples/mnist/mnist_with_tensorboard_logger.py
similarity index 100%
rename from examples/contrib/mnist/mnist_with_tensorboard_logger.py
rename to examples/mnist/mnist_with_tensorboard_logger.py
diff --git a/examples/contrib/mnist/mnist_with_tqdm_logger.py b/examples/mnist/mnist_with_tqdm_logger.py
similarity index 100%
rename from examples/contrib/mnist/mnist_with_tqdm_logger.py
rename to examples/mnist/mnist_with_tqdm_logger.py
diff --git a/examples/contrib/mnist/mnist_with_visdom_logger.py b/examples/mnist/mnist_with_visdom_logger.py
similarity index 100%
rename from examples/contrib/mnist/mnist_with_visdom_logger.py
rename to examples/mnist/mnist_with_visdom_logger.py
diff --git a/examples/contrib/mnist/mnist_with_wandb_logger.py b/examples/mnist/mnist_with_wandb_logger.py
similarity index 100%
rename from examples/contrib/mnist/mnist_with_wandb_logger.py
rename to examples/mnist/mnist_with_wandb_logger.py
diff --git a/examples/notebooks/Cifar100_bench_amp.ipynb b/examples/notebooks/Cifar100_bench_amp.ipynb
index d7c713b49896..dc9cfc750d93 100644
--- a/examples/notebooks/Cifar100_bench_amp.ipynb
+++ b/examples/notebooks/Cifar100_bench_amp.ipynb
@@ -1,29 +1,4 @@
 {
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
-  },
-  "colab": {
-   "name": "Cifar100_bench_amp.ipynb",
-   "provenance": []
-  }
- },
  "cells": [
   {
    "cell_type": "markdown",
@@ -63,14 +38,14 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "SkRXPuNRfDHX"
    },
+   "outputs": [],
    "source": [
     "!pip install pytorch-ignite pynvml fire"
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "markdown",
@@ -83,32 +58,32 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "fGtxgbj8fDHb"
    },
+   "outputs": [],
    "source": [
     "# Install Apex:\n",
     "# If torch cuda version and nvcc version match:\n",
     "!pip install --upgrade --no-cache-dir --global-option=\"--cpp_ext\" --global-option=\"--cuda_ext\" git+https://github.com/NVIDIA/apex/\n",
     "# if above command is failing, please install apex without c++/cuda extensions:\n",
     "# !pip install --upgrade --no-cache-dir git+https://github.com/NVIDIA/apex/"
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "QnihHXQpfDHb"
    },
+   "outputs": [],
    "source": [
     "import torch\n",
     "import torchvision\n",
     "import ignite\n",
     "torch.__version__, torchvision.__version__, ignite.__version__"
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "markdown",
@@ -121,16 +96,16 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "6xqqj0q1fDHh"
    },
+   "outputs": [],
    "source": [
     "!git clone https://github.com/pytorch/ignite.git /tmp/ignite\n",
-    "scriptspath=\"/tmp/ignite/examples/contrib/cifar100_amp_benchmark/\"\n",
+    "scriptspath=\"/tmp/ignite/examples/cifar100_amp_benchmark/\"\n",
     "setup=f\"cd {scriptspath} && export PYTHONPATH=$PWD:$PYTHONPATH\""
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "markdown",
@@ -143,15 +118,15 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "ulufk4tsfDHj"
    },
+   "outputs": [],
    "source": [
     "from torchvision.datasets.cifar import CIFAR100\n",
     "CIFAR100(root=\"/tmp/cifar100/\", train=True, download=True)"
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "markdown",
@@ -164,14 +139,14 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "mHwsVTB6fDHq"
    },
+   "outputs": [],
    "source": [
     "!{setup} && python benchmark_fp32.py /tmp/cifar100/ --batch_size=256 --max_epochs=20"
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "markdown",
@@ -184,14 +159,14 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "xkuW1EY-fDHs"
    },
+   "outputs": [],
    "source": [
     "!{setup} && python benchmark_torch_cuda_amp.py /tmp/cifar100/ --batch_size=256 --max_epochs=20"
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "markdown",
@@ -209,25 +184,50 @@
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "A6Pe4cW6fDHu"
    },
+   "outputs": [],
    "source": [
     "!{setup} && python benchmark_nvidia_apex.py /tmp/cifar100/ --batch_size=256 --max_epochs=20 --opt=\"O1\""
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "metadata": {
     "id": "1aqdlPSgfDHu"
    },
+   "outputs": [],
    "source": [
     "!{setup} && python benchmark_nvidia_apex.py /tmp/cifar100/ --batch_size=256 --max_epochs=20 --opt=\"O2\""
-   ],
-   "execution_count": null,
-   "outputs": []
+   ]
   }
- ]
+ ],
+ "metadata": {
+  "colab": {
+   "name": "Cifar100_bench_amp.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
 }
diff --git a/examples/references/classification/imagenet/dataflow.py b/examples/references/classification/imagenet/dataflow.py
index 4d422d9e26a0..e497be3bcceb 100644
--- a/examples/references/classification/imagenet/dataflow.py
+++ b/examples/references/classification/imagenet/dataflow.py
@@ -19,7 +19,6 @@ def opencv_loader(path):
 
 
 def get_dataloader(dataset, sampler=None, shuffle=False, limit_num_samples=None, **kwargs):
-
     if limit_num_samples is not None:
         g = torch.Generator().manual_seed(limit_num_samples)
         indices = torch.randperm(len(dataset), generator=g)[:limit_num_samples]
@@ -38,7 +37,6 @@ def get_train_val_loaders(
     limit_train_num_samples: Optional[int] = None,
     limit_val_num_samples: Optional[int] = None,
 ) -> Tuple[DataLoader, DataLoader, DataLoader]:
-
     train_ds = ImageFolder(
         Path(root_path) / "train",
         transform=lambda sample: train_transforms(image=sample)["image"],
diff --git a/examples/references/classification/imagenet/main.py b/examples/references/classification/imagenet/main.py
index 8a001260d972..85c20c08a62b 100644
--- a/examples/references/classification/imagenet/main.py
+++ b/examples/references/classification/imagenet/main.py
@@ -24,7 +24,6 @@
 
 
 def training(local_rank, config, logger, with_clearml):
-
     rank = idist.get_rank()
     manual_seed(config.seed + local_rank)
 
@@ -305,7 +304,6 @@ def run_training(config_filepath, backend="nccl", with_clearml=True):
     assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found"
 
     with idist.Parallel(backend=backend) as parallel:
-
         logger = setup_logger(name="ImageNet Training", distributed_rank=idist.get_rank())
 
         config = ConfigObject(config_filepath)
@@ -327,7 +325,6 @@ def run_training(config_filepath, backend="nccl", with_clearml=True):
 
 
 def get_model_weights(config, logger, with_clearml):
-
     path = ""
     if with_clearml:
         from clearml import Model
@@ -352,7 +349,6 @@ def get_model_weights(config, logger, with_clearml):
 
 
 def evaluation(local_rank, config, logger, with_clearml):
-
     rank = idist.get_rank()
     device = idist.device()
     manual_seed(config.seed + local_rank)
@@ -428,5 +424,4 @@ def run_evaluation(config_filepath, backend="nccl", with_clearml=True):
 
 
 if __name__ == "__main__":
-
     fire.Fire({"training": run_training, "eval": run_evaluation})
diff --git a/examples/references/classification/imagenet/utils.py b/examples/references/classification/imagenet/utils.py
index 799a6069afd4..45be888d0de5 100644
--- a/examples/references/classification/imagenet/utils.py
+++ b/examples/references/classification/imagenet/utils.py
@@ -6,7 +6,6 @@
 
 
 def initialize(config):
-
     device = idist.device()
 
     model = config.model.to(device)
diff --git a/examples/references/segmentation/pascal_voc2012/dataflow.py b/examples/references/segmentation/pascal_voc2012/dataflow.py
index b3b462f7c64a..befa25b1e4f2 100644
--- a/examples/references/segmentation/pascal_voc2012/dataflow.py
+++ b/examples/references/segmentation/pascal_voc2012/dataflow.py
@@ -27,7 +27,6 @@ def __getitem__(self, index):
 
 
 class VOCSegmentationOpencv(VOCSegmentation):
-
     target_names = [
         "background",
         "aeroplane",
@@ -114,7 +113,6 @@ def get_train_noval_sbdataset(root_path, return_meta=False):
 
 
 def get_dataloader(dataset, sampler=None, shuffle=False, limit_num_samples=None, **kwargs):
-
     if limit_num_samples is not None:
         g = torch.Generator().manual_seed(limit_num_samples)
         indices = torch.randperm(len(dataset), generator=g)[:limit_num_samples]
@@ -135,7 +133,6 @@ def get_train_val_loaders(
     limit_train_num_samples=None,
     limit_val_num_samples=None,
 ):
-
     train_ds = get_train_dataset(root_path)
     val_ds = get_val_dataset(root_path)
 
diff --git a/examples/references/segmentation/pascal_voc2012/main.py b/examples/references/segmentation/pascal_voc2012/main.py
index 257b14dea031..20afebbb7d36 100644
--- a/examples/references/segmentation/pascal_voc2012/main.py
+++ b/examples/references/segmentation/pascal_voc2012/main.py
@@ -49,7 +49,6 @@ def download_datasets(output_path):
 
 
 def training(local_rank, config, logger, with_clearml):
-
     rank = idist.get_rank()
     manual_seed(config.seed + local_rank)
 
@@ -342,7 +341,6 @@ def run_training(config_filepath, backend="nccl", with_clearml=True):
     assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found"
 
     with idist.Parallel(backend=backend) as parallel:
-
         logger = setup_logger(name="Pascal-VOC12 Training", distributed_rank=idist.get_rank())
 
         config = ConfigObject(config_filepath)
@@ -364,7 +362,6 @@ def run_training(config_filepath, backend="nccl", with_clearml=True):
 
 
 def get_model_weights(config, logger, with_clearml):
-
     path = ""
     if with_clearml:
         from clearml import Model
@@ -389,7 +386,6 @@ def get_model_weights(config, logger, with_clearml):
 
 
 def evaluation(local_rank, config, logger, with_clearml):
-
     rank = idist.get_rank()
     device = idist.device()
     manual_seed(config.seed + local_rank)
@@ -472,5 +468,4 @@ def run_evaluation(config_filepath, backend="nccl", with_clearml=True):
 
 
 if __name__ == "__main__":
-
     fire.Fire({"download": download_datasets, "training": run_training, "eval": run_evaluation})
diff --git a/examples/references/segmentation/pascal_voc2012/utils.py b/examples/references/segmentation/pascal_voc2012/utils.py
index 799a6069afd4..45be888d0de5 100644
--- a/examples/references/segmentation/pascal_voc2012/utils.py
+++ b/examples/references/segmentation/pascal_voc2012/utils.py
@@ -6,7 +6,6 @@
 
 
 def initialize(config):
-
     device = idist.device()
 
     model = config.model.to(device)
diff --git a/examples/reinforcement_learning/actor_critic.py b/examples/reinforcement_learning/actor_critic.py
index 19c182cb2152..a62cfd72ee6f 100644
--- a/examples/reinforcement_learning/actor_critic.py
+++ b/examples/reinforcement_learning/actor_critic.py
@@ -122,7 +122,6 @@ def finish_episode(policy, optimizer, gamma):
 
 
 def main(env, args):
-
     policy = Policy()
     optimizer = optim.Adam(policy.parameters(), lr=3e-2)
     timesteps = range(10000)
@@ -185,7 +184,6 @@ def should_finish_training():
 
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(description="Ignite actor-critic example")
     parser.add_argument("--gamma", type=float, default=0.99, metavar="G", help="discount factor (default: 0.99)")
     parser.add_argument("--seed", type=int, default=543, metavar="N", help="random seed (default: 1)")
diff --git a/examples/reinforcement_learning/reinforce.py b/examples/reinforcement_learning/reinforce.py
index 3daabfa16a45..d964b7c6315d 100644
--- a/examples/reinforcement_learning/reinforce.py
+++ b/examples/reinforcement_learning/reinforce.py
@@ -70,7 +70,6 @@ def finish_episode(policy, optimizer, gamma):
 
 
 def main(env, args):
-
     policy = Policy()
     optimizer = optim.Adam(policy.parameters(), lr=1e-2)
     timesteps = range(10000)
@@ -123,7 +122,6 @@ def should_finish_training():
 
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(description="PyTorch REINFORCE example")
     parser.add_argument("--gamma", type=float, default=0.99, metavar="G", help="discount factor (default: 0.99)")
     parser.add_argument("--seed", type=int, default=543, metavar="N", help="random seed (default: 543)")
diff --git a/examples/siamese_network/siamese_network.py b/examples/siamese_network/siamese_network.py
index d0a1bfb7e3de..bf4be27629a5 100644
--- a/examples/siamese_network/siamese_network.py
+++ b/examples/siamese_network/siamese_network.py
@@ -65,7 +65,6 @@ def forward_once(self, x):
         return output
 
     def forward(self, input1, input2, input3):
-
         # pass the input through resnet
         output1 = self.forward_once(input1)
         output2 = self.forward_once(input2)
@@ -180,7 +179,6 @@ def calculate_loss(input1, input2):
 
 
 def run(args, model, device, optimizer, train_loader, test_loader, lr_scheduler):
-
     # using Triplet Margin Loss
     criterion = nn.TripletMarginLoss(p=2, margin=2.8)
 
diff --git a/examples/super_resolution/main.py b/examples/super_resolution/main.py
index 816d1caea7f2..08199a22e741 100644
--- a/examples/super_resolution/main.py
+++ b/examples/super_resolution/main.py
@@ -73,8 +73,12 @@ def __len__(self):
         return len(self.dataset)
 
 
-trainset = torchvision.datasets.Caltech101(root="./data", download=True)
-testset = torchvision.datasets.Caltech101(root="./data", download=False)
+try:
+    trainset = torchvision.datasets.Caltech101(root="./data", download=True)
+    testset = torchvision.datasets.Caltech101(root="./data", download=False)
+except RuntimeError:
+    print("Dataset download problem, exiting without error code")
+    exit(0)
 
 trainset_sr = SRDataset(trainset, scale_factor=opt.upscale_factor, crop_size=opt.crop_size)
 testset_sr = SRDataset(testset, scale_factor=opt.upscale_factor, crop_size=opt.crop_size)
diff --git a/examples/super_resolution/model.py b/examples/super_resolution/model.py
index 1f80c95d0643..4d2e3cab33ba 100644
--- a/examples/super_resolution/model.py
+++ b/examples/super_resolution/model.py
@@ -10,7 +10,7 @@ def __init__(self, upscale_factor):
         self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2))
         self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1))
         self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1))
-        self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1))
+        self.conv4 = nn.Conv2d(32, upscale_factor**2, (3, 3), (1, 1), (1, 1))
         self.pixel_shuffle = nn.PixelShuffle(upscale_factor)
 
         self._initialize_weights()
diff --git a/examples/contrib/transformers/README.md b/examples/transformers/README.md
similarity index 100%
rename from examples/contrib/transformers/README.md
rename to examples/transformers/README.md
diff --git a/examples/contrib/transformers/dataset.py b/examples/transformers/dataset.py
similarity index 100%
rename from examples/contrib/transformers/dataset.py
rename to examples/transformers/dataset.py
diff --git a/examples/contrib/transformers/main.py b/examples/transformers/main.py
similarity index 98%
rename from examples/contrib/transformers/main.py
rename to examples/transformers/main.py
index 8c60c1fd7a9c..c879812b98df 100644
--- a/examples/contrib/transformers/main.py
+++ b/examples/transformers/main.py
@@ -22,7 +22,6 @@
 
 
 def training(local_rank, config):
-
     rank = idist.get_rank()
     manual_seed(config["seed"] + rank)
     device = idist.device()
@@ -33,7 +32,6 @@ def training(local_rank, config):
 
     output_path = config["output_dir"]
     if rank == 0:
-
         now = datetime.now().strftime("%Y%m%d-%H%M%S")
         folder_name = f"{config['model']}_backend-{idist.backend()}-{idist.get_world_size()}_{now}"
         output_path = Path(output_path) / folder_name
@@ -46,11 +44,7 @@ def training(local_rank, config):
             config["cuda device name"] = torch.cuda.get_device_name(local_rank)
 
         if config["with_clearml"]:
-            try:
-                from clearml import Task
-            except ImportError:
-                # Backwards-compatibility for legacy Trains SDK
-                from trains import Task
+            from clearml import Task
 
             task = Task.init("IMDB-Training", task_name=output_path.stem)
             task.connect_configuration(config)
@@ -207,7 +201,6 @@ def run(
     spawn_kwargs["nproc_per_node"] = nproc_per_node
 
     with idist.Parallel(backend=backend, **spawn_kwargs) as parallel:
-
         parallel.run(training, config)
 
 
@@ -293,7 +286,6 @@ def log_basic_info(logger, config):
 
 
 def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, config, logger):
-
     device = idist.device()
 
     # Setup Ignite trainer:
@@ -309,7 +301,6 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     scaler = GradScaler(enabled=with_amp)
 
     def train_step(engine, batch):
-
         input_batch = batch[0]
         labels = batch[1].view(-1, 1)
 
diff --git a/examples/contrib/transformers/model.py b/examples/transformers/model.py
similarity index 100%
rename from examples/contrib/transformers/model.py
rename to examples/transformers/model.py
diff --git a/examples/contrib/transformers/requirements.txt b/examples/transformers/requirements.txt
similarity index 100%
rename from examples/contrib/transformers/requirements.txt
rename to examples/transformers/requirements.txt
diff --git a/examples/contrib/transformers/utils.py b/examples/transformers/utils.py
similarity index 100%
rename from examples/contrib/transformers/utils.py
rename to examples/transformers/utils.py
diff --git a/ignite/__init__.py b/ignite/__init__.py
index 0185adce1238..0e6f65ca8b5e 100644
--- a/ignite/__init__.py
+++ b/ignite/__init__.py
@@ -6,4 +6,4 @@
 import ignite.metrics
 import ignite.utils
 
-__version__ = "0.5.0"
+__version__ = "0.4.13"
diff --git a/ignite/base/mixins.py b/ignite/base/mixins.py
index 563ce66066f4..3ecb2922f039 100644
--- a/ignite/base/mixins.py
+++ b/ignite/base/mixins.py
@@ -4,7 +4,6 @@
 
 
 class Serializable:
-
     _state_dict_all_req_keys: Tuple = ()
     _state_dict_one_of_opt_keys: Tuple = ()
 
diff --git a/ignite/contrib/engines/common.py b/ignite/contrib/engines/common.py
index 95e4e09cb3b1..a688d2fb470a 100644
--- a/ignite/contrib/engines/common.py
+++ b/ignite/contrib/engines/common.py
@@ -33,6 +33,7 @@
 from ignite.handlers.checkpoint import BaseSaveHandler
 from ignite.handlers.param_scheduler import ParamScheduler
 from ignite.metrics import RunningAverage
+from ignite.metrics.metric import RunningBatchWise
 from ignite.utils import deprecated
 
 
@@ -48,6 +49,7 @@ def setup_common_training_handlers(
     with_pbars: bool = True,
     with_pbar_on_iters: bool = True,
     log_every_iters: int = 100,
+    device: Optional[Union[str, torch.device]] = None,
     stop_on_nan: bool = True,
     clear_cuda_cache: bool = True,
     save_handler: Optional[Union[Callable, BaseSaveHandler]] = None,
@@ -91,7 +93,10 @@ def setup_common_training_handlers(
             class to use to store ``to_save``. See :class:`~ignite.handlers.checkpoint.Checkpoint` for more details.
             Argument is mutually exclusive with ``output_path``.
         kwargs: optional keyword args to be passed to construct :class:`~ignite.handlers.checkpoint.Checkpoint`.
+        device: deprecated argument, it will be removed in 0.4.14.
     """
+    if device is not None:
+        warnings.warn("Argument device is unused and deprecated. It will be removed in 0.4.14")
 
     if idist.get_world_size() > 1:
         _setup_common_distrib_training_handlers(
@@ -176,7 +181,6 @@ def _setup_common_training_handlers(
         trainer.add_event_handler(Events.EPOCH_COMPLETED, empty_cuda_cache)
 
     if to_save is not None:
-
         if output_path is None and save_handler is None:
             raise ValueError(
                 "If to_save argument is provided then output_path or save_handler arguments should be also defined"
@@ -210,8 +214,8 @@ def output_transform(x: Any, index: int, name: str) -> Any:
                 )
 
         for i, n in enumerate(output_names):
-            RunningAverage(output_transform=partial(output_transform, index=i, name=n), epoch_bound=False).attach(
-                trainer, n
+            RunningAverage(output_transform=partial(output_transform, index=i, name=n)).attach(
+                trainer, n, usage=RunningBatchWise()
             )
 
     if with_pbars:
@@ -242,7 +246,6 @@ def _setup_common_distrib_training_handlers(
     save_handler: Optional[Union[Callable, BaseSaveHandler]] = None,
     **kwargs: Any,
 ) -> None:
-
     _setup_common_training_handlers(
         trainer,
         to_save=to_save,
@@ -266,7 +269,7 @@ def _setup_common_distrib_training_handlers(
 
         @trainer.on(Events.EPOCH_STARTED)
         def distrib_set_epoch(engine: Engine) -> None:
-            cast(DistributedSampler, train_sampler).set_epoch(engine.state.epoch - 1)
+            train_sampler.set_epoch(engine.state.epoch - 1)
 
 
 def empty_cuda_cache(_: Engine) -> None:
diff --git a/ignite/contrib/handlers/__init__.py b/ignite/contrib/handlers/__init__.py
index 2db80fd2fd9a..0a6fe3edd5cd 100644
--- a/ignite/contrib/handlers/__init__.py
+++ b/ignite/contrib/handlers/__init__.py
@@ -1,4 +1,5 @@
 from ignite.contrib.handlers.clearml_logger import ClearMLLogger
+from ignite.contrib.handlers.custom_events import CustomPeriodicEvent
 from ignite.contrib.handlers.mlflow_logger import MLflowLogger
 from ignite.contrib.handlers.neptune_logger import NeptuneLogger
 from ignite.contrib.handlers.polyaxon_logger import PolyaxonLogger
diff --git a/ignite/contrib/handlers/base_logger.py b/ignite/contrib/handlers/base_logger.py
index 28cd8f64ef4c..1c4133e25508 100644
--- a/ignite/contrib/handlers/base_logger.py
+++ b/ignite/contrib/handlers/base_logger.py
@@ -32,7 +32,6 @@ def __init__(
         tag: Optional[str] = None,
         whitelist: Optional[Union[List[str], Callable[[str, nn.Parameter], bool]]] = None,
     ):
-
         if not isinstance(model, torch.nn.Module):
             raise TypeError(f"Argument model should be of type torch.nn.Module, but given {type(model)}")
 
@@ -41,15 +40,12 @@ def __init__(
 
         weights = {}
         if whitelist is None:
-
             weights = dict(model.named_parameters())
         elif callable(whitelist):
-
             for n, p in model.named_parameters():
                 if whitelist(n, p):
                     weights[n] = p
         else:
-
             for n, p in model.named_parameters():
                 for item in whitelist:
                     if n.startswith(item):
@@ -91,7 +87,6 @@ def __init__(
         global_step_transform: Optional[Callable[[Engine, Union[str, Events]], int]] = None,
         state_attributes: Optional[List[str]] = None,
     ):
-
         if metric_names is not None:
             if not (isinstance(metric_names, list) or (isinstance(metric_names, str) and metric_names == "all")):
                 raise TypeError(
@@ -185,7 +180,6 @@ def __init__(
         tag: Optional[str] = None,
         whitelist: Optional[Union[List[str], Callable[[str, nn.Parameter], bool]]] = None,
     ):
-
         super(BaseWeightsScalarHandler, self).__init__(model, tag=tag, whitelist=whitelist)
 
         if not callable(reduction):
@@ -239,7 +233,6 @@ def attach(
             return RemovableEventHandle(event_name, log_handler, engine)
 
         else:
-
             if event_name not in State.event_to_attr:
                 raise RuntimeError(f"Unknown event name '{event_name}'")
 
diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py
index 1bfbb1abc00a..99d3db640bd0 100644
--- a/ignite/contrib/handlers/clearml_logger.py
+++ b/ignite/contrib/handlers/clearml_logger.py
@@ -325,7 +325,6 @@ def __init__(
         )
 
     def __call__(self, engine: Engine, logger: ClearMLLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, ClearMLLogger):
             raise RuntimeError("Handler OutputHandler works only with ClearMLLogger")
 
@@ -481,14 +480,12 @@ def has_bias_in_name(n, p):
     """
 
     def __call__(self, engine: Engine, logger: ClearMLLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, ClearMLLogger):
             raise RuntimeError("Handler WeightsScalarHandler works only with ClearMLLogger")
 
         global_step = engine.state.get_event_attrib_value(event_name)
         tag_prefix = f"{self.tag}/" if self.tag else ""
         for name, p in self.weights:
-
             title_name, _, series_name = name.partition(".")
             logger.clearml_logger.report_scalar(
                 title=f"{tag_prefix}weights_{self.reduction.__name__}/{title_name}",
@@ -579,7 +576,6 @@ def __call__(self, engine: Engine, logger: ClearMLLogger, event_name: Union[str,
         global_step = engine.state.get_event_attrib_value(event_name)
         tag_prefix = f"{self.tag}/" if self.tag else ""
         for name, p in self.weights:
-
             title_name, _, series_name = name.partition(".")
 
             logger.grad_helper.add_histogram(
@@ -826,7 +822,6 @@ def __init__(
         *args: Any,
         **kwargs: Any,
     ):
-
         self._setup_check_clearml(logger, output_uri)
 
         if not dirname:
diff --git a/ignite/contrib/handlers/custom_events.py b/ignite/contrib/handlers/custom_events.py
new file mode 100644
index 000000000000..7eaa65863d52
--- /dev/null
+++ b/ignite/contrib/handlers/custom_events.py
@@ -0,0 +1,139 @@
+import warnings
+
+from ignite.engine import EventEnum, Events, State
+
+
+class CustomPeriodicEvent:
+    """DEPRECATED. Use filtered events instead.
+    Handler to define custom periodic events as a number of elapsed iterations/epochs
+    for an engine.
+
+    When a custom periodic event is created and attached to an engine, the following events are fired:
+    1) K iterations is specified:
+    - `Events.ITERATIONS_<K>_STARTED`
+    - `Events.ITERATIONS_<K>_COMPLETED`
+
+    2) K epochs is specified:
+    - `Events.EPOCHS_<K>_STARTED`
+    - `Events.EPOCHS_<K>_COMPLETED`
+
+    The number of periods started so far is stored on the engine state as ``iterations_<K>`` or ``epochs_<K>``.
+
+    Examples:
+
+    .. code-block:: python
+
+        from ignite.engine import Engine, Events
+        from ignite.contrib.handlers import CustomPeriodicEvent
+
+        # Let's define an event every 1000 iterations
+        cpe1 = CustomPeriodicEvent(n_iterations=1000)
+        cpe1.attach(trainer)
+
+        # Let's define an event every 10 epochs
+        cpe2 = CustomPeriodicEvent(n_epochs=10)
+        cpe2.attach(trainer)
+
+        @trainer.on(cpe1.Events.ITERATIONS_1000_COMPLETED)
+        def on_every_1000_iterations(engine):
+            # run a computation after 1000 iterations
+            # ...
+            print(engine.state.iterations_1000)
+
+        @trainer.on(cpe2.Events.EPOCHS_10_STARTED)
+        def on_every_10_epochs(engine):
+            # run a computation every 10 epochs
+            # ...
+            print(engine.state.epochs_10)
+
+
+    Args:
+        n_iterations (int, optional): number of iterations of the custom periodic event.
+        n_epochs (int, optional): number of epochs of the custom periodic event. Both arguments are optional,
+            but exactly one of n_iterations or n_epochs should be defined.
+
+    Raises:
+        TypeError: if the given argument is not an integer.
+        ValueError: if the given argument is smaller than 1, or if neither or both arguments are given.
+    """
+
+    def __init__(self, n_iterations=None, n_epochs=None):
+        warnings.warn(
+            "CustomPeriodicEvent is deprecated since 0.4.0 and will be removed in 0.4.14. Use filtered events instead.",
+            DeprecationWarning,
+            # Attribute the warning to the code instantiating the handler rather than to this module.
+            stacklevel=2,
+        )
+
+        if n_iterations is not None:
+            if not isinstance(n_iterations, int):
+                raise TypeError("Argument n_iterations should be an integer")
+            if n_iterations < 1:
+                raise ValueError("Argument n_iterations should be positive")
+
+        if n_epochs is not None:
+            if not isinstance(n_epochs, int):
+                raise TypeError("Argument n_epochs should be an integer")
+            if n_epochs < 1:
+                raise ValueError("Argument n_epochs should be positive")
+
+        # Exactly one of the two periods must be given.
+        if (n_iterations is None) == (n_epochs is None):
+            raise ValueError("Either n_iterations or n_epochs should be defined")
+
+        if n_iterations is not None:
+            prefix = "iterations"
+            self.state_attr = "iteration"
+            self.period = n_iterations
+        else:
+            prefix = "epochs"
+            self.state_attr = "epoch"
+            self.period = n_epochs
+
+        self.custom_state_attr = "{}_{}".format(prefix, self.period)
+        event_name = "{}_{}".format(prefix.upper(), self.period)
+        started_name = "{}_STARTED".format(event_name)
+        completed_name = "{}_COMPLETED".format(event_name)
+        self.Events = EventEnum("Events", " ".join([started_name, completed_name]))
+
+        # Update State.event_to_attr: map both custom events to the state attribute holding the period counter.
+        # Note that the mapping is stored on the State class itself, not on a particular engine state.
+        for e in self.Events:
+            State.event_to_attr[e] = self.custom_state_attr
+
+        # Create aliases
+        self._periodic_event_started = getattr(self.Events, started_name)
+        self._periodic_event_completed = getattr(self.Events, completed_name)
+
+    def _on_started(self, engine):
+        """Set the period counter on the engine state to zero (hooked on ``Events.STARTED``)."""
+        setattr(engine.state, self.custom_state_attr, 0)
+
+    def _on_periodic_event_started(self, engine):
+        """Increment the period counter and fire the custom ``*_STARTED`` event on the first step of a period."""
+        # A period starts at iteration/epoch 1, 1 + period, 1 + 2 * period, ...
+        # Comparing `value % period` with 1 would never match for period == 1 (the remainder is always 0).
+        if (getattr(engine.state, self.state_attr) - 1) % self.period == 0:
+            setattr(engine.state, self.custom_state_attr, getattr(engine.state, self.custom_state_attr) + 1)
+            engine.fire_event(self._periodic_event_started)
+
+    def _on_periodic_event_completed(self, engine):
+        """Fire the custom ``*_COMPLETED`` event on the last step of a period."""
+        if getattr(engine.state, self.state_attr) % self.period == 0:
+            engine.fire_event(self._periodic_event_completed)
+
+    def attach(self, engine):
+        """Register the custom events on ``engine`` and hook the handlers that fire them.
+
+        Args:
+            engine: engine on which the custom periodic events are registered and fired.
+        """
+        engine.register_events(*self.Events)
+
+        engine.add_event_handler(Events.STARTED, self._on_started)
+        engine.add_event_handler(
+            getattr(Events, "{}_STARTED".format(self.state_attr.upper())), self._on_periodic_event_started
+        )
+        engine.add_event_handler(
+            getattr(Events, "{}_COMPLETED".format(self.state_attr.upper())), self._on_periodic_event_completed
+        )
diff --git a/ignite/contrib/handlers/mlflow_logger.py b/ignite/contrib/handlers/mlflow_logger.py
index 72cab7133e49..edd71ee7006a 100644
--- a/ignite/contrib/handlers/mlflow_logger.py
+++ b/ignite/contrib/handlers/mlflow_logger.py
@@ -101,7 +101,6 @@ def __init__(self, tracking_uri: Optional[str] = None):
             self.active_run = mlflow.start_run()
 
     def __getattr__(self, attr: Any) -> Any:
-
         import mlflow
 
         return getattr(mlflow, attr)
@@ -230,7 +229,6 @@ def __init__(
         )
 
     def __call__(self, engine: Engine, logger: MLflowLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, MLflowLogger):
             raise TypeError("Handler 'OutputHandler' works only with MLflowLogger")
 
diff --git a/ignite/contrib/handlers/neptune_logger.py b/ignite/contrib/handlers/neptune_logger.py
index 94e0c00a238e..41e4909dfff7 100644
--- a/ignite/contrib/handlers/neptune_logger.py
+++ b/ignite/contrib/handlers/neptune_logger.py
@@ -327,7 +327,6 @@ def __init__(
         )
 
     def __call__(self, engine: Engine, logger: NeptuneLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, NeptuneLogger):
             raise TypeError("Handler OutputHandler works only with NeptuneLogger")
 
@@ -491,7 +490,6 @@ def has_bias_in_name(n, p):
     """
 
     def __call__(self, engine: Engine, logger: NeptuneLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, NeptuneLogger):
             raise TypeError("Handler WeightsScalarHandler works only with NeptuneLogger")
 
diff --git a/ignite/contrib/handlers/polyaxon_logger.py b/ignite/contrib/handlers/polyaxon_logger.py
index 8ec45983aa34..2a358cb7da94 100644
--- a/ignite/contrib/handlers/polyaxon_logger.py
+++ b/ignite/contrib/handlers/polyaxon_logger.py
@@ -238,7 +238,6 @@ def __init__(
         )
 
     def __call__(self, engine: Engine, logger: PolyaxonLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, PolyaxonLogger):
             raise RuntimeError("Handler 'OutputHandler' works only with PolyaxonLogger")
 
diff --git a/ignite/contrib/handlers/tensorboard_logger.py b/ignite/contrib/handlers/tensorboard_logger.py
index 1cf8a393b822..531c25cf88c3 100644
--- a/ignite/contrib/handlers/tensorboard_logger.py
+++ b/ignite/contrib/handlers/tensorboard_logger.py
@@ -287,7 +287,6 @@ def __init__(
         )
 
     def __call__(self, engine: Engine, logger: TensorboardLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, TensorboardLogger):
             raise RuntimeError("Handler 'OutputHandler' works only with TensorboardLogger")
 
@@ -422,14 +421,12 @@ def has_bias_in_name(n, p):
     """
 
     def __call__(self, engine: Engine, logger: TensorboardLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, TensorboardLogger):
             raise RuntimeError("Handler 'WeightsScalarHandler' works only with TensorboardLogger")
 
         global_step = engine.state.get_event_attrib_value(event_name)
         tag_prefix = f"{self.tag}/" if self.tag else ""
         for name, p in self.weights:
-
             name = name.replace(".", "/")
             logger.writer.add_scalar(
                 f"{tag_prefix}weights_{self.reduction.__name__}/{name}",
@@ -509,7 +506,6 @@ def __call__(self, engine: Engine, logger: TensorboardLogger, event_name: Union[
         global_step = engine.state.get_event_attrib_value(event_name)
         tag_prefix = f"{self.tag}/" if self.tag else ""
         for name, p in self.weights:
-
             name = name.replace(".", "/")
             logger.writer.add_histogram(
                 tag=f"{tag_prefix}weights/{name}", values=p.data.cpu().numpy(), global_step=global_step
diff --git a/ignite/contrib/handlers/tqdm_logger.py b/ignite/contrib/handlers/tqdm_logger.py
index fd909745337e..37d79b7c4a0b 100644
--- a/ignite/contrib/handlers/tqdm_logger.py
+++ b/ignite/contrib/handlers/tqdm_logger.py
@@ -90,7 +90,7 @@ class ProgressBar(BaseLogger):
 
 
     Note:
-        When adding attaching the progress bar to an engine, it is recommend that you replace
+        When attaching the progress bar to an engine, it is recommended that you replace
         every print operation in the engine's handlers triggered every iteration with
         ``pbar.log_message`` to guarantee the correct format of the stdout.
 
@@ -122,7 +122,6 @@ def __init__(
         ] = "{desc}[{n_fmt}/{total_fmt}] {percentage:3.0f}%|{bar}{postfix} [{elapsed}<{remaining}]",
         **tqdm_kwargs: Any,
     ):
-
         try:
             from tqdm.autonotebook import tqdm
         except ImportError:
@@ -281,7 +280,6 @@ def get_max_number_events(event_name: Union[str, Events, CallableEventWithFilter
         return 1
 
     def __call__(self, engine: Engine, logger: ProgressBar, event_name: Union[str, Events]) -> None:
-
         pbar_total = self.get_max_number_events(event_name, engine)
         if logger.pbar is None:
             logger._reset(pbar_total=pbar_total)
diff --git a/ignite/contrib/handlers/visdom_logger.py b/ignite/contrib/handlers/visdom_logger.py
index ee2408cb54fe..31a09d8f782c 100644
--- a/ignite/contrib/handlers/visdom_logger.py
+++ b/ignite/contrib/handlers/visdom_logger.py
@@ -43,7 +43,7 @@ class VisdomLogger(BaseLogger):
             visdom server. Default, `num_workers=1`. If `num_workers=0` and logger uses the main thread. If using
             Python 2.7 and `num_workers>0` the package `futures` should be installed: `pip install futures`
         kwargs: kwargs to pass into
-            `visdom.Visdom <https://github.com/fossasia/visdom#user-content-visdom-arguments-python-only>`_.
+            `visdom.Visdom <https://github.com/fossasia/visdom#visdom-arguments-python-only>`_.
 
     Note:
         We can also specify username/password using environment variables: VISDOM_USERNAME, VISDOM_PASSWORD
@@ -364,7 +364,6 @@ def __init__(
         _BaseVisDrawer.__init__(self, show_legend=show_legend)
 
     def __call__(self, engine: Engine, logger: VisdomLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, VisdomLogger):
             raise RuntimeError("Handler 'OutputHandler' works only with VisdomLogger")
 
@@ -473,7 +472,6 @@ def __init__(
         _BaseVisDrawer.__init__(self, show_legend=show_legend)
 
     def __call__(self, engine: Engine, logger: VisdomLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, VisdomLogger):
             raise RuntimeError("Handler 'WeightsScalarHandler' works only with VisdomLogger")
 
diff --git a/ignite/contrib/handlers/wandb_logger.py b/ignite/contrib/handlers/wandb_logger.py
index 3670c65e8556..49417d1180f9 100644
--- a/ignite/contrib/handlers/wandb_logger.py
+++ b/ignite/contrib/handlers/wandb_logger.py
@@ -278,7 +278,6 @@ def __init__(
         self.sync = sync
 
     def __call__(self, engine: Engine, logger: WandBLogger, event_name: Union[str, Events]) -> None:
-
         if not isinstance(logger, WandBLogger):
             raise RuntimeError(f"Handler '{self.__class__.__name__}' works only with WandBLogger.")
 
diff --git a/ignite/contrib/metrics/average_precision.py b/ignite/contrib/metrics/average_precision.py
index d206034c6ba6..5aae0848ddf3 100644
--- a/ignite/contrib/metrics/average_precision.py
+++ b/ignite/contrib/metrics/average_precision.py
@@ -68,7 +68,6 @@ def __init__(
         check_compute_fn: bool = False,
         device: Union[str, torch.device] = torch.device("cpu"),
     ):
-
         try:
             from sklearn.metrics import average_precision_score  # noqa: F401
         except ImportError:
diff --git a/ignite/contrib/metrics/cohen_kappa.py b/ignite/contrib/metrics/cohen_kappa.py
index 0cbb492b360b..942a394fb7e4 100644
--- a/ignite/contrib/metrics/cohen_kappa.py
+++ b/ignite/contrib/metrics/cohen_kappa.py
@@ -55,7 +55,6 @@ def __init__(
         check_compute_fn: bool = False,
         device: Union[str, torch.device] = torch.device("cpu"),
     ):
-
         try:
             from sklearn.metrics import cohen_kappa_score  # noqa: F401
         except ImportError:
diff --git a/ignite/contrib/metrics/regression/canberra_metric.py b/ignite/contrib/metrics/regression/canberra_metric.py
index 0ec011f87696..177e278e5646 100644
--- a/ignite/contrib/metrics/regression/canberra_metric.py
+++ b/ignite/contrib/metrics/regression/canberra_metric.py
@@ -63,6 +63,7 @@ class CanberraMetric(_BaseRegression):
         - Fixed implementation: ``abs`` in denominator.
         - Works with DDP.
     """
+    _state_dict_all_req_keys = ("_sum_of_errors",)
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/fractional_absolute_error.py b/ignite/contrib/metrics/regression/fractional_absolute_error.py
index 80d88bf11d34..17934a133395 100644
--- a/ignite/contrib/metrics/regression/fractional_absolute_error.py
+++ b/ignite/contrib/metrics/regression/fractional_absolute_error.py
@@ -58,6 +58,7 @@ class FractionalAbsoluteError(_BaseRegression):
     .. versionchanged:: 0.4.5
         - Works with DDP.
     """
+    _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples")
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/fractional_bias.py b/ignite/contrib/metrics/regression/fractional_bias.py
index d6516a8e1d43..9b6354f71628 100644
--- a/ignite/contrib/metrics/regression/fractional_bias.py
+++ b/ignite/contrib/metrics/regression/fractional_bias.py
@@ -58,6 +58,7 @@ class FractionalBias(_BaseRegression):
     .. versionchanged:: 0.4.5
         - Works with DDP.
     """
+    _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples")
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/geometric_mean_absolute_error.py b/ignite/contrib/metrics/regression/geometric_mean_absolute_error.py
index 50f9a4d70575..ee717b32df32 100644
--- a/ignite/contrib/metrics/regression/geometric_mean_absolute_error.py
+++ b/ignite/contrib/metrics/regression/geometric_mean_absolute_error.py
@@ -58,6 +58,7 @@ class GeometricMeanAbsoluteError(_BaseRegression):
     .. versionchanged:: 0.4.5
         - Works with DDP.
     """
+    _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples")
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/geometric_mean_relative_absolute_error.py b/ignite/contrib/metrics/regression/geometric_mean_relative_absolute_error.py
index 80d8b21d7424..79c377f29161 100644
--- a/ignite/contrib/metrics/regression/geometric_mean_relative_absolute_error.py
+++ b/ignite/contrib/metrics/regression/geometric_mean_relative_absolute_error.py
@@ -69,6 +69,7 @@ class GeometricMeanRelativeAbsoluteError(_BaseRegression):
 
             0.0...
     """
+    _state_dict_all_req_keys = ("_predictions", "_targets")
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/manhattan_distance.py b/ignite/contrib/metrics/regression/manhattan_distance.py
index 42c25944aebf..dae7a3acae11 100644
--- a/ignite/contrib/metrics/regression/manhattan_distance.py
+++ b/ignite/contrib/metrics/regression/manhattan_distance.py
@@ -59,6 +59,7 @@ class ManhattanDistance(_BaseRegression):
         - Fixed sklearn compatibility.
         - Workes with DDP.
     """
+    _state_dict_all_req_keys = ("_sum_of_errors",)
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/maximum_absolute_error.py b/ignite/contrib/metrics/regression/maximum_absolute_error.py
index 686599689d2a..f9c9a33550e5 100644
--- a/ignite/contrib/metrics/regression/maximum_absolute_error.py
+++ b/ignite/contrib/metrics/regression/maximum_absolute_error.py
@@ -58,6 +58,7 @@ class MaximumAbsoluteError(_BaseRegression):
     .. versionchanged:: 0.4.5
         - Works with DDP.
     """
+    _state_dict_all_req_keys = ("_max_of_absolute_errors",)
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/mean_absolute_relative_error.py b/ignite/contrib/metrics/regression/mean_absolute_relative_error.py
index 0d546da146e0..de0cc98c8c69 100644
--- a/ignite/contrib/metrics/regression/mean_absolute_relative_error.py
+++ b/ignite/contrib/metrics/regression/mean_absolute_relative_error.py
@@ -58,6 +58,7 @@ class MeanAbsoluteRelativeError(_BaseRegression):
     .. versionchanged:: 0.4.5
         - Works with DDP.
     """
+    _state_dict_all_req_keys = ("_sum_of_absolute_relative_errors", "_num_samples")
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/mean_error.py b/ignite/contrib/metrics/regression/mean_error.py
index 50f0b26a56c0..96ed1a058856 100644
--- a/ignite/contrib/metrics/regression/mean_error.py
+++ b/ignite/contrib/metrics/regression/mean_error.py
@@ -55,6 +55,7 @@ class MeanError(_BaseRegression):
 
             0.625...
     """
+    _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples")
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/mean_normalized_bias.py b/ignite/contrib/metrics/regression/mean_normalized_bias.py
index 14cebd682968..93f7ef4b9ec8 100644
--- a/ignite/contrib/metrics/regression/mean_normalized_bias.py
+++ b/ignite/contrib/metrics/regression/mean_normalized_bias.py
@@ -58,6 +58,7 @@ class MeanNormalizedBias(_BaseRegression):
     .. versionchanged:: 0.4.5
         - Works with DDP.
     """
+    _state_dict_all_req_keys = ("_sum_of_errors", "_num_examples")
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/median_absolute_error.py b/ignite/contrib/metrics/regression/median_absolute_error.py
index 46988bcc9e0c..d7f376a323bb 100644
--- a/ignite/contrib/metrics/regression/median_absolute_error.py
+++ b/ignite/contrib/metrics/regression/median_absolute_error.py
@@ -67,7 +67,6 @@ class MedianAbsoluteError(EpochMetric):
     def __init__(
         self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu")
     ):
-
         super(MedianAbsoluteError, self).__init__(
             median_absolute_error_compute_fn, output_transform=output_transform, device=device
         )
diff --git a/ignite/contrib/metrics/regression/r2_score.py b/ignite/contrib/metrics/regression/r2_score.py
index 185afe54cb30..d68cc616f9a0 100644
--- a/ignite/contrib/metrics/regression/r2_score.py
+++ b/ignite/contrib/metrics/regression/r2_score.py
@@ -56,6 +56,7 @@ class R2Score(_BaseRegression):
     .. versionchanged:: 0.4.3
         Works with DDP.
     """
+    _state_dict_all_req_keys = ("_num_examples", "_sum_of_errors", "_y_sq_sum", "_y_sum")
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/regression/wave_hedges_distance.py b/ignite/contrib/metrics/regression/wave_hedges_distance.py
index 175aaf80532a..1e4475a7f17c 100644
--- a/ignite/contrib/metrics/regression/wave_hedges_distance.py
+++ b/ignite/contrib/metrics/regression/wave_hedges_distance.py
@@ -57,6 +57,7 @@ class WaveHedgesDistance(_BaseRegression):
     .. versionchanged:: 0.4.5
         - Works with DDP.
     """
+    _state_dict_all_req_keys = ("_sum_of_errors",)
 
     @reinit__is_reduced
     def reset(self) -> None:
diff --git a/ignite/contrib/metrics/roc_auc.py b/ignite/contrib/metrics/roc_auc.py
index b7e86e18b1a8..381e27158614 100644
--- a/ignite/contrib/metrics/roc_auc.py
+++ b/ignite/contrib/metrics/roc_auc.py
@@ -79,7 +79,6 @@ def __init__(
         check_compute_fn: bool = False,
         device: Union[str, torch.device] = torch.device("cpu"),
     ):
-
         try:
             from sklearn.metrics import roc_auc_score  # noqa: F401
         except ImportError:
@@ -140,7 +139,7 @@ def sigmoid_output_transform(output):
 
             FPR [0.0, 0.333, 0.333, 1.0]
             TPR [0.0, 0.0, 1.0, 1.0]
-            Thresholds [2.0, 1.0, 0.711, 0.047]
+            Thresholds [inf, 1.0, 0.711, 0.047]
 
     ..  versionchanged:: 0.4.11
         added `device` argument
@@ -152,7 +151,6 @@ def __init__(
         check_compute_fn: bool = False,
         device: Union[str, torch.device] = torch.device("cpu"),
     ) -> None:
-
         try:
             from sklearn.metrics import roc_curve  # noqa: F401
         except ImportError:
diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py
index 9eeacaa2f6d0..70d1950c633f 100644
--- a/ignite/distributed/auto.py
+++ b/ignite/distributed/auto.py
@@ -1,5 +1,5 @@
 import warnings
-from typing import Any, Callable, Iterator, List, Optional, Union
+from typing import Any, Iterator, List, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -57,7 +57,8 @@ def auto_dataloader(dataset: Dataset, **kwargs: Any) -> Union[DataLoader, "_MpDe
             )
 
     .. _torch DataLoader: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
-    .. _XLA MpDeviceLoader: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/parallel_loader.py#L178
+    .. _XLA MpDeviceLoader:
+        https://pytorch.org/xla/release/2.0/index.html#running-on-multiple-xla-devices-with-multi-processing
     .. _torch DistributedSampler:
         https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
     .. _torch IterableDataset: https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset
@@ -118,7 +119,6 @@ def auto_dataloader(dataset: Dataset, **kwargs: Any) -> Union[DataLoader, "_MpDe
     dataloader = DataLoader(dataset, **kwargs)
 
     if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and world_size > 1:
-
         logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA")
 
         mp_device_loader_cls = _MpDeviceLoader
@@ -256,7 +256,7 @@ def auto_optim(optimizer: Optimizer, **kwargs: Any) -> Optimizer:
 
             optimizer = idist.auto_optim(optimizer)
 
-    .. _xm.optimizer_step: http://pytorch.org/xla/release/1.5/index.html#torch_xla.core.xla_model.optimizer_step
+    .. _xm.optimizer_step: https://pytorch.org/xla/release/1.5/index.html#torch_xla.core.xla_model.optimizer_step
 
     .. versionchanged:: 0.4.2
         Added Horovod distributed optimizer.
@@ -294,7 +294,6 @@ class DistributedProxySampler(DistributedSampler):
     """
 
     def __init__(self, sampler: Sampler, num_replicas: Optional[int] = None, rank: Optional[int] = None) -> None:
-
         if not isinstance(sampler, Sampler):
             raise TypeError(f"Argument sampler should be instance of torch Sampler, but given: {type(sampler)}")
 
@@ -329,7 +328,6 @@ def __iter__(self) -> Iterator:
 
 
 if idist.has_xla_support:
-
     import torch_xla.core.xla_model as xm
     from torch_xla.distributed.parallel_loader import ParallelLoader
 
@@ -353,5 +351,5 @@ def __init__(self, optimizer: Optimizer) -> None:
             super(self.__class__, self).__init__(optimizer.param_groups)  # type: ignore[call-arg]
             self.wrapped_optimizer = optimizer
 
-        def step(self, closure: Optional[Callable] = None) -> None:
+        def step(self, closure: Any = None) -> Any:
             xm.optimizer_step(self.wrapped_optimizer, barrier=True)
diff --git a/ignite/distributed/comp_models/__init__.py b/ignite/distributed/comp_models/__init__.py
index 8f35f0b0e228..ef0576e8cdf8 100644
--- a/ignite/distributed/comp_models/__init__.py
+++ b/ignite/distributed/comp_models/__init__.py
@@ -11,9 +11,9 @@
     from ignite.distributed.comp_models.xla import _XlaDistModel
 
 
-def setup_available_computation_models() -> Tuple[
-    Type[Union[_SerialModel, "_NativeDistModel", "_XlaDistModel", "_HorovodDistModel"]], ...
-]:
+def setup_available_computation_models() -> (
+    Tuple[Type[Union[_SerialModel, "_NativeDistModel", "_XlaDistModel", "_HorovodDistModel"]], ...]
+):
     models: List[Type[Union[_SerialModel, "_NativeDistModel", "_XlaDistModel", "_HorovodDistModel"]]] = [
         _SerialModel,
     ]
diff --git a/ignite/distributed/comp_models/base.py b/ignite/distributed/comp_models/base.py
index 7cd4061d5190..00d4383d1ac6 100644
--- a/ignite/distributed/comp_models/base.py
+++ b/ignite/distributed/comp_models/base.py
@@ -136,7 +136,6 @@ def _decode_as_placeholder(encoded_msg: List[int], device: torch.device) -> Unio
     def _setup_placeholder(
         self, x: Union[torch.Tensor, float, str, None], device: torch.device, is_src: bool
     ) -> Union[torch.Tensor, float, str]:
-
         encoded_msg_per_rank = self._encode_input_data(x, is_src)
         encoded_msg_all_ranks = self._do_all_reduce(torch.tensor(encoded_msg_per_rank, device=device), op="MAX")
 
@@ -182,13 +181,16 @@ def _apply_op(
         return tensor
 
     def _collective_op(
-        self, tensor: Union[torch.Tensor, float, str], fn: Callable, *args: Any, **kwargs: Any
+        self, tensor: Union[torch.Tensor, Number, str], fn: Callable, *args: Any, **kwargs: Any
     ) -> Union[torch.Tensor, float, List[float], List[str]]:
         tensor_to_number = tensor_to_str = False
         device = self.device()
         if isinstance(tensor, (Number, float)):
             tensor_to_number = True
-            tensor = torch.tensor(tensor, device=device, dtype=self._collective_op_dtype)
+            dtype = self._collective_op_dtype
+            if dtype is None and isinstance(tensor, float):
+                dtype = torch.double
+            tensor = torch.tensor(tensor, device=device, dtype=dtype)
         elif isinstance(tensor, str):
             tensor_to_str = True
             max_length = self._get_max_length(tensor, device)
@@ -197,10 +199,7 @@ def _collective_op(
         tensor = self._apply_op(tensor, device, fn, *args, **kwargs)
 
         if tensor_to_number:
-            if tensor.numel() == 1:
-                return tensor.item()
-            else:
-                return tensor.tolist()
+            return tensor.tolist()
         elif tensor_to_str:
             return self._decode_str(tensor)
         return tensor
@@ -214,10 +213,10 @@ def all_reduce(
         return cast(Union[torch.Tensor, float], self._collective_op(tensor, self._do_all_reduce, op, group=group))
 
     def all_gather(
-        self, tensor: Union[torch.Tensor, float, str], group: Optional[Any] = None
-    ) -> Union[torch.Tensor, float, List[float], List[str]]:
+        self, tensor: Union[torch.Tensor, float, str, Any], group: Optional[Any] = None
+    ) -> Union[torch.Tensor, float, List[float], List[str], List[Any]]:
         if not isinstance(tensor, (torch.Tensor, Number, str)):
-            raise TypeError(f"Unhandled input type {type(tensor)}")
+            return self._do_all_gather_object(tensor, group=group)
 
         return self._collective_op(tensor, self._do_all_gather, group=group)
 
@@ -280,6 +279,10 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[
     def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
         pass
 
+    @abstractmethod
+    def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> List[Any]:
+        pass  # backend hook: gather an arbitrary Python object; declared to return a list of gathered objects
+
     @abstractmethod
     def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
         pass
@@ -352,11 +355,11 @@ def all_reduce(
         return tensor
 
     def all_gather(
-        self, tensor: Union[torch.Tensor, float, str], group: Optional[Any] = None
-    ) -> Union[torch.Tensor, float, List[float], List[str]]:
+        self, tensor: Union[torch.Tensor, float, str, Any], group: Optional[Any] = None
+    ) -> Union[torch.Tensor, float, List[float], List[str], List[Any]]:
         if isinstance(tensor, torch.Tensor):
             return tensor
-        return cast(Union[List[float], List[str]], [tensor])
+        return cast(Union[List[float], List[str], List[Any]], [tensor])
 
     def broadcast(
         self, tensor: Union[torch.Tensor, float, str, None], src: int = 0, safe_mode: bool = False
@@ -371,6 +374,9 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[
     def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
         return tensor
 
+    def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> Any:
+        return tensor  # pass-through: no gathering, the object is returned unwrapped and `group` is unused
+
     def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any:
         return ranks
 
diff --git a/ignite/distributed/comp_models/horovod.py b/ignite/distributed/comp_models/horovod.py
index 3a6226a12bda..36f15f4428db 100644
--- a/ignite/distributed/comp_models/horovod.py
+++ b/ignite/distributed/comp_models/horovod.py
@@ -21,7 +21,6 @@
 
 
 if has_hvd_support:
-
     HOROVOD = "horovod"
 
     class _HorovodDistModel(ComputationModel):
@@ -193,6 +192,12 @@ def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> t
                 tensor = tensor.unsqueeze(0)
             return hvd.allgather(tensor)
 
+        def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> List[Any]:
+            if group is not None:  # gathering within a sub-group is not supported here
+                raise NotImplementedError("all_gather with group for horovod is not implemented")
+
+            return hvd.allgather_object(tensor)  # result of the Horovod call is returned unchanged
+
         def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any:
             return hvd.ProcessSet(ranks)
 
diff --git a/ignite/distributed/comp_models/native.py b/ignite/distributed/comp_models/native.py
index e0f63cabbaa8..c71c7d423119 100644
--- a/ignite/distributed/comp_models/native.py
+++ b/ignite/distributed/comp_models/native.py
@@ -15,7 +15,6 @@
 
 
 if has_native_dist_support:
-
     NCCL = dist.Backend.NCCL
     GLOO = dist.Backend.GLOO
     MPI = dist.Backend.MPI
@@ -196,7 +195,6 @@ def _compute_local_rank_via_hostname(self) -> int:
             return local_rank
 
         def _identify_local_rank(self) -> None:
-
             if "SLURM_JOB_ID" in os.environ:
                 os.environ["LOCAL_RANK"] = os.environ["SLURM_LOCALID"]
 
@@ -216,7 +214,6 @@ def _identify_local_rank(self) -> None:
                 self._local_rank = self._compute_local_rank_via_hostname()
 
         def setup_env_vars(self, rank: Optional[int] = None, world_size: Optional[int] = None) -> None:
-
             self._env_backup = os.environ.copy()
 
             if "SLURM_JOB_ID" in os.environ:
@@ -426,6 +423,7 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[
             if group is not None and not isinstance(group, dist.ProcessGroup):
                 raise ValueError("Argument group should be list of int or ProcessGroup")
             reduce_op = self._reduce_op_map[op]
+            # We do if/else here for compatibility with older pytorch versions
             if group is not None:
                 dist.all_reduce(tensor, reduce_op, group=group)
             else:
@@ -433,17 +431,50 @@ def _do_all_reduce(self, tensor: torch.Tensor, op: str = "SUM", group: Optional[
             return tensor
 
         def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> torch.Tensor:
-            if group is not None and not isinstance(group, dist.ProcessGroup):
+            if group == dist.GroupMember.NON_GROUP_MEMBER:
+                return tensor
+
+            if group is None:
+                group_size = self.get_world_size()
+            elif isinstance(group, dist.ProcessGroup):
+                group_size = group.size()
+            else:
                 raise ValueError("Argument group should be list of int or ProcessGroup")
             if tensor.ndimension() == 0:
                 tensor = tensor.unsqueeze(0)
-            output = [torch.zeros_like(tensor) for _ in range(self.get_world_size())]
+            output = [torch.zeros_like(tensor) for _ in range(group_size)]
+            # We do if/else here for compatibility with older pytorch versions
             if group is not None:
                 dist.all_gather(output, tensor, group=group)
             else:
                 dist.all_gather(output, tensor)
             return torch.cat(output, dim=0)
 
+        def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> List[Any]:
+            if Version(torch.__version__) < Version("1.7.0"):
+                raise RuntimeError(
+                    "Current torch version does not implement dist.all_gather_object. "
+                    "Required version should be >=1.7.0"
+                )
+
+            if group == dist.GroupMember.NON_GROUP_MEMBER:
+                return tensor
+
+            if group is None:
+                group_size = self.get_world_size()
+            elif isinstance(group, dist.ProcessGroup):
+                group_size = group.size()
+            else:
+                raise ValueError("Argument group should be list of int or ProcessGroup")
+            output = [None for _ in range(group_size)]
+            # We do if/else here for compatibility with older pytorch versions
+            if group is not None:
+                dist.all_gather_object(output, tensor, group=group)
+            else:
+                dist.all_gather_object(output, tensor)
+
+            return output
+
         def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any:
             return dist.new_group(ranks=ranks, **kwargs)
 
@@ -475,7 +506,6 @@ def _expand_hostlist(nodelist: str) -> List[str]:
         nodelist = nodelist.replace(" ", "")
 
         for node in re.findall(nodelist_match, nodelist):
-
             node_match = r"(.+)\[((,?[0-9]+-?,?-?){0,})\](.*)?"
 
             match = re.search(node_match, node)
diff --git a/ignite/distributed/comp_models/xla.py b/ignite/distributed/comp_models/xla.py
index c6f6a68a5a1c..eaaeceb02520 100644
--- a/ignite/distributed/comp_models/xla.py
+++ b/ignite/distributed/comp_models/xla.py
@@ -15,7 +15,6 @@
 
 
 if has_xla_support:
-
     XLA_TPU = "xla-tpu"
 
     class _XlaDistModel(ComputationModel):
@@ -156,6 +155,9 @@ def _do_all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None) -> t
             xm.all_reduce("sum", [output], groups=group)
             return output.reshape(-1, *output.shape[2:])
 
+        def _do_all_gather_object(self, tensor: Any, group: Optional[Any] = None) -> List[Any]:
+            raise NotImplementedError("all_gather on object is not implemented for xla")
+
         def _do_new_group(self, ranks: List[int], **kwargs: Any) -> Any:
             return [ranks]
 
diff --git a/ignite/distributed/utils.py b/ignite/distributed/utils.py
index 0d885b467917..0249a5150d3a 100644
--- a/ignite/distributed/utils.py
+++ b/ignite/distributed/utils.py
@@ -1,4 +1,5 @@
 import socket
+from contextlib import contextmanager
 from functools import wraps
 from typing import Any, Callable, List, Mapping, Optional, Tuple, Union
 
@@ -41,6 +42,7 @@
     "registered_computation_models",
     "one_rank_only",
     "new_group",
+    "one_rank_first",
 ]
 
 _model = _SerialModel()
@@ -303,7 +305,7 @@ def train_fn(local_rank, a, b, c, d=12):
 
     .. _dist.init_process_group: https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group
     .. _mp.start_processes: https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn
-    .. _xmp.spawn: http://pytorch.org/xla/release/1.6/index.html#torch_xla.distributed.xla_multiprocessing.spawn
+    .. _xmp.spawn: https://pytorch.org/xla/release/1.6/index.html#torch_xla.distributed.xla_multiprocessing.spawn
     .. _hvd_run: https://horovod.readthedocs.io/en/latest/api.html#module-horovod.run
 
     .. versionchanged:: 0.4.2
@@ -349,18 +351,20 @@ def all_reduce(
 
 
 def all_gather(
-    tensor: Union[torch.Tensor, float, str], group: Optional[Union[Any, List[int]]] = None
-) -> Union[torch.Tensor, float, List[float], List[str]]:
+    tensor: Union[torch.Tensor, float, str, Any], group: Optional[Union[Any, List[int]]] = None
+) -> Union[torch.Tensor, float, List[float], List[str], List[Any]]:
     """Helper method to perform all gather operation.
 
     Args:
-        tensor: tensor or number or str to collect across participating processes.
+        tensor: tensor or number or str to collect across participating processes. If tensor, it should have the
+            same shape across processes.
         group: list of integer or the process group for each backend. If None, the default process group will be used.
 
     Returns:
-        torch.Tensor of shape ``(world_size * tensor.shape[0], tensor.shape[1], ...)`` if input is a tensor or
-        torch.Tensor of shape ``(world_size, )`` if input is a number or
-        List of strings if input is a string
+        If input is a tensor, returns a torch.Tensor of shape ``(world_size * tensor.shape[0], tensor.shape[1], ...)``.
+        If input is a number, a torch.Tensor of shape ``(world_size, )`` is returned and finally a list of strings
+        is returned if input is a string. If current process does not belong to `group`, the very ``tensor`` is
+        returned.
 
     .. versionchanged:: 0.4.11
         added ``group``
@@ -635,3 +639,44 @@ def wrapper(*args: Any, **kwargs: Any) -> Optional[Any]:
         return wrapper
 
     return _one_rank_only
+
+
+@contextmanager
+def one_rank_first(rank: int = 0, local: bool = False) -> Any:
+    """Context manager that ensures a specific rank runs first before others in a distributed
+    environment. The barrier releasing the other ranks is called even if the first rank's block raises.
+
+    Args:
+        rank: rank of the process that should execute the code
+            block inside the context manager first. Default, 0.
+        local: flag to specify local rank or global rank.
+            If True ``rank`` argument will define a local rank to run first.
+            Default, False
+
+    Raises:
+        ValueError: if ``rank`` is negative or not smaller than the number of processes.
+
+    Examples:
+        .. code-block:: python
+
+            def download_dataset():
+                ...
+
+            with idist.one_rank_first():
+                ds = download_dataset()
+
+            dp = ds[0]
+
+    .. versionadded:: 0.4.13
+    """
+    current_rank = get_local_rank() if local else get_rank()
+    size = get_nproc_per_node() if local else get_world_size()
+    if rank >= size or rank < 0:
+        raise ValueError(f"rank should be between 0 and {size - 1}, but given {rank}")
+    if current_rank != rank:
+        barrier()  # all other ranks wait here while the selected rank runs the block
+    try:
+        yield
+    finally:  # release the waiting ranks even if the block raised, otherwise they would deadlock
+        if current_rank == rank:
+            barrier()
diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py
index 299afadba9a2..60d6f7690b2e 100644
--- a/ignite/engine/__init__.py
+++ b/ignite/engine/__init__.py
@@ -44,20 +44,21 @@ def _prepare_batch(
 def supervised_training_step(
     model: torch.nn.Module,
     optimizer: torch.optim.Optimizer,
-    loss_fn: Union[Callable, torch.nn.Module],
+    loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module],
     device: Optional[Union[str, torch.device]] = None,
     non_blocking: bool = False,
     prepare_batch: Callable = _prepare_batch,
     model_transform: Callable[[Any], Any] = lambda output: output,
     output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(),
     gradient_accumulation_steps: int = 1,
+    model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Callable:
     """Factory function for supervised training.
 
     Args:
         model: the model to train.
         optimizer: the optimizer to use.
-        loss_fn: the loss function to use.
+        loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor.
         device: device type specification (default: None).
             Applies to batches after starting the engine. Model *will not* be moved.
             Device can be CPU, GPU.
@@ -71,6 +72,8 @@ def supervised_training_step(
             to be assigned to engine's state.output after each iteration. Default is returning `loss.item()`.
         gradient_accumulation_steps: Number of steps the gradients should be accumulated across.
             (default: 1 (means no gradient accumulation))
+        model_fn: the model function that receives `model` and `x`, and returns `y_pred`.
+
     Returns:
         Callable: update function.
 
@@ -91,6 +94,8 @@ def supervised_training_step(
         Added Gradient Accumulation.
     .. versionchanged:: 0.4.11
         Added `model_transform` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added `model_fn` to customize model's application on the sample
     """
 
     if gradient_accumulation_steps <= 0:
@@ -104,7 +109,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
             optimizer.zero_grad()
         model.train()
         x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
-        output = model(x)
+        output = model_fn(model, x)
         y_pred = model_transform(output)
         loss = loss_fn(y_pred, y)
         if gradient_accumulation_steps > 1:
@@ -120,7 +125,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
 def supervised_training_step_amp(
     model: torch.nn.Module,
     optimizer: torch.optim.Optimizer,
-    loss_fn: Union[Callable, torch.nn.Module],
+    loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module],
     device: Optional[Union[str, torch.device]] = None,
     non_blocking: bool = False,
     prepare_batch: Callable = _prepare_batch,
@@ -128,13 +133,14 @@ def supervised_training_step_amp(
     output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(),
     scaler: Optional["torch.cuda.amp.GradScaler"] = None,
     gradient_accumulation_steps: int = 1,
+    model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Callable:
     """Factory function for supervised training using ``torch.cuda.amp``.
 
     Args:
         model: the model to train.
         optimizer: the optimizer to use.
-        loss_fn: the loss function to use.
+        loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor.
         device: device type specification (default: None).
             Applies to batches after starting the engine. Model *will not* be moved.
             Device can be CPU, GPU.
@@ -149,6 +155,7 @@ def supervised_training_step_amp(
         scaler: GradScaler instance for gradient scaling. (default: None)
         gradient_accumulation_steps: Number of steps the gradients should be accumulated across.
             (default: 1 (means no gradient accumulation))
+        model_fn: the model function that receives `model` and `x`, and returns `y_pred`.
 
     Returns:
         Callable: update function
@@ -171,6 +178,8 @@ def supervised_training_step_amp(
         Added Gradient Accumulation.
     .. versionchanged:: 0.4.11
         Added `model_transform` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added `model_fn` to customize model's application on the sample
     """
 
     try:
@@ -190,7 +199,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
         model.train()
         x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
         with autocast(enabled=True):
-            output = model(x)
+            output = model_fn(model, x)
             y_pred = model_transform(output)
             loss = loss_fn(y_pred, y)
             if gradient_accumulation_steps > 1:
@@ -212,20 +221,21 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
 def supervised_training_step_apex(
     model: torch.nn.Module,
     optimizer: torch.optim.Optimizer,
-    loss_fn: Union[Callable, torch.nn.Module],
+    loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module],
     device: Optional[Union[str, torch.device]] = None,
     non_blocking: bool = False,
     prepare_batch: Callable = _prepare_batch,
     model_transform: Callable[[Any], Any] = lambda output: output,
     output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(),
     gradient_accumulation_steps: int = 1,
+    model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Callable:
     """Factory function for supervised training using apex.
 
     Args:
         model: the model to train.
         optimizer: the optimizer to use.
-        loss_fn: the loss function to use.
+        loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor.
         device: device type specification (default: None).
             Applies to batches after starting the engine. Model *will not* be moved.
             Device can be CPU, GPU.
@@ -239,6 +249,7 @@ def supervised_training_step_apex(
             to be assigned to engine's state.output after each iteration. Default is returning `loss.item()`.
         gradient_accumulation_steps: Number of steps the gradients should be accumulated across.
             (default: 1 (means no gradient accumulation))
+        model_fn: the model function that receives `model` and `x`, and returns `y_pred`.
 
     Returns:
         Callable: update function.
@@ -260,6 +271,8 @@ def supervised_training_step_apex(
         Added Gradient Accumulation.
     .. versionchanged:: 0.4.11
         Added `model_transform` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added `model_fn` to customize model's application on the sample
     """
 
     try:
@@ -278,7 +291,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
             optimizer.zero_grad()
         model.train()
         x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
-        output = model(x)
+        output = model_fn(model, x)
         y_pred = model_transform(output)
         loss = loss_fn(y_pred, y)
         if gradient_accumulation_steps > 1:
@@ -295,20 +308,21 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
 def supervised_training_step_tpu(
     model: torch.nn.Module,
     optimizer: torch.optim.Optimizer,
-    loss_fn: Union[Callable, torch.nn.Module],
+    loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module],
     device: Optional[Union[str, torch.device]] = None,
     non_blocking: bool = False,
     prepare_batch: Callable = _prepare_batch,
     model_transform: Callable[[Any], Any] = lambda output: output,
     output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(),
     gradient_accumulation_steps: int = 1,
+    model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Callable:
     """Factory function for supervised training using ``torch_xla``.
 
     Args:
         model: the model to train.
         optimizer: the optimizer to use.
-        loss_fn: the loss function to use.
+        loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor.
         device: device type specification (default: None).
             Applies to batches after starting the engine. Model *will not* be moved.
             Device can be CPU, TPU.
@@ -322,6 +336,7 @@ def supervised_training_step_tpu(
             to be assigned to engine's state.output after each iteration. Default is returning `loss.item()`.
         gradient_accumulation_steps: Number of steps the gradients should be accumulated across.
             (default: 1 (means no gradient accumulation))
+        model_fn: the model function that receives `model` and `x`, and returns `y_pred`.
 
     Returns:
         Callable: update function.
@@ -343,6 +358,8 @@ def supervised_training_step_tpu(
        Added Gradient Accumulation argument for all supervised training methods.
     .. versionchanged:: 0.4.11
         Added `model_transform` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added `model_fn` to customize model's application on the sample
     """
     try:
         import torch_xla.core.xla_model as xm
@@ -360,7 +377,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
             optimizer.zero_grad()
         model.train()
         x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
-        output = model(x)
+        output = model_fn(model, x)
         y_pred = model_transform(output)
         loss = loss_fn(y_pred, y)
         if gradient_accumulation_steps > 1:
@@ -404,7 +421,7 @@ def _check_arg(
 def create_supervised_trainer(
     model: torch.nn.Module,
     optimizer: torch.optim.Optimizer,
-    loss_fn: Union[Callable, torch.nn.Module],
+    loss_fn: Union[Callable[[Any, Any], torch.Tensor], torch.nn.Module],
     device: Optional[Union[str, torch.device]] = None,
     non_blocking: bool = False,
     prepare_batch: Callable = _prepare_batch,
@@ -414,13 +431,14 @@ def create_supervised_trainer(
     amp_mode: Optional[str] = None,
     scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
     gradient_accumulation_steps: int = 1,
+    model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Engine:
     """Factory function for creating a trainer for supervised models.
 
     Args:
         model: the model to train.
         optimizer: the optimizer to use.
-        loss_fn: the loss function to use.
+        loss_fn: the loss function that receives `y_pred` and `y`, and returns the loss as a tensor.
         device: device type specification (default: None).
             Applies to batches after starting the engine. Model *will not* be moved.
             Device can be CPU, GPU or TPU.
@@ -444,6 +462,7 @@ def create_supervised_trainer(
             (default: False)
         gradient_accumulation_steps: Number of steps the gradients should be accumulated across.
             (default: 1 (means no gradient accumulation))
+        model_fn: the model function that receives `model` and `x`, and returns `y_pred`.
 
     Returns:
         a trainer engine with supervised update function.
@@ -525,6 +544,8 @@ def output_transform_fn(x, y, y_pred, loss):
         Added Gradient Accumulation argument for all supervised training methods.
     .. versionchanged:: 0.4.11
         Added ``model_transform`` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added `model_fn` to customize model's application on the sample
     """
 
     device_type = device.type if isinstance(device, torch.device) else device
@@ -543,6 +564,7 @@ def output_transform_fn(x, y, y_pred, loss):
             output_transform,
             _scaler,
             gradient_accumulation_steps,
+            model_fn,
         )
     elif mode == "apex":
         _update = supervised_training_step_apex(
@@ -555,6 +577,7 @@ def output_transform_fn(x, y, y_pred, loss):
             model_transform,
             output_transform,
             gradient_accumulation_steps,
+            model_fn,
         )
     elif mode == "tpu":
         _update = supervised_training_step_tpu(
@@ -567,6 +590,7 @@ def output_transform_fn(x, y, y_pred, loss):
             model_transform,
             output_transform,
             gradient_accumulation_steps,
+            model_fn,
         )
     else:
         _update = supervised_training_step(
@@ -579,6 +603,7 @@ def output_transform_fn(x, y, y_pred, loss):
             model_transform,
             output_transform,
             gradient_accumulation_steps,
+            model_fn,
         )
 
     trainer = Engine(_update) if not deterministic else DeterministicEngine(_update)
@@ -595,6 +620,7 @@ def supervised_evaluation_step(
     prepare_batch: Callable = _prepare_batch,
     model_transform: Callable[[Any], Any] = lambda output: output,
     output_transform: Callable[[Any, Any, Any], Any] = lambda x, y, y_pred: (y_pred, y),
+    model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Callable:
     """
     Factory function for supervised evaluation.
@@ -612,6 +638,7 @@ def supervised_evaluation_step(
         output_transform: function that receives 'x', 'y', 'y_pred' and returns value
             to be assigned to engine's state.output after each iteration. Default is returning `(y_pred, y,)` which fits
             output expected by metrics. If you change it you should use `output_transform` in metrics.
+        model_fn: the model function that receives `model` and `x`, and returns `y_pred`.
 
     Returns:
         Inference function.
@@ -629,13 +656,15 @@ def supervised_evaluation_step(
     .. versionadded:: 0.4.5
     .. versionchanged:: 0.4.12
         Added ``model_transform`` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added `model_fn` to customize model's application on the sample
     """
 
     def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]:
         model.eval()
         with torch.no_grad():
             x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
-            output = model(x)
+            output = model_fn(model, x)
             y_pred = model_transform(output)
             return output_transform(x, y, y_pred)
 
@@ -649,6 +678,7 @@ def supervised_evaluation_step_amp(
     prepare_batch: Callable = _prepare_batch,
     model_transform: Callable[[Any], Any] = lambda output: output,
     output_transform: Callable[[Any, Any, Any], Any] = lambda x, y, y_pred: (y_pred, y),
+    model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Callable:
     """
     Factory function for supervised evaluation using ``torch.cuda.amp``.
@@ -666,6 +696,7 @@ def supervised_evaluation_step_amp(
         output_transform: function that receives 'x', 'y', 'y_pred' and returns value
             to be assigned to engine's state.output after each iteration. Default is returning `(y_pred, y,)` which fits
             output expected by metrics. If you change it you should use `output_transform` in metrics.
+        model_fn: the model function that receives `model` and `x`, and returns `y_pred`.
 
     Returns:
         Inference function.
@@ -683,6 +714,8 @@ def supervised_evaluation_step_amp(
     .. versionadded:: 0.4.5
     .. versionchanged:: 0.4.12
         Added ``model_transform`` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added `model_fn` to customize model's application on the sample
     """
     try:
         from torch.cuda.amp import autocast
@@ -694,7 +727,7 @@ def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, T
         with torch.no_grad():
             x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
             with autocast(enabled=True):
-                output = model(x)
+                output = model_fn(model, x)
                 y_pred = model_transform(output)
             return output_transform(x, y, y_pred)
 
@@ -710,6 +743,7 @@ def create_supervised_evaluator(
     model_transform: Callable[[Any], Any] = lambda output: output,
     output_transform: Callable[[Any, Any, Any], Any] = lambda x, y, y_pred: (y_pred, y),
     amp_mode: Optional[str] = None,
+    model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Engine:
     """
     Factory function for creating an evaluator for supervised models.
@@ -730,6 +764,7 @@ def create_supervised_evaluator(
             output expected by metrics. If you change it you should use `output_transform` in metrics.
         amp_mode: can be ``amp``, model will be casted to float16 using
             `torch.cuda.amp <https://pytorch.org/docs/stable/amp.html>`_
+        model_fn: the model function that receives `model` and `x`, and returns `y_pred`.
 
     Returns:
         an evaluator engine with supervised inference function.
@@ -754,6 +789,8 @@ def create_supervised_evaluator(
         Added ``amp_mode`` argument for automatic mixed precision.
     .. versionchanged:: 0.4.12
         Added ``model_transform`` to transform model's output
+    .. versionchanged:: 0.4.13
+        Added `model_fn` to customize model's application on the sample
     """
     device_type = device.type if isinstance(device, torch.device) else device
     on_tpu = "xla" in device_type if device_type is not None else False
@@ -768,6 +805,7 @@ def create_supervised_evaluator(
             prepare_batch=prepare_batch,
             model_transform=model_transform,
             output_transform=output_transform,
+            model_fn=model_fn,
         )
     else:
         evaluate_step = supervised_evaluation_step(
@@ -777,6 +815,7 @@ def create_supervised_evaluator(
             prepare_batch=prepare_batch,
             model_transform=model_transform,
             output_transform=output_transform,
+            model_fn=model_fn,
         )
 
     evaluator = Engine(evaluate_step)
diff --git a/ignite/engine/engine.py b/ignite/engine/engine.py
index 5e7e8c798217..5e4881e6d921 100644
--- a/ignite/engine/engine.py
+++ b/ignite/engine/engine.py
@@ -1,6 +1,5 @@
 import functools
 import logging
-import math
 import time
 import warnings
 import weakref
@@ -40,8 +39,14 @@ class Engine(Serializable):
 
         .. code-block:: python
 
-            def update_model(engine, batch):
-                inputs, targets = batch
+            model = ...
+            model = model.cuda()
+            optimizer = ...
+            criterion = ...
+
+            def train_step(engine, batch):
+                model.train()
+                inputs, targets = batch[0].cuda(), batch[1].cuda()
                 optimizer.zero_grad()
                 outputs = model(inputs)
                 loss = criterion(outputs, targets)
@@ -643,7 +648,7 @@ def state_dict_user_keys(self) -> List:
         return self._state_dict_user_keys
 
     def state_dict(self) -> OrderedDict:
-        """Returns a dictionary containing engine's state: "epoch_length", "max_epochs" and "iteration" and
+        """Returns a dictionary containing engine's state: "seed", "epoch_length", "max_epochs" and "iteration" and
         other state values defined by `engine.state_dict_user_keys`
 
         .. code-block:: python
@@ -676,11 +681,11 @@ def save_engine(_):
     def load_state_dict(self, state_dict: Mapping) -> None:
         """Setups engine from `state_dict`.
 
-        State dictionary should contain keys: `iteration` or `epoch`, `max_epochs` and `epoch_length`.
-        If `engine.state_dict_user_keys` contains keys, they should be also present in the state dictionary.
+        State dictionary should contain keys: `iteration` or `epoch`, together with `max_epochs`, `epoch_length` and
+        `seed`. If `engine.state_dict_user_keys` contains keys, they should be also present in the state dictionary.
         Iteration and epoch values are 0-based: the first iteration or epoch is zero.
 
-        This method does not remove any custom attributes added by user.
+        This method does not remove any custom attributes added by user.
 
         Args:
             state_dict: a dict with parameters
@@ -725,14 +730,13 @@ def load_state_dict(self, state_dict: Mapping) -> None:
 
     @staticmethod
     def _is_done(state: State) -> bool:
-        is_done_iters = state.max_iters is not None and state.iteration >= state.max_iters
         is_done_count = (
             state.epoch_length is not None
             and state.max_epochs is not None
             and state.iteration >= state.epoch_length * state.max_epochs
         )
         is_done_epochs = state.max_epochs is not None and state.epoch >= state.max_epochs
-        return is_done_iters or is_done_count or is_done_epochs
+        return is_done_count or is_done_epochs
 
     def set_data(self, data: Union[Iterable, DataLoader]) -> None:
         """Method to set data. After calling the method the next batch passed to `processing_function` is
@@ -774,14 +778,14 @@ def run(
         self,
         data: Optional[Iterable] = None,
         max_epochs: Optional[int] = None,
-        max_iters: Optional[int] = None,
         epoch_length: Optional[int] = None,
+        seed: Optional[int] = None,
     ) -> State:
         """Runs the ``process_function`` over the passed data.
 
         Engine has a state and the following logic is applied in this function:
 
-        - At the first call, new state is defined by `max_epochs`, `max_iters`, `epoch_length`, if provided.
+        - At the first call, new state is defined by `max_epochs`, `epoch_length`, `seed`, if provided.
           A timer for total and per-epoch time is initialized when Events.STARTED is handled.
         - If state is already defined such that there are iterations to run until `max_epochs` and no input arguments
           provided, state is kept and used in the function.
@@ -799,8 +803,7 @@ def run(
                 `len(data)`. If `data` is an iterator and `epoch_length` is not set, then it will be automatically
                 determined as the iteration on which data iterator raises `StopIteration`.
                 This argument should not change if run is resuming from a state.
-            max_iters: Number of iterations to run for.
-                `max_iters` and `max_epochs` are mutually exclusive; only one of the two arguments should be provided.
+            seed: Deprecated argument. Please, use `torch.manual_seed` or :meth:`~ignite.utils.manual_seed`.
 
         Returns:
             State: output state.
@@ -829,6 +832,12 @@ def switch_batch(engine):
                 trainer.run(train_loader, max_epochs=2)
 
         """
+        if seed is not None:
+            warnings.warn(
+                "Argument seed is deprecated. It will be removed in 0.4.14. "
+                "Please, use torch.manual_seed or ignite.utils.manual_seed"
+            )
+
         if data is not None and not isinstance(data, Iterable):
             raise TypeError("Argument data should be iterable")
 
@@ -852,6 +861,8 @@ def switch_batch(engine):
 
         if self.state.max_epochs is None or (self._is_done(self.state) and self._internal_run_generator is None):
             # Create new state
+            if max_epochs is None:
+                max_epochs = 1
             if epoch_length is None:
                 if data is None:
                     raise ValueError("epoch_length should be provided if data is None")
@@ -860,22 +871,9 @@ def switch_batch(engine):
                 if epoch_length is not None and epoch_length < 1:
                     raise ValueError("Input data has zero size. Please provide non-empty data")
 
-            if max_iters is None:
-                if max_epochs is None:
-                    max_epochs = 1
-            else:
-                if max_epochs is not None:
-                    raise ValueError(
-                        "Arguments max_iters and max_epochs are mutually exclusive."
-                        "Please provide only max_epochs or max_iters."
-                    )
-                if epoch_length is not None:
-                    max_epochs = math.ceil(max_iters / epoch_length)
-
             self.state.iteration = 0
             self.state.epoch = 0
             self.state.max_epochs = max_epochs
-            self.state.max_iters = max_iters
             self.state.epoch_length = epoch_length
             # Reset generator if previously used
             self._internal_run_generator = None
@@ -1048,19 +1046,12 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]:
                     if self.state.epoch_length is None:
                         # Define epoch length and stop the epoch
                         self.state.epoch_length = iter_counter
-                        if self.state.max_iters is not None:
-                            self.state.max_epochs = math.ceil(self.state.max_iters / self.state.epoch_length)
                         break
 
                     # Should exit while loop if we can not iterate
                     if should_exit:
-                        if not self._is_done(self.state):
-                            total_iters = (
-                                self.state.epoch_length * self.state.max_epochs
-                                if self.state.max_epochs is not None
-                                else self.state.max_iters
-                            )
-
+                        if not self._is_done(self.state) and self.state.max_epochs is not None:
+                            total_iters = self.state.epoch_length * self.state.max_epochs
                             warnings.warn(
                                 "Data iterator can not provide data anymore but required total number of "
                                 "iterations to run is not reached. "
@@ -1087,10 +1078,6 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]:
                 if self.state.epoch_length is not None and iter_counter == self.state.epoch_length:
                     break
 
-                if self.state.max_iters is not None and self.state.iteration == self.state.max_iters:
-                    self.should_terminate = True
-                    raise _EngineTerminateException()
-
         except _EngineTerminateSingleEpochException:
             self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter)
             self.should_terminate_single_epoch = False
@@ -1206,19 +1193,12 @@ def _run_once_on_dataset_legacy(self) -> float:
                     if self.state.epoch_length is None:
                         # Define epoch length and stop the epoch
                         self.state.epoch_length = iter_counter
-                        if self.state.max_iters is not None:
-                            self.state.max_epochs = math.ceil(self.state.max_iters / self.state.epoch_length)
                         break
 
                     # Should exit while loop if we can not iterate
                     if should_exit:
-                        if not self._is_done(self.state):
-                            total_iters = (
-                                self.state.epoch_length * self.state.max_epochs
-                                if self.state.max_epochs is not None
-                                else self.state.max_iters
-                            )
-
+                        if not self._is_done(self.state) and self.state.max_epochs is not None:
+                            total_iters = self.state.epoch_length * self.state.max_epochs
                             warnings.warn(
                                 "Data iterator can not provide data anymore but required total number of "
                                 "iterations to run is not reached. "
@@ -1245,10 +1225,6 @@ def _run_once_on_dataset_legacy(self) -> float:
                 if self.state.epoch_length is not None and iter_counter == self.state.epoch_length:
                     break
 
-                if self.state.max_iters is not None and self.state.iteration == self.state.max_iters:
-                    self.should_terminate = True
-                    raise _EngineTerminateException()
-
         except _EngineTerminateSingleEpochException:
             self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter)
             self.should_terminate_single_epoch = False
diff --git a/ignite/engine/events.py b/ignite/engine/events.py
index 9dd99348492b..c41d79468f34 100644
--- a/ignite/engine/events.py
+++ b/ignite/engine/events.py
@@ -203,6 +203,17 @@ def __or__(self, other: Any) -> "EventsList":
         return EventsList() | self | other
 
 
+class CallableEvents(CallableEventWithFilter):
+    # Deprecated backward-compatibility subclass: forwards all arguments to the parent, then warns.
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super(CallableEvents, self).__init__(*args, **kwargs)
+        warnings.warn(
+            "Class ignite.engine.events.CallableEvents is deprecated. It will be removed in 0.4.14. "
+            "Please, use ignite.engine.EventEnum instead",
+            DeprecationWarning,
+        )
+
+
 class EventEnum(CallableEventWithFilter, Enum):
     """Base class for all :class:`~ignite.engine.events.Events`. User defined custom events should also inherit
     this class.
@@ -443,7 +454,6 @@ class State:
         state.dataloader        # data passed to engine
         state.epoch_length      # optional length of an epoch
         state.max_epochs        # number of epochs to run
-        state.max_iters         # number of iterations to run
         state.batch             # batch passed to `process_function`
         state.output            # output of `process_function` after a single iteration
         state.metrics           # dictionary with defined metrics if any
@@ -470,7 +480,6 @@ def __init__(self, **kwargs: Any) -> None:
         self.epoch = 0
         self.epoch_length: Optional[int] = None
         self.max_epochs: Optional[int] = None
-        self.max_iters: Optional[int] = None
         self.output: Optional[int] = None
         self.batch: Optional[int] = None
         self.metrics: Dict[str, Any] = {}
diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py
index f508f0170220..901810516c51 100644
--- a/ignite/handlers/checkpoint.py
+++ b/ignite/handlers/checkpoint.py
@@ -7,7 +7,7 @@
 from abc import ABCMeta, abstractmethod
 from collections import OrderedDict
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Mapping, NamedTuple, Optional, Tuple, Union
+from typing import Any, Callable, cast, Dict, List, Mapping, NamedTuple, Optional, Union
 
 import torch
 import torch.nn as nn
@@ -23,6 +23,7 @@
 import ignite.distributed as idist
 from ignite.base import Serializable
 from ignite.engine import Engine, Events
+from ignite.utils import _tree_apply2, _tree_map
 
 __all__ = ["Checkpoint", "DiskSaver", "ModelCheckpoint", "BaseSaveHandler"]
 
@@ -82,7 +83,7 @@ class Checkpoint(Serializable):
             ``load_state_dict`` methods. If contains objects of type torch `DistributedDataParallel`_ or
             `DataParallel`_, their internal wrapped model is automatically saved (to avoid additional key ``module.`` in
             the state dictionary).
-        save_handler: String, method or callable class
+        save_handler: String, function or callable object.
             used to save engine and other provided objects. Function receives two objects: checkpoint as a dictionary
             and filename. If ``save_handler`` is callable class, it can
             inherit of :class:`~ignite.handlers.checkpoint.BaseSaveHandler` and optionally implement ``remove`` method
@@ -102,6 +103,7 @@ class Checkpoint(Serializable):
             Input of the function is ``(engine, event_name)``. Output of function should be an integer.
             Default is None, global_step based on attached engine. If provided, uses function output as global_step.
             To setup global step from another engine, please use :meth:`~ignite.handlers.global_step_from_engine`.
+        archived: Deprecated argument as models saved by ``torch.save`` are already compressed.
         filename_pattern: If ``filename_pattern`` is provided, this pattern will be used to render
             checkpoint filenames. If the pattern is not defined, the default pattern would be used. See Note for
             details.
@@ -277,7 +279,7 @@ class Checkpoint(Serializable):
     """
 
     Item = NamedTuple("Item", [("priority", int), ("filename", str)])
-    _state_dict_all_req_keys = ("saved",)
+    _state_dict_all_req_keys = ("_saved",)
 
     def __init__(
         self,
@@ -288,12 +290,12 @@ def __init__(
         score_name: Optional[str] = None,
         n_saved: Union[int, None] = 1,
         global_step_transform: Optional[Callable] = None,
+        archived: bool = False,
         filename_pattern: Optional[str] = None,
         include_self: bool = False,
         greater_or_equal: bool = False,
         save_on_rank: int = 0,
     ):
-
         if not isinstance(to_save, collections.Mapping):
             raise TypeError(f"Argument `to_save` should be a dictionary, but given {type(to_save)}")
 
@@ -320,6 +322,8 @@ def __init__(
 
         if global_step_transform is not None and not callable(global_step_transform):
             raise TypeError(f"global_step_transform should be a function, got {type(global_step_transform)} instead.")
+        if archived:
+            warnings.warn("Argument archived is deprecated and will be removed in 0.4.14")
 
         self.to_save = to_save
         self.filename_prefix = filename_prefix
@@ -401,7 +405,6 @@ def _compare_fn(self, new: Union[int, float]) -> bool:
             return new > self._saved[0].priority
 
     def __call__(self, engine: Engine) -> None:
-
         global_step = None
         if self.global_step_transform is not None:
             global_step = self.global_step_transform(engine, engine.last_event_name)
@@ -416,7 +419,6 @@ def __call__(self, engine: Engine) -> None:
             priority = global_step
 
         if self._check_lt_n_saved() or self._compare_fn(priority):
-
             priority_str = f"{priority}" if isinstance(priority, numbers.Integral) else f"{priority:.4f}"
 
             checkpoint = self._setup_checkpoint()
@@ -469,18 +471,20 @@ def __call__(self, engine: Engine) -> None:
             except TypeError:
                 self.save_handler(checkpoint, filename)
 
-    def _setup_checkpoint(self) -> Dict[str, Dict[Any, Any]]:
-        checkpoint = {}
+    def _setup_checkpoint(self) -> Dict[str, Any]:
         if self.to_save is not None:
-            for k, obj in self.to_save.items():
+
+            def func(obj: Any, **kwargs: Any) -> Dict:
                 if isinstance(obj, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
                     obj = obj.module
                 elif HAVE_ZERO and isinstance(obj, ZeroRedundancyOptimizer):
                     obj.consolidate_state_dict(to=self.save_on_rank)
                     if self.save_on_rank != idist.get_rank():
-                        continue
-                checkpoint[k] = obj.state_dict()
-        return checkpoint
+                        return {}
+                return obj.state_dict()
+
+            return cast(Dict[str, Any], _tree_map(func, self.to_save))
+        return {}
 
     @staticmethod
     def setup_filename_pattern(
@@ -535,10 +539,12 @@ def setup_filename_pattern(
 
     @staticmethod
     def _check_objects(objs: Mapping, attr: str) -> None:
-        for k, obj in objs.items():
+        def func(obj: Any, **kwargs: Any) -> None:
             if not hasattr(obj, attr):
                 raise TypeError(f"Object {type(obj)} should have `{attr}` method")
 
+        _tree_map(func, objs)
+
     @staticmethod
     def load_objects(to_load: Mapping, checkpoint: Union[str, Mapping, Path], **kwargs: Any) -> None:
         """Helper method to apply ``load_state_dict`` on the objects from ``to_load`` using states from ``checkpoint``.
@@ -594,26 +600,22 @@ def load_objects(to_load: Mapping, checkpoint: Union[str, Mapping, Path], **kwar
             torch.nn.parallel.DistributedDataParallel.html
         .. _DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html
         """
+        if not isinstance(checkpoint, (collections.Mapping, str, Path)):
+            raise TypeError(f"Argument checkpoint should be a string or a dictionary, but given {type(checkpoint)}")
+
+        Checkpoint._check_objects(to_load, "load_state_dict")
 
         if isinstance(checkpoint, (str, Path)):
             checkpoint_obj = torch.load(checkpoint)
         else:
             checkpoint_obj = checkpoint
 
-        Checkpoint._check_objects(to_load, "load_state_dict")
-        if not isinstance(checkpoint, (collections.Mapping, str, Path)):
-            raise TypeError(f"Argument checkpoint should be a string or a dictionary, but given {type(checkpoint)}")
-
-        if len(kwargs) > 1 or any(k for k in kwargs if k not in ["strict"]):
-            warnings.warn("kwargs contains keys other than strict and these will be ignored")
-
-        is_state_dict_strict = kwargs.get("strict", True)
-
         def _load_object(obj: Any, chkpt_obj: Any) -> None:
             if isinstance(obj, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
                 obj = obj.module
+
             if isinstance(obj, torch.nn.Module):
-                obj.load_state_dict(chkpt_obj, strict=is_state_dict_strict)
+                obj.load_state_dict(chkpt_obj, **kwargs)
             else:
                 obj.load_state_dict(chkpt_obj)
 
@@ -624,11 +626,7 @@ def _load_object(obj: Any, chkpt_obj: Any) -> None:
                 _load_object(obj, checkpoint_obj)
                 return
 
-        # multiple objects to load
-        for k, obj in to_load.items():
-            if k not in checkpoint_obj:
-                raise ValueError(f"Object labeled by '{k}' from `to_load` is not found in the checkpoint")
-            _load_object(obj, checkpoint_obj[k])
+        _tree_apply2(_load_object, to_load, checkpoint_obj)
 
     def reload_objects(self, to_load: Mapping, load_kwargs: Optional[Dict] = None, **filename_components: Any) -> None:
         """Helper method to apply ``load_state_dict`` on the objects from ``to_load``. Filename components such as
@@ -672,10 +670,18 @@ def reload_objects(self, to_load: Mapping, load_kwargs: Optional[Dict] = None, *
             If ``to_load`` contains objects of type torch `DistributedDataParallel`_ or
             `DataParallel`_, method ``load_state_dict`` will applied to their internal wrapped model (``obj.module``).
 
+        Note:
+            This method works only when the ``save_handler`` is of types string,
+            :class:`~pathlib.Path` or :class:`~ignite.handlers.checkpoint.DiskSaver`.
+
         .. _DistributedDataParallel: https://pytorch.org/docs/stable/generated/
             torch.nn.parallel.DistributedDataParallel.html
         .. _DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html
         """
+        if not isinstance(self.save_handler, DiskSaver):
+            raise AttributeError(
+                f"Checkpoint's `save_handler` should be of type `DiskSaver`, given {type(self.save_handler)}"
+            )
 
         global_step = filename_components.get("global_step", None)
 
@@ -706,20 +712,21 @@ def reload_objects(self, to_load: Mapping, load_kwargs: Optional[Dict] = None, *
 
         Checkpoint.load_objects(to_load=to_load, checkpoint=path, **load_kwargs)
 
-    def state_dict(self) -> "OrderedDict[str, List[Tuple[int, str]]]":
+    def state_dict(self) -> OrderedDict:
         """Method returns state dict with saved items: list of ``(priority, filename)`` pairs.
         Can be used to save internal state of the class.
         """
-        return OrderedDict([("saved", [(p, f) for p, f in self._saved])])
+        # TODO: this method should use _state_dict_all_req_keys
+        return OrderedDict([("_saved", [(p, f) for p, f in self._saved])])
 
     def load_state_dict(self, state_dict: Mapping) -> None:
-        """Method replace internal state of the class with provided state dict data.
+        """Method replaces internal state of the class with provided state dict data.
 
         Args:
-            state_dict: a dict with "saved" key and list of ``(priority, filename)`` pairs as values.
+            state_dict: a dict with "_saved" key and list of ``(priority, filename)`` pairs as values.
         """
         super().load_state_dict(state_dict)
-        self._saved = [Checkpoint.Item(p, f) for p, f in state_dict["saved"]]
+        self._saved = [Checkpoint.Item(p, f) for p, f in state_dict["_saved"]]
 
     @staticmethod
     def get_default_score_fn(metric_name: str, score_sign: float = 1.0) -> Callable:
@@ -878,6 +885,11 @@ class ModelCheckpoint(Checkpoint):
 
         Behaviour of this class has been changed since v0.3.0.
 
+        Argument ``save_as_state_dict`` is deprecated and should not be used. It is considered as True.
+
+        Argument ``save_interval`` is deprecated and should not be used. Please, use events filtering instead, e.g.
+        ``Events.ITERATION_STARTED(every=1000)``.
+
         There is no more internal counter that has been used to indicate the number of save actions. User could
         see its value `step_number` in the filename, e.g. `{filename_prefix}_{name}_{step_number}.pt`. Actually,
         `step_number` is replaced by current engine's epoch if `score_function` is specified and current iteration
@@ -906,6 +918,7 @@ class ModelCheckpoint(Checkpoint):
             Input of the function is `(engine, event_name)`. Output of function should be an integer.
             Default is None, global_step based on attached engine. If provided, uses function output as global_step.
             To setup global step from another engine, please use :meth:`~ignite.handlers.global_step_from_engine`.
+        archived: Deprecated argument as models saved by `torch.save` are already compressed.
         filename_pattern: If ``filename_pattern`` is provided, this pattern will be used to render
             checkpoint filenames. If the pattern is not defined, the default pattern would be used.
             See :class:`~ignite.handlers.checkpoint.Checkpoint` for details.
@@ -952,19 +965,38 @@ def __init__(
         self,
         dirname: Union[str, Path],
         filename_prefix: str = "",
+        save_interval: Optional[int] = None,
         score_function: Optional[Callable] = None,
         score_name: Optional[str] = None,
         n_saved: Union[int, None] = 1,
         atomic: bool = True,
         require_empty: bool = True,
         create_dir: bool = True,
+        save_as_state_dict: bool = True,
         global_step_transform: Optional[Callable] = None,
+        archived: bool = False,
         filename_pattern: Optional[str] = None,
         include_self: bool = False,
         greater_or_equal: bool = False,
         save_on_rank: int = 0,
         **kwargs: Any,
     ):
+        if not save_as_state_dict:
+            raise ValueError(
+                "Argument save_as_state_dict is deprecated and should be True. "
+                "This argument will be removed in 0.4.14."
+            )
+        if save_interval is not None:
+            msg = (
+                "Argument save_interval is deprecated and should be None. This argument will be removed in 0.4.14. "
+                "Please, use events filtering instead, e.g. Events.ITERATION_STARTED(every=1000)"
+            )
+            if save_interval == 1:
+                # Tolerate `save_interval=1` coming from older code: only warn, do not fail.
+                warnings.warn(msg)
+            else:
+                # Any other value of the deprecated argument is rejected outright.
+                raise ValueError(msg)
 
         disk_saver = DiskSaver(
             dirname,
@@ -984,6 +1016,7 @@ def __init__(
             n_saved=n_saved,
             global_step_transform=global_step_transform,
             filename_pattern=filename_pattern,
+            archived=archived,
             include_self=include_self,
             greater_or_equal=greater_or_equal,
             save_on_rank=save_on_rank,
@@ -1000,7 +1033,6 @@ def last_checkpoint(self) -> Optional[Union[str, Path]]:
         return self.save_handler.dirname / self._saved[-1].filename
 
     def __call__(self, engine: Engine, to_save: Mapping):  # type: ignore
-
         if len(to_save) == 0:
             raise RuntimeError("No objects to checkpoint found.")
 
diff --git a/ignite/handlers/early_stopping.py b/ignite/handlers/early_stopping.py
index 3eaed6791c72..d308f8499e00 100644
--- a/ignite/handlers/early_stopping.py
+++ b/ignite/handlers/early_stopping.py
@@ -50,7 +50,6 @@ def __init__(
         min_delta: float = 0.0,
         cumulative_delta: bool = False,
     ):
-
         if not callable(score_function):
             raise TypeError("Argument score_function should be a function.")
 
diff --git a/ignite/handlers/lr_finder.py b/ignite/handlers/lr_finder.py
index 69c176e93da3..1aad00938337 100644
--- a/ignite/handlers/lr_finder.py
+++ b/ignite/handlers/lr_finder.py
@@ -94,7 +94,6 @@ def _run(
         smooth_f: float,
         diverge_th: float,
     ) -> None:
-
         self._history = {"lr": [], "loss": []}
         self._best_loss = None
         self._diverge_flag = False
@@ -106,7 +105,6 @@ def _run(
             max_iter = trainer.state.epoch_length * trainer.state.max_epochs  # type: ignore[operator]
             if max_iter < num_iter:
                 max_iter = num_iter
-                trainer.state.max_iters = num_iter
                 trainer.state.max_epochs = ceil(num_iter / trainer.state.epoch_length)  # type: ignore[operator]
 
         if not trainer.has_event_handler(self._reached_num_iterations):
diff --git a/ignite/handlers/param_scheduler.py b/ignite/handlers/param_scheduler.py
index 9ece11f106d5..c554b04bce70 100644
--- a/ignite/handlers/param_scheduler.py
+++ b/ignite/handlers/param_scheduler.py
@@ -10,7 +10,7 @@
 from typing import Any, cast, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
 
 import torch
-from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau
 from torch.optim.optimizer import Optimizer
 
 # https://github.com/pytorch/ignite/issues/2773
@@ -193,8 +193,7 @@ def __init__(
         self._state_attrs += ["param_group_index"]
 
     def __call__(self, engine: Optional[Engine], name: Optional[str] = None) -> None:
-
-        value = self.get_param()
+        value = self._get_param()
 
         if isinstance(value, list):
             if len(value) != len(self.optimizer_param_groups):
@@ -262,6 +261,11 @@ def simulate_values(cls, num_events: int, **scheduler_kwargs: Any) -> List[List[
             values.append([i, scheduler.optimizer_param_groups[0][scheduler.param_name]])
         return values
 
+    def _get_param(self) -> Union[List[float], float]:
+        # Hook used by `__call__` to obtain the value; the base class simply delegates to `get_param()`.
+        # Intermediate subclasses may override it to post-process or replace that value.
+        return self.get_param()
+
 
 class CyclicalScheduler(ParamScheduler):
     """An abstract class for updating an optimizer's parameter value over a
@@ -280,6 +284,9 @@ class CyclicalScheduler(ParamScheduler):
             end of each cycle (default=1.0).
         end_value_mult: ratio by which to change the end value at the
             end of each cycle (default=1.0).
+        warmup_duration: duration of warm-up to be applied before each cycle.
+            Through this warm-up, the parameter starts from the last cycle's end value
+            and linearly goes to next cycle's start value. Default is no cyclic warm-up.
         save_history: whether to log the parameter values to
             `engine.state.param_history`, (default=False).
         param_group_index: optimizer's parameters group to use.
@@ -289,6 +296,9 @@ class CyclicalScheduler(ParamScheduler):
         usually be the number of batches in an epoch.
 
     .. versionadded:: 0.4.5
+
+    .. versionchanged:: 0.4.13
+        Added cyclic warm-up to the scheduler using ``warmup_duration``.
     """
 
     def __init__(
@@ -301,6 +311,7 @@ def __init__(
         cycle_mult: float = 1.0,
         start_value_mult: float = 1.0,
         end_value_mult: float = 1.0,
+        warmup_duration: int = 0,
         save_history: bool = False,
         param_group_index: Optional[int] = None,
     ):
@@ -309,11 +320,13 @@ def __init__(
         )
         self.start_value = start_value
         self.end_value = end_value
-        self.cycle_size = int(cycle_size)  # Ensure cycle_size is integer
+        self.cycle_size = cycle_size
         self.cycle_mult = cycle_mult
         self.cycle = 0
         self.start_value_mult = start_value_mult
         self.end_value_mult = end_value_mult
+        self.warmup_duration = warmup_duration
+        self.total_cycle_size = self.warmup_duration + self.cycle_size
 
         if self.cycle_size < 2:
             raise ValueError(f"Argument cycle_size should be positive and larger than 1, but given {cycle_size}")
@@ -326,18 +339,33 @@ def __init__(
             "cycle",
             "start_value_mult",
             "end_value_mult",
+            "warmup_duration",
+            "total_cycle_size",
         ]
 
     def __call__(self, engine: Optional[Engine], name: Optional[str] = None) -> None:
-        if self.event_index != 0 and self.event_index % self.cycle_size == 0:
+        if self.event_index != 0 and self.event_index == self.cycle_size:
+            self.start_value *= self.start_value_mult
+        if self.event_index != 0 and self.event_index == self.total_cycle_size:
             self.event_index = 0
             self.cycle_size = int(self.cycle_size * self.cycle_mult)
+            self.warmup_duration = int(self.warmup_duration * self.cycle_mult)
+            self.total_cycle_size = self.warmup_duration + self.cycle_size
             self.cycle += 1
-            self.start_value *= self.start_value_mult
             self.end_value *= self.end_value_mult
 
         return super(CyclicalScheduler, self).__call__(engine, name)
 
+    def _get_param(self) -> Union[List[float], float]:
+        """Return the warm-up value while in the warm-up phase of a cycle (linear from
+        ``end_value`` to ``start_value``), otherwise whatever ``self.get_param()`` returns."""
+        # Guard on warmup_duration: without a warm-up phase there is nothing to divide by.
+        if self.warmup_duration > 0 and self.event_index > self.cycle_size:
+            warmup_progress = (self.event_index - self.cycle_size) / self.warmup_duration
+            return self.end_value + (self.start_value - self.end_value) * warmup_progress
+
+        return self.get_param()
+
 
 class LinearCyclicalScheduler(CyclicalScheduler):
     """Linearly adjusts param value to 'end_value' for a half-cycle, then linearly
@@ -356,6 +384,9 @@ class LinearCyclicalScheduler(CyclicalScheduler):
             end of each cycle (default=1.0).
         end_value_mult: ratio by which to change the end value at the
             end of each cycle (default=1.0).
+        warmup_duration: duration of warm-up to be applied before each cycle.
+            Through this warm-up, the parameter starts from the last cycle's end value
+            and linearly goes to next cycle's start value. Default is no cyclic warm-up.
         save_history: whether to log the parameter values to
             `engine.state.param_history`, (default=False).
         param_group_index: optimizer's parameters group to use.
@@ -431,9 +462,13 @@ def print_lr():
             ...
 
     .. versionadded:: 0.4.5
+
+    .. versionchanged:: 0.4.13
+        Added cyclic warm-up to the scheduler using ``warmup_duration``.
     """
 
     def get_param(self) -> float:
+        """Method to get current optimizer's parameter value"""
         cycle_progress = self.event_index / self.cycle_size
         return self.end_value + (self.start_value - self.end_value) * abs(cycle_progress - 0.5) * 2
 
@@ -457,6 +492,9 @@ class CosineAnnealingScheduler(CyclicalScheduler):
             end of each cycle (default=1.0).
         end_value_mult: ratio by which to change the end value at the
             end of each cycle (default=1.0).
+        warmup_duration: duration of warm-up to be applied before each cycle.
+            Through this warm-up, the parameter starts from the last cycle's end value
+            and linearly goes to next cycle's start value. Default is no cyclic warm-up.
         save_history: whether to log the parameter values to
             `engine.state.param_history`, (default=False).
         param_group_index: optimizer's parameters group to use.
@@ -535,6 +573,9 @@ def print_lr():
                  Applications of Computer Vision (WACV), 2017 IEEE Winter Conference on. IEEE, 2017
 
     .. versionadded:: 0.4.5
+
+    .. versionchanged:: 0.4.13
+        Added cyclic warm-up to the scheduler using ``warmup_duration``.
     """
 
     def get_param(self) -> float:
@@ -595,7 +636,6 @@ def print_lr():
     """
 
     def __init__(self, schedulers: List[ParamScheduler], durations: List[int], save_history: bool = False):
-
         if not isinstance(schedulers, Sequence):
             raise TypeError(f"Argument schedulers should be a sequence, but given {schedulers}")
 
@@ -794,6 +834,57 @@ def simulate_values(  # type: ignore[override]
             return output
 
 
+class _CosineAnnealingWarmRestarts:  # wraps a torch CosineAnnealingWarmRestarts; all state lives on the wrapped object
+    def __init__(self, lr_scheduler: CosineAnnealingWarmRestarts):
+        self._lr_scheduler = lr_scheduler
+
+    @property
+    def last_epoch(self) -> int:
+        return self._lr_scheduler.last_epoch
+
+    @last_epoch.setter
+    def last_epoch(self, value: int) -> None:
+        self._lr_scheduler.last_epoch = value
+
+    @property
+    def optimizer(self) -> torch.optim.Optimizer:
+        return self._lr_scheduler.optimizer
+
+    def get_lr(self, epoch: Optional[int] = None) -> List[float]:
+        T_mult = self._lr_scheduler.T_mult
+        eta_min = self._lr_scheduler.eta_min
+        # Without an explicit epoch, the first call (last_epoch < 0) is treated as epoch 0.
+        if epoch is None and self.last_epoch < 0:
+            epoch = 0
+        if epoch is None:
+            epoch = self.last_epoch + 1
+            self._lr_scheduler.T_cur = self._lr_scheduler.T_cur + 1
+            if self._lr_scheduler.T_cur >= self._lr_scheduler.T_i:
+                self._lr_scheduler.T_cur = self._lr_scheduler.T_cur - self._lr_scheduler.T_i
+                self._lr_scheduler.T_i = self._lr_scheduler.T_i * T_mult
+        else:
+            if epoch < 0:
+                raise ValueError("Expected non-negative epoch, but got {}".format(epoch))
+            if epoch >= self._lr_scheduler.T_0:
+                if T_mult == 1:
+                    self._lr_scheduler.T_cur = epoch % self._lr_scheduler.T_0
+                else:
+                    n = int(math.log((epoch / self._lr_scheduler.T_0 * (T_mult - 1) + 1), T_mult))
+                    self._lr_scheduler.T_cur = epoch - self._lr_scheduler.T_0 * (T_mult**n - 1) / (T_mult - 1)
+                    self._lr_scheduler.T_i = self._lr_scheduler.T_0 * T_mult**n
+            else:
+                self._lr_scheduler.T_i = self._lr_scheduler.T_0
+                self._lr_scheduler.T_cur = epoch
+        # Store the integer part of the epoch on the wrapped scheduler (via the property setter).
+        self.last_epoch = math.floor(epoch)
+        # Cosine annealing: yields base_lr at T_cur == 0 and approaches eta_min as T_cur nears T_i.
+        return [
+            eta_min
+            + (base_lr - eta_min) * (1 + math.cos(math.pi * self._lr_scheduler.T_cur / self._lr_scheduler.T_i)) / 2
+            for base_lr in self._lr_scheduler.base_lrs
+        ]
+
+
 class LRScheduler(ParamScheduler):
     """A wrapper class to call `torch.optim.lr_scheduler` objects as `ignite` handlers.
 
@@ -848,7 +939,6 @@ def __init__(
         save_history: bool = False,
         use_legacy: bool = False,
     ):
-
         if not isinstance(lr_scheduler, PyTorchLRScheduler):
             raise TypeError(
                 "Argument lr_scheduler should be a subclass of "
@@ -856,7 +946,10 @@ def __init__(
                 f"but given {type(lr_scheduler)}"
             )
 
-        self.lr_scheduler = lr_scheduler
+        self.lr_scheduler: Union[PyTorchLRScheduler, _CosineAnnealingWarmRestarts] = lr_scheduler
+        if isinstance(lr_scheduler, CosineAnnealingWarmRestarts):
+            self.lr_scheduler = _CosineAnnealingWarmRestarts(lr_scheduler)
+
         super(LRScheduler, self).__init__(
             optimizer=self.lr_scheduler.optimizer,
             param_name="lr",
@@ -866,7 +959,7 @@ def __init__(
             warnings.warn(
                 "Please make sure to attach scheduler to Events.ITERATION_COMPLETED "
                 "instead of Events.ITERATION_STARTED to make sure to use "
-                "the first lr value from the optimizer, otherwise it is will be skipped"
+                "the first lr value from the optimizer, otherwise it will be skipped"
             )
             self.lr_scheduler.last_epoch += 1
 
@@ -879,9 +972,9 @@ def __call__(self, engine: Optional[Engine], name: Optional[str] = None) -> None
     def get_param(self) -> Union[float, List[float]]:
         """Method to get current optimizer's parameter value"""
         # Emulate context manager for pytorch>=1.4
-        self.lr_scheduler._get_lr_called_within_step = True  # type: ignore[attr-defined]
+        self.lr_scheduler._get_lr_called_within_step = True  # type: ignore[union-attr]
         lr_list = cast(List[float], self.lr_scheduler.get_lr())
-        self.lr_scheduler._get_lr_called_within_step = False  # type: ignore[attr-defined]
+        self.lr_scheduler._get_lr_called_within_step = False  # type: ignore[union-attr]
         if len(lr_list) == 1:
             return lr_list[0]
         else:
@@ -1019,7 +1112,6 @@ def print_lr():
     warmup_schedulers: List[ParamScheduler] = []
 
     for param_group_index, param_group in enumerate(lr_scheduler.optimizer.param_groups):
-
         if warmup_end_value is None:
             param_group_warmup_end_value = param_group["lr"]
         else:
@@ -1458,20 +1550,19 @@ class ReduceLROnPlateauScheduler(ParamScheduler):
             Default: False.
         param_group_index: `optimizer`'s parameters group
             to use.  Default: None. Use all `optimizer`'s paramater groups.
-        **scheduler_kwargs: Keyword arguments to be passed to the wrapped
-            `ReduceLROnPlateau`.
+        scheduler_kwargs: Keyword arguments to be passed to the wrapped ``ReduceLROnPlateau``.
 
     Examples:
 
-        .. code-block python
+        .. code-block:: python
 
-            # Metric 'metric-name' should surpass its best value by
+            # Metric "accuracy" should increase the best value by
             # more than 1 unit after at most 2 epochs, otherwise LR
             # would get multiplied by 0.5 .
 
             scheduler = ReduceLROnPlateauScheduler(
                 default_optimizer,
-                metric_name="metric-name", mode="max",
+                metric_name="accuracy", mode="max",
                 factor=0.5, patience=1, threshold_mode='abs',
                 threshold=1, trainer=trainer
             )
@@ -1488,10 +1579,10 @@ class ReduceLROnPlateauScheduler(ParamScheduler):
 
             default_trainer = get_default_trainer()
 
-            # Metric `loss` should decrease more than
-            # a tenth of best loss after at most
+            # Metric "loss" should decrease more than
+            # 0.1 of best loss after at most
             # three iterations. Then best loss would get
-            # updated, otherwise lr is multiplied by 2
+            # updated, otherwise lr is multiplied by 0.5
 
             scheduler = ReduceLROnPlateauScheduler(
                 default_optimizer, "loss",
diff --git a/ignite/handlers/state_param_scheduler.py b/ignite/handlers/state_param_scheduler.py
index 03099c59739e..5922ee1f8b24 100644
--- a/ignite/handlers/state_param_scheduler.py
+++ b/ignite/handlers/state_param_scheduler.py
@@ -382,7 +382,7 @@ def __init__(
         self._state_attrs += ["initial_value", "gamma"]
 
     def get_param(self) -> Union[List[float], float]:
-        return self.initial_value * self.gamma ** self.event_index
+        return self.initial_value * self.gamma**self.event_index
 
 
 class StepStateScheduler(StateParamScheduler):
diff --git a/ignite/handlers/terminate_on_nan.py b/ignite/handlers/terminate_on_nan.py
index 32f8053c652a..bf8baa5a73e0 100644
--- a/ignite/handlers/terminate_on_nan.py
+++ b/ignite/handlers/terminate_on_nan.py
@@ -40,7 +40,6 @@ def __call__(self, engine: Engine) -> None:
         output = self._output_transform(engine.state.output)
 
         def raise_error(x: Union[float, torch.Tensor]) -> None:
-
             if isinstance(x, numbers.Number):
                 x = torch.tensor(x)
 
diff --git a/ignite/handlers/time_limit.py b/ignite/handlers/time_limit.py
index e5fb4ad7a25f..4ab934a2534d 100644
--- a/ignite/handlers/time_limit.py
+++ b/ignite/handlers/time_limit.py
@@ -29,7 +29,6 @@ class TimeLimit:
     """
 
     def __init__(self, limit_sec: Optional[int] = 28800):
-
         if not isinstance(limit_sec, int):
             raise TypeError("Argument limit_sec should be an integer.")
         if limit_sec <= 0:
diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py
index af73f88266c8..426b35a21abc 100644
--- a/ignite/metrics/accumulation.py
+++ b/ignite/metrics/accumulation.py
@@ -38,6 +38,7 @@ class VariableAccumulation(Metric):
     """
 
     required_output_keys = None
+    _state_dict_all_req_keys = ("accumulator", "num_examples")
 
     def __init__(
         self,
diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py
index 9548f962bf98..0bfe62b85b7b 100644
--- a/ignite/metrics/accuracy.py
+++ b/ignite/metrics/accuracy.py
@@ -51,10 +51,10 @@ def _check_shape(self, output: Sequence[torch.Tensor]) -> None:
     def _check_binary_multilabel_cases(self, output: Sequence[torch.Tensor]) -> None:
         y_pred, y = output
 
-        if not torch.equal(y, y ** 2):
+        if not torch.equal(y, y**2):
             raise ValueError("For binary cases, y must be comprised of 0's and 1's.")
 
-        if not torch.equal(y_pred, y_pred ** 2):
+        if not torch.equal(y_pred, y_pred**2):
             raise ValueError("For binary cases, y_pred must be comprised of 0's and 1's.")
 
     def _check_type(self, output: Sequence[torch.Tensor]) -> None:
@@ -208,6 +208,8 @@ def thresholded_output_transform(output):
             0.6666...
     """
 
+    _state_dict_all_req_keys = ("_num_correct", "_num_examples")
+
     def __init__(
         self,
         output_transform: Callable = lambda x: x,
diff --git a/ignite/metrics/classification_report.py b/ignite/metrics/classification_report.py
index 40809b0eef5c..55613dc8d8cd 100644
--- a/ignite/metrics/classification_report.py
+++ b/ignite/metrics/classification_report.py
@@ -131,12 +131,12 @@ def _wrapper(
             dict_obj[_get_label_for_class(idx)] = {
                 "precision": p_label.item(),
                 "recall": re[idx].item(),
-                "f{0}-score".format(beta): f[idx].item(),
+                f"f{beta}-score": f[idx].item(),
             }
         dict_obj["macro avg"] = {
             "precision": a_pr.item(),
             "recall": a_re.item(),
-            "f{0}-score".format(beta): a_f.item(),
+            f"f{beta}-score": a_f.item(),
         }
         return dict_obj if output_dict else json.dumps(dict_obj)
 
diff --git a/ignite/metrics/confusion_matrix.py b/ignite/metrics/confusion_matrix.py
index a5021631cddb..a55bbedebb8f 100644
--- a/ignite/metrics/confusion_matrix.py
+++ b/ignite/metrics/confusion_matrix.py
@@ -99,6 +99,8 @@ def binary_one_hot_output_transform(output):
                     [1, 1]])
     """
 
+    _state_dict_all_req_keys = ("confusion_matrix", "_num_examples")
+
     def __init__(
         self,
         num_classes: int,
@@ -166,7 +168,7 @@ def update(self, output: Sequence[torch.Tensor]) -> None:
         y_pred = y_pred[target_mask]
 
         indices = self.num_classes * y + y_pred
-        m = torch.bincount(indices, minlength=self.num_classes ** 2).reshape(self.num_classes, self.num_classes)
+        m = torch.bincount(indices, minlength=self.num_classes**2).reshape(self.num_classes, self.num_classes)
         self.confusion_matrix += m.to(self.confusion_matrix)
 
     @sync_all_reduce("confusion_matrix", "_num_examples")
diff --git a/ignite/metrics/epoch_metric.py b/ignite/metrics/epoch_metric.py
index 21b199bfd542..116a841e49ff 100644
--- a/ignite/metrics/epoch_metric.py
+++ b/ignite/metrics/epoch_metric.py
@@ -67,6 +67,8 @@ def mse_fn(y_preds, y_targets):
         To disable the warning, set ``check_compute_fn=False``.
     """
 
+    _state_dict_all_req_keys = ("_predictions", "_targets")
+
     def __init__(
         self,
         compute_fn: Callable[[torch.Tensor, torch.Tensor], float],
@@ -74,7 +76,6 @@ def __init__(
         check_compute_fn: bool = True,
         device: Union[str, torch.device] = torch.device("cpu"),
     ) -> None:
-
         if not callable(compute_fn):
             raise TypeError("Argument compute_fn should be callable.")
 
diff --git a/ignite/metrics/fbeta.py b/ignite/metrics/fbeta.py
index 6e87ed4910e5..6522efc64231 100644
--- a/ignite/metrics/fbeta.py
+++ b/ignite/metrics/fbeta.py
@@ -167,7 +167,7 @@ def thresholded_output_transform(output):
     elif recall._average:
         raise ValueError("Input recall metric should have average=False")
 
-    fbeta = (1.0 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall + 1e-15)
+    fbeta = (1.0 + beta**2) * precision * recall / (beta**2 * precision + recall + 1e-15)
 
     if average:
         fbeta = fbeta.mean().item()
diff --git a/ignite/metrics/gan/fid.py b/ignite/metrics/gan/fid.py
index fc1065f697b8..54c5df06f57d 100644
--- a/ignite/metrics/gan/fid.py
+++ b/ignite/metrics/gan/fid.py
@@ -21,7 +21,6 @@
 def fid_score(
     mu1: torch.Tensor, mu2: torch.Tensor, sigma1: torch.Tensor, sigma2: torch.Tensor, eps: float = 1e-6
 ) -> float:
-
     try:
         import numpy as np
     except ImportError:
@@ -164,6 +163,8 @@ def forward(self, x):
     .. versionadded:: 0.4.6
     """
 
+    _state_dict_all_req_keys = ("_num_examples", "_train_total", "_test_total", "_train_sigma", "_test_sigma")
+
     def __init__(
         self,
         num_features: Optional[int] = None,
@@ -171,7 +172,6 @@ def __init__(
         output_transform: Callable = lambda x: x,
         device: Union[str, torch.device] = torch.device("cpu"),
     ) -> None:
-
         try:
             import numpy as np  # noqa: F401
         except ImportError:
@@ -197,7 +197,6 @@ def __init__(
 
     @staticmethod
     def _online_update(features: torch.Tensor, total: torch.Tensor, sigma: torch.Tensor) -> None:
-
         total += features
         sigma += torch_outer(features, features)
 
@@ -213,7 +212,6 @@ def _get_covariance(self, sigma: torch.Tensor, total: torch.Tensor) -> torch.Ten
 
     @reinit__is_reduced
     def reset(self) -> None:
-
         self._train_sigma = torch.zeros(
             (self._num_features, self._num_features), dtype=torch.float64, device=self._device
         )
@@ -231,7 +229,6 @@ def reset(self) -> None:
 
     @reinit__is_reduced
     def update(self, output: Sequence[torch.Tensor]) -> None:
-
         train, test = output
         train_features = self._extract_features(train)
         test_features = self._extract_features(test)
@@ -255,7 +252,6 @@ def update(self, output: Sequence[torch.Tensor]) -> None:
 
     @sync_all_reduce("_num_examples", "_train_total", "_test_total", "_train_sigma", "_test_sigma")
     def compute(self) -> float:
-
         fid = fid_score(
             mu1=self._train_total / self._num_examples,
             mu2=self._test_total / self._num_examples,
diff --git a/ignite/metrics/gan/inception_score.py b/ignite/metrics/gan/inception_score.py
index 9676194019d7..60b1d4785f71 100644
--- a/ignite/metrics/gan/inception_score.py
+++ b/ignite/metrics/gan/inception_score.py
@@ -77,6 +77,8 @@ class InceptionScore(_BaseInceptionMetric):
     .. versionadded:: 0.4.6
     """
 
+    _state_dict_all_req_keys = ("_num_examples", "_prob_total", "_total_kl_d")
+
     def __init__(
         self,
         num_features: Optional[int] = None,
@@ -84,7 +86,6 @@ def __init__(
         output_transform: Callable = lambda x: x,
         device: Union[str, torch.device] = torch.device("cpu"),
     ) -> None:
-
         if num_features is None and feature_extractor is None:
             num_features = 1000
             feature_extractor = InceptionModel(return_features=False, device=device)
@@ -100,7 +101,6 @@ def __init__(
 
     @reinit__is_reduced
     def reset(self) -> None:
-
         self._num_examples = 0
 
         self._prob_total = torch.zeros(self._num_features, dtype=torch.float64, device=self._device)
@@ -110,7 +110,6 @@ def reset(self) -> None:
 
     @reinit__is_reduced
     def update(self, output: torch.Tensor) -> None:
-
         probabilities = self._extract_features(output)
 
         prob_sum = torch.sum(probabilities, 0, dtype=torch.float64)
@@ -125,7 +124,6 @@ def update(self, output: torch.Tensor) -> None:
 
     @sync_all_reduce("_num_examples", "_prob_total", "_total_kl_d")
     def compute(self) -> float:
-
         if self._num_examples == 0:
             raise NotComputableError("InceptionScore must have at least one example before it can be computed.")
 
diff --git a/ignite/metrics/gan/utils.py b/ignite/metrics/gan/utils.py
index dc501867353b..f8226dafd1df 100644
--- a/ignite/metrics/gan/utils.py
+++ b/ignite/metrics/gan/utils.py
@@ -57,7 +57,6 @@ def __init__(
         output_transform: Callable = lambda x: x,
         device: Union[str, torch.device] = torch.device("cpu"),
     ) -> None:
-
         if num_features is None:
             raise ValueError("Argument num_features must be provided, if feature_extractor is specified.")
 
@@ -78,7 +77,6 @@ def __init__(
         super(_BaseInceptionMetric, self).__init__(output_transform=output_transform, device=device)
 
     def _check_feature_shapes(self, samples: torch.Tensor) -> None:
-
         if samples.dim() != 2:
             raise ValueError(f"feature_extractor output must be a tensor of dim 2, got: {samples.dim()}")
 
@@ -91,7 +89,6 @@ def _check_feature_shapes(self, samples: torch.Tensor) -> None:
             )
 
     def _extract_features(self, inputs: torch.Tensor) -> torch.Tensor:
-
         inputs = inputs.detach()
 
         if inputs.device != torch.device(self._device):
diff --git a/ignite/metrics/loss.py b/ignite/metrics/loss.py
index 71f67db4979f..7182e7033d54 100644
--- a/ignite/metrics/loss.py
+++ b/ignite/metrics/loss.py
@@ -65,6 +65,7 @@ class Loss(Metric):
     """
 
     required_output_keys = ("y_pred", "y", "criterion_kwargs")
+    _state_dict_all_req_keys = ("_sum", "_num_examples")
 
     def __init__(
         self,
diff --git a/ignite/metrics/mean_absolute_error.py b/ignite/metrics/mean_absolute_error.py
index 6066e8cda341..eb90d3aa3c24 100644
--- a/ignite/metrics/mean_absolute_error.py
+++ b/ignite/metrics/mean_absolute_error.py
@@ -59,6 +59,8 @@ class MeanAbsoluteError(Metric):
             2.9375
     """
 
+    _state_dict_all_req_keys = ("_sum_of_absolute_errors", "_num_examples")
+
     @reinit__is_reduced
     def reset(self) -> None:
         self._sum_of_absolute_errors = torch.tensor(0.0, device=self._device)
diff --git a/ignite/metrics/mean_pairwise_distance.py b/ignite/metrics/mean_pairwise_distance.py
index 4b33dc4cbab5..79676564e5fb 100644
--- a/ignite/metrics/mean_pairwise_distance.py
+++ b/ignite/metrics/mean_pairwise_distance.py
@@ -59,6 +59,8 @@ class MeanPairwiseDistance(Metric):
             1.5955...
     """
 
+    _state_dict_all_req_keys = ("_sum_of_distances", "_num_examples")
+
     def __init__(
         self,
         p: int = 2,
diff --git a/ignite/metrics/mean_squared_error.py b/ignite/metrics/mean_squared_error.py
index 3752b728138d..3407b4adcb70 100644
--- a/ignite/metrics/mean_squared_error.py
+++ b/ignite/metrics/mean_squared_error.py
@@ -59,6 +59,8 @@ class MeanSquaredError(Metric):
             3.828125
     """
 
+    _state_dict_all_req_keys = ("_sum_of_squared_errors", "_num_examples")
+
     @reinit__is_reduced
     def reset(self) -> None:
         self._sum_of_squared_errors = torch.tensor(0.0, device=self._device)
diff --git a/ignite/metrics/metric.py b/ignite/metrics/metric.py
index 26cb3c12560d..39e5cb745222 100644
--- a/ignite/metrics/metric.py
+++ b/ignite/metrics/metric.py
@@ -1,18 +1,31 @@
 from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
 from collections.abc import Mapping
 from functools import wraps
 from numbers import Number
-from typing import Any, Callable, cast, Dict, Optional, Sequence, Tuple, TYPE_CHECKING, Union
+from typing import Any, Callable, cast, Dict, List, Optional, Sequence, Tuple, TYPE_CHECKING, Union
 
 import torch
 
 import ignite.distributed as idist
+
+from ignite.base.mixins import Serializable
 from ignite.engine import CallableEventWithFilter, Engine, Events
+from ignite.utils import _CollectionItem, _tree_apply2, _tree_map
 
 if TYPE_CHECKING:
     from ignite.metrics.metrics_lambda import MetricsLambda
 
-__all__ = ["Metric", "MetricUsage", "EpochWise", "BatchWise", "BatchFiltered"]
+__all__ = [
+    "Metric",
+    "MetricUsage",
+    "EpochWise",
+    "BatchWise",
+    "BatchFiltered",
+    "RunningEpochWise",
+    "RunningBatchWise",
+    "SingleEpochRunningBatchWise",
+]
 
 
 class MetricUsage:
@@ -31,6 +44,8 @@ class MetricUsage:
             :meth:`~ignite.metrics.metric.Metric.iteration_completed`.
     """
 
+    usage_name: str
+
     def __init__(self, started: Events, completed: Events, iteration_completed: CallableEventWithFilter) -> None:
         self.__started = started
         self.__completed = completed
@@ -74,6 +89,33 @@ def __init__(self) -> None:
         )
 
 
+class RunningEpochWise(EpochWise):
+    """
+    Running epoch-wise usage of Metrics. It's the running version of the :class:`~.metrics.metric.EpochWise` metric
+    usage. A metric with such a usage most likely accompanies an :class:`~.metrics.metric.EpochWise` one to compute
+    a running measure of it e.g. running average.
+
+    Metric's methods are triggered on the following engine events:
+
+    - :meth:`~ignite.metrics.metric.Metric.started` on every ``STARTED``
+      (See :class:`~ignite.engine.events.Events`).
+    - :meth:`~ignite.metrics.metric.Metric.iteration_completed` on every ``EPOCH_COMPLETED``.
+    - :meth:`~ignite.metrics.metric.Metric.completed` on every ``EPOCH_COMPLETED``.
+
+    Attributes:
+        usage_name: usage name string
+    """
+
+    usage_name: str = "running_epoch_wise"
+
+    def __init__(self) -> None:
+        super(EpochWise, self).__init__(
+            started=Events.STARTED,
+            completed=Events.EPOCH_COMPLETED,
+            iteration_completed=Events.EPOCH_COMPLETED,
+        )
+
+
 class BatchWise(MetricUsage):
     """
     Batch-wise usage of Metrics.
@@ -99,6 +141,59 @@ def __init__(self) -> None:
         )
 
 
+class RunningBatchWise(BatchWise):
+    """
+    Running batch-wise usage of Metrics. It's the running version of the :class:`~.metrics.metric.BatchWise` metric
+    usage. A metric with such a usage could for example accompany a :class:`~.metrics.metric.BatchWise` one to compute
+    a running measure of it e.g. running average.
+
+    Metric's methods are triggered on the following engine events:
+
+    - :meth:`~ignite.metrics.metric.Metric.started` on every ``STARTED``
+      (See :class:`~ignite.engine.events.Events`).
+    - :meth:`~ignite.metrics.metric.Metric.iteration_completed` on every ``ITERATION_COMPLETED``.
+    - :meth:`~ignite.metrics.metric.Metric.completed` on every ``ITERATION_COMPLETED``.
+
+    Attributes:
+        usage_name: usage name string
+    """
+
+    usage_name: str = "running_batch_wise"
+
+    def __init__(self) -> None:
+        super(BatchWise, self).__init__(
+            started=Events.STARTED,
+            completed=Events.ITERATION_COMPLETED,
+            iteration_completed=Events.ITERATION_COMPLETED,
+        )
+
+
+class SingleEpochRunningBatchWise(BatchWise):
+    """
+    Running batch-wise usage of Metrics in a single epoch. It's like :class:`~.metrics.metric.RunningBatchWise` metric
+    usage with the difference that it is used during a single epoch.
+
+    Metric's methods are triggered on the following engine events:
+
+    - :meth:`~ignite.metrics.metric.Metric.started` on every ``EPOCH_STARTED``
+      (See :class:`~ignite.engine.events.Events`).
+    - :meth:`~ignite.metrics.metric.Metric.iteration_completed` on every ``ITERATION_COMPLETED``.
+    - :meth:`~ignite.metrics.metric.Metric.completed` on every ``ITERATION_COMPLETED``.
+
+    Attributes:
+        usage_name: usage name string
+    """
+
+    usage_name: str = "single_epoch_running_batch_wise"
+
+    def __init__(self) -> None:
+        super(BatchWise, self).__init__(
+            started=Events.EPOCH_STARTED,
+            completed=Events.ITERATION_COMPLETED,
+            iteration_completed=Events.ITERATION_COMPLETED,
+        )
+
+
 class BatchFiltered(MetricUsage):
     """
     Batch filtered usage of Metrics. This usage is similar to epoch-wise but update event is filtered.
@@ -125,7 +220,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
         )
 
 
-class Metric(metaclass=ABCMeta):
+class Metric(Serializable, metaclass=ABCMeta):
     """
     Base class for all Metrics.
 
@@ -219,7 +314,7 @@ def __init__(
     @abstractmethod
     def reset(self) -> None:
         """
-        Resets the metric to it's initial state.
+        Resets the metric to its initial state.
 
         By default, this is called at the start of each epoch.
         """
@@ -240,7 +335,7 @@ def update(self, output: Any) -> None:
     @abstractmethod
     def compute(self) -> Any:
         """
-        Computes the metric based on it's accumulated state.
+        Computes the metric based on its accumulated state.
 
         By default, this is called at the end of each epoch.
 
@@ -273,7 +368,7 @@ def iteration_completed(self, engine: Engine) -> None:
 
         Note:
             ``engine.state.output`` is used to compute metric values.
-            The majority of implemented metrics accepts the following formats for ``engine.state.output``:
+            The majority of implemented metrics accept the following formats for ``engine.state.output``:
             ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. ``y_pred`` and ``y`` can be torch tensors or
             list of tensors/numbers if applicable.
 
@@ -344,12 +439,16 @@ def completed(self, engine: Engine, name: str) -> None:
 
     def _check_usage(self, usage: Union[str, MetricUsage]) -> MetricUsage:
         if isinstance(usage, str):
-            if usage == EpochWise.usage_name:
-                usage = EpochWise()
-            elif usage == BatchWise.usage_name:
-                usage = BatchWise()
-            else:
-                raise ValueError(f"usage should be 'EpochWise.usage_name' or 'BatchWise.usage_name', get {usage}")
+            usages = [EpochWise, RunningEpochWise, BatchWise, RunningBatchWise, SingleEpochRunningBatchWise]
+            for usage_cls in usages:
+                if usage == usage_cls.usage_name:
+                    usage = usage_cls()
+                    break
+            if not isinstance(usage, MetricUsage):
+                raise ValueError(
+                    "Argument usage should be '(Running)EpochWise.usage_name' or "
+                    f"'((SingleEpoch)Running)BatchWise.usage_name', got {usage}"
+                )
         if not isinstance(usage, MetricUsage):
             raise TypeError(f"Unhandled usage type {type(usage)}")
         return usage
@@ -451,6 +550,97 @@ def is_attached(self, engine: Engine, usage: Union[str, MetricUsage] = EpochWise
         usage = self._check_usage(usage)
         return engine.has_event_handler(self.completed, usage.COMPLETED)
 
+    def _state_dict_per_rank(self) -> OrderedDict:
+        def func(
+            x: Union[torch.Tensor, Metric, None, float], **kwargs: Any
+        ) -> Union[torch.Tensor, float, OrderedDict, None]:
+            if isinstance(x, Metric):
+                return x._state_dict_per_rank()
+            if x is None or isinstance(x, (int, float, torch.Tensor)):
+                return x
+            else:
+                raise TypeError(
+                    "Found attribute of unsupported type. Currently, supported types include"
+                    " numeric types, tensor, Metric or sequence/mapping of metrics."
+                )
+
+        state: OrderedDict[str, Union[torch.Tensor, List, Dict, None]] = OrderedDict()
+        for attr_name in self._state_dict_all_req_keys:
+            if attr_name not in self.__dict__:
+                raise ValueError(
+                    f"Found a value in _state_dict_all_req_keys that is not among metric attributes: {attr_name}"
+                )
+            attr = getattr(self, attr_name)
+            state[attr_name] = _tree_map(func, attr)  # type: ignore[assignment]
+
+        return state
+
+    __state_dict_key_per_rank: str = "__metric_state_per_rank"
+
+    def state_dict(self) -> OrderedDict:
+        """Method returns state dict with attributes of the metric specified in its
+        `_state_dict_all_req_keys` attribute. Can be used to save internal state of the class.
+        """
+        state = self._state_dict_per_rank()
+
+        if idist.get_world_size() > 1:
+            return OrderedDict([(Metric.__state_dict_key_per_rank, idist.all_gather(state))])
+        return OrderedDict([(Metric.__state_dict_key_per_rank, [state])])
+
+    def _load_state_dict_per_rank(self, state_dict: Mapping) -> None:
+        super().load_state_dict(state_dict)
+
+        def func(x: Any, y: Any) -> None:
+            if isinstance(x, Metric):
+                x._load_state_dict_per_rank(y)
+            elif isinstance(x, _CollectionItem):
+                value = x.value()
+                if y is None or isinstance(y, _CollectionItem.types_as_collection_item):
+                    x.load_value(y)
+                elif isinstance(value, Metric):
+                    value._load_state_dict_per_rank(y)
+                else:
+                    raise ValueError(f"Unsupported type for provided state_dict data: {type(y)}")
+
+        for attr_name in self._state_dict_all_req_keys:
+            attr = getattr(self, attr_name)
+            attr = _CollectionItem.wrap(self.__dict__, attr_name, attr)
+            _tree_apply2(func, attr, state_dict[attr_name])
+
+    def load_state_dict(self, state_dict: Mapping) -> None:
+        """Method replaces internal state of the class with provided state dict data.
+
+        If there's an active distributed configuration, the process uses its rank to pick the proper state from
+        the list of per-rank states saved under the single top-level key of the dict.
+
+        Args:
+            state_dict: a dict as returned by :meth:`state_dict`, i.e. a single key mapping to a list with one
+                entry per rank, each holding the attributes listed in `_state_dict_all_req_keys`.
+        """
+        if not isinstance(state_dict, Mapping):
+            raise TypeError(f"Argument state_dict should be a dictionary, but given {type(state_dict)}")
+
+        if not (len(state_dict) == 1 and Metric.__state_dict_key_per_rank in state_dict):
+            raise ValueError(
+                "Incorrect state_dict object. Argument state_dict should be a dictionary "
+                "provided by Metric.state_dict(). "
+                f"Expected single key: {Metric.__state_dict_key_per_rank}, but given {state_dict.keys()}"
+            )
+
+        list_state_dicts_per_rank = state_dict[Metric.__state_dict_key_per_rank]
+        rank = idist.get_rank()
+        world_size = idist.get_world_size()
+        if len(list_state_dicts_per_rank) != world_size:
+            raise ValueError(
+                "Incorrect state_dict object. Argument state_dict should be a dictionary "
+                "provided by Metric.state_dict(). "
+                f"Expected a list of state_dicts of size equal world_size: {world_size}, "
+                f"but got {len(list_state_dicts_per_rank)}"
+            )
+
+        state_dict = list_state_dicts_per_rank[rank]
+        self._load_state_dict_per_rank(state_dict)
+
     def __add__(self, other: Any) -> "MetricsLambda":
         from ignite.metrics.metrics_lambda import MetricsLambda
 
@@ -484,12 +674,12 @@ def __rmul__(self, other: Any) -> "MetricsLambda":
     def __pow__(self, other: Any) -> "MetricsLambda":
         from ignite.metrics.metrics_lambda import MetricsLambda
 
-        return MetricsLambda(lambda x, y: x ** y, self, other)
+        return MetricsLambda(lambda x, y: x**y, self, other)
 
     def __rpow__(self, other: Any) -> "MetricsLambda":
         from ignite.metrics.metrics_lambda import MetricsLambda
 
-        return MetricsLambda(lambda x, y: x ** y, other, self)
+        return MetricsLambda(lambda x, y: x**y, other, self)
 
     def __mod__(self, other: Any) -> "MetricsLambda":
         from ignite.metrics.metrics_lambda import MetricsLambda
diff --git a/ignite/metrics/metrics_lambda.py b/ignite/metrics/metrics_lambda.py
index 36e9f1a26eb0..6308e3871380 100644
--- a/ignite/metrics/metrics_lambda.py
+++ b/ignite/metrics/metrics_lambda.py
@@ -90,9 +90,11 @@ def Fbeta(r, p, beta):
             assert not precision.is_attached(engine)
     """
 
+    _state_dict_all_req_keys = ("_updated", "args", "kwargs")
+
     def __init__(self, f: Callable, *args: Any, **kwargs: Any) -> None:
         self.function = f
-        self.args = args
+        self.args = list(args)  # we need args to be a list instead of a tuple for state_dict/load_state_dict feature
         self.kwargs = kwargs
         self.engine: Optional[Engine] = None
         self._updated = False
diff --git a/ignite/metrics/multilabel_confusion_matrix.py b/ignite/metrics/multilabel_confusion_matrix.py
index fef9ad5ac467..2a7b25d68c67 100644
--- a/ignite/metrics/multilabel_confusion_matrix.py
+++ b/ignite/metrics/multilabel_confusion_matrix.py
@@ -81,6 +81,8 @@ class MultiLabelConfusionMatrix(Metric):
 
     """
 
+    _state_dict_all_req_keys = ("confusion_matrix", "_num_examples")
+
     def __init__(
         self,
         num_classes: int,
@@ -164,8 +166,8 @@ def _check_input(self, output: Sequence[torch.Tensor]) -> None:
         if y.dtype not in valid_types:
             raise ValueError(f"y must be of any type: {valid_types}")
 
-        if not torch.equal(y_pred, y_pred ** 2):
+        if not torch.equal(y_pred, y_pred**2):
             raise ValueError("y_pred must be a binary tensor")
 
-        if not torch.equal(y, y ** 2):
+        if not torch.equal(y, y**2):
             raise ValueError("y must be a binary tensor")
diff --git a/ignite/metrics/nlp/bleu.py b/ignite/metrics/nlp/bleu.py
index 7c84c7915d19..ed3b14b4dc52 100644
--- a/ignite/metrics/nlp/bleu.py
+++ b/ignite/metrics/nlp/bleu.py
@@ -49,7 +49,6 @@ def smooth2(numerators: torch.Tensor, denominators: torch.Tensor) -> Sequence[fl
 
     @staticmethod
     def _smooth2(numerators: torch.Tensor, denominators: torch.Tensor) -> Sequence[float]:
-
         return [
             (n.item() + 1) / (d.item() + 1) if i != 0 else n.item() / d.item()
             for i, (n, d) in enumerate(zip(numerators, denominators))
@@ -148,6 +147,11 @@ def __init__(
             raise ValueError(f'Average must be either "macro" or "micro" (got: {average})')
         self.average = average
 
+        if average == "micro":
+            self._state_dict_all_req_keys = ("p_numerators", "p_denominators", "hyp_length_sum", "ref_length_sum")
+        else:
+            self._state_dict_all_req_keys = ("_sum_of_bleu", "_num_sentences")
+
         super(Bleu, self).__init__(output_transform=output_transform, device=device)
 
     def _n_gram_counter(
@@ -157,7 +161,6 @@ def _n_gram_counter(
         p_numerators: torch.Tensor,
         p_denominators: torch.Tensor,
     ) -> Tuple[int, int]:
-
         if len(references) != len(candidates):
             raise ValueError(
                 f"nb of candidates should be equal to nb of reference lists ({len(candidates)} != "
@@ -187,7 +190,6 @@ def _n_gram_counter(
     def _brevity_penalty_smoothing(
         self, p_numerators: torch.Tensor, p_denominators: torch.Tensor, hyp_length_sum: int, ref_length_sum: int
     ) -> float:
-
         # Returns 0 if there's no matching n-grams
         # We only need to check for p_numerators[1] == 0, since if there's
         # no unigrams, there won't be any higher order ngrams.
@@ -216,7 +218,6 @@ def _sentence_bleu(self, references: Sequence[Sequence[Any]], candidates: Sequen
         return self._corpus_bleu([references], [candidates])
 
     def _corpus_bleu(self, references: Sequence[Sequence[Sequence[Any]]], candidates: Sequence[Sequence[Any]]) -> float:
-
         p_numerators: torch.Tensor = torch.zeros(self.ngrams_order + 1)
         p_denominators: torch.Tensor = torch.zeros(self.ngrams_order + 1)
 
@@ -234,7 +235,6 @@ def _corpus_bleu(self, references: Sequence[Sequence[Sequence[Any]]], candidates
 
     @reinit__is_reduced
     def reset(self) -> None:
-
         if self.average == "macro":
             self._sum_of_bleu = torch.tensor(0.0, dtype=torch.double, device=self._device)
             self._num_sentences = 0
@@ -270,7 +270,6 @@ def _compute_macro(self) -> torch.Tensor:
 
     @sync_all_reduce("p_numerators", "p_denominators", "hyp_length_sum", "ref_length_sum")
     def _compute_micro(self) -> float:
-
         bleu_score = self._brevity_penalty_smoothing(
             p_numerators=self.p_numerators,
             p_denominators=self.p_denominators,
diff --git a/ignite/metrics/nlp/rouge.py b/ignite/metrics/nlp/rouge.py
index 93076d4639c3..9aa87a269e61 100644
--- a/ignite/metrics/nlp/rouge.py
+++ b/ignite/metrics/nlp/rouge.py
@@ -119,6 +119,8 @@ class _BaseRouge(Metric):
     Rouge interface for Rouge-L and Rouge-N
     """
 
+    _state_dict_all_req_keys = ("_recall", "_precision", "_fmeasure", "_num_examples")
+
     def __init__(
         self,
         multiref: str = "average",
@@ -378,6 +380,8 @@ class Rouge(Metric):
         ``update`` method has changed and now works on batch of inputs.
     """
 
+    _state_dict_all_req_keys = ("internal_metrics",)
+
     def __init__(
         self,
         variants: Optional[Sequence[Union[str, int]]] = None,
diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py
index 090651720a7f..31fbd42b19b4 100644
--- a/ignite/metrics/precision.py
+++ b/ignite/metrics/precision.py
@@ -13,6 +13,8 @@
 
 
 class _BasePrecisionRecall(_BaseClassification):
+    _state_dict_all_req_keys = ("_numerator", "_denominator", "_weight", "_updated")
+
     def __init__(
         self,
         output_transform: Callable = lambda x: x,
@@ -20,7 +22,6 @@ def __init__(
         is_multilabel: bool = False,
         device: Union[str, torch.device] = torch.device("cpu"),
     ):
-
         if not (average is None or isinstance(average, bool) or average in ["macro", "micro", "weighted", "samples"]):
             raise ValueError(
                 "Argument average should be None or a boolean or one of values"
@@ -59,12 +60,11 @@ def _prepare_output(self, output: Sequence[torch.Tensor]) -> Sequence[torch.Tens
         y_pred, y = output[0].detach(), output[1].detach()
 
         if self._type == "binary" or self._type == "multiclass":
-
             num_classes = 2 if self._type == "binary" else y_pred.size(1)
             if self._type == "multiclass" and y.max() + 1 > num_classes:
                 raise ValueError(
-                    f"y_pred contains less classes than y. Number of predicted classes is {num_classes}"
-                    f" and element in y has invalid class = {y.max().item() + 1}."
+                    f"y_pred contains fewer classes than y. Number of classes in the prediction is {num_classes}"
+                    f" and an element in y has invalid class = {y.max().item() + 1}."
                 )
             y = y.view(-1)
             if self._type == "binary" and self._average is False:
@@ -88,31 +88,32 @@ def _prepare_output(self, output: Sequence[torch.Tensor]) -> Sequence[torch.Tens
 
     @reinit__is_reduced
     def reset(self) -> None:
-
-        # `numerator`, `denominator` and `weight` are three variables chosen to be abstract
-        # representatives of the ones that are measured for cases with different `average` parameters.
-        # `weight` is only used when `average='weighted'`. Actual value of these three variables is
-        # as follows.
-        #
-        # average='samples':
-        #   numerator (torch.Tensor): sum of metric value for samples
-        #   denominator (int): number of samples
-        #
-        # average='weighted':
-        #   numerator (torch.Tensor): number of true positives per class/label
-        #   denominator (torch.Tensor): number of predicted(for precision) or actual(for recall)
-        #     positives per class/label
-        #   weight (torch.Tensor): number of actual positives per class
-        #
-        # average='micro':
-        #   numerator (torch.Tensor): sum of number of true positives for classes/labels
-        #   denominator (torch.Tensor): sum of number of predicted(for precision) or actual(for recall) positives
-        #     for classes/labels
-        #
-        # average='macro' or boolean or None:
-        #   numerator (torch.Tensor): number of true positives per class/label
-        #   denominator (torch.Tensor): number of predicted(for precision) or actual(for recall)
-        #     positives per class/label
+        """
+        `numerator`, `denominator` and `weight` are three variables chosen to be abstract
+        representatives of the ones that are measured for cases with different `average` parameters.
+        `weight` is only used when `average='weighted'`. The actual values of these three variables are
+        as follows.
+
+        average='samples':
+          numerator (torch.Tensor): sum of metric value for samples
+          denominator (int): number of samples
+
+        average='weighted':
+          numerator (torch.Tensor): number of true positives per class/label
+          denominator (torch.Tensor): number of predicted(for precision) or actual(for recall) positives per
+            class/label.
+          weight (torch.Tensor): number of actual positives per class
+
+        average='micro':
+          numerator (torch.Tensor): sum of number of true positives for classes/labels
+          denominator (torch.Tensor): sum of number of predicted(for precision) or actual(for recall) positives for
+            classes/labels.
+
+        average='macro' or boolean or None:
+          numerator (torch.Tensor): number of true positives per class/label
+          denominator (torch.Tensor): number of predicted(for precision) or actual(for recall) positives per
+            class/label.
+        """
 
         self._numerator: Union[int, torch.Tensor] = 0
         self._denominator: Union[int, torch.Tensor] = 0
@@ -123,17 +124,20 @@ def reset(self) -> None:
 
     @sync_all_reduce("_numerator", "_denominator")
     def compute(self) -> Union[torch.Tensor, float]:
+        r"""
+        Return value of the metric for `average` options `'weighted'` and `'macro'` is computed as follows.
+
+        .. math::
+            \text{Precision/Recall} = \frac{ numerator }{ denominator } \cdot weight
 
-        # Return value of the metric for `average` options `'weighted'` and `'macro'` is computed as follows.
-        #
-        # .. math:: \text{Precision/Recall} = \frac{ numerator }{ denominator } \cdot weight
-        #
-        # wherein `weight` is the internal variable `weight` for `'weighted'` option and :math:`1/C`
-        # for the `macro` one. :math:`C` is the number of classes/labels.
-        #
-        # Return value of the metric for `average` options `'micro'`, `'samples'`, `False` and None is as follows.
-        #
-        # .. math:: \text{Precision/Recall} = \frac{ numerator }{ denominator }
+        wherein `weight` is the internal variable `_weight` for `'weighted'` option and :math:`1/C`
+        for the `macro` one. :math:`C` is the number of classes/labels.
+
+        Return value of the metric for `average` options `'micro'`, `'samples'`, `False` and None is as follows.
+
+        .. math::
+            \text{Precision/Recall} = \frac{ numerator }{ denominator }
+        """
 
         if not self._updated:
             raise NotComputableError(
@@ -371,22 +375,46 @@ def thresholded_output_transform(output):
 
     @reinit__is_reduced
     def update(self, output: Sequence[torch.Tensor]) -> None:
+        r"""
+        Update the metric state using prediction and target.
+
+        Args:
+            output: a tuple of two tensors (y_pred, y) whose shapes follow the table below. N stands for the batch
+                dimension, `...` for possible additional dimensions and C for class dimension.
+
+                .. list-table::
+                    :widths: 20 10 10 10
+                    :header-rows: 1
+
+                    * - Output member\\Data type
+                      - Binary
+                      - Multiclass
+                      - Multilabel
+                    * - y_pred
+                      - (N, ...)
+                      - (N, C, ...)
+                      - (N, C, ...)
+                    * - y
+                      - (N, ...)
+                      - (N, ...)
+                      - (N, C, ...)
+
+                For binary and multilabel data, both y and y_pred should consist of 0's and 1's, but for multiclass
+                data, y_pred and y should consist of probabilities and integers respectively.
+        """
         self._check_shape(output)
         self._check_type(output)
         y_pred, y, correct = self._prepare_output(output)
 
         if self._average == "samples":
-
             all_positives = y_pred.sum(dim=1)
             true_positives = correct.sum(dim=1)
             self._numerator += torch.sum(true_positives / (all_positives + self.eps))
             self._denominator += y.size(0)
         elif self._average == "micro":
-
             self._denominator += y_pred.sum()
             self._numerator += correct.sum()
         else:  # _average in [False, None, 'macro', 'weighted']
-
             self._denominator += y_pred.sum(dim=0)
             self._numerator += correct.sum(dim=0)
 
diff --git a/ignite/metrics/psnr.py b/ignite/metrics/psnr.py
index f4dc59669afc..4251a24f8f13 100644
--- a/ignite/metrics/psnr.py
+++ b/ignite/metrics/psnr.py
@@ -81,6 +81,8 @@ def get_y_channel(output):
     .. versionadded:: 0.4.3
     """
 
+    _state_dict_all_req_keys = ("_sum_of_batchwise_psnr", "_num_examples")
+
     def __init__(
         self,
         data_range: Union[int, float],
@@ -114,7 +116,7 @@ def update(self, output: Sequence[torch.Tensor]) -> None:
 
         dim = tuple(range(1, y.ndim))
         mse_error = torch.pow(y_pred.double() - y.view_as(y_pred).double(), 2).mean(dim=dim)
-        self._sum_of_batchwise_psnr += torch.sum(10.0 * torch.log10(self.data_range ** 2 / (mse_error + 1e-10))).to(
+        self._sum_of_batchwise_psnr += torch.sum(10.0 * torch.log10(self.data_range**2 / (mse_error + 1e-10))).to(
             device=self._device
         )
         self._num_examples += y.shape[0]
diff --git a/ignite/metrics/recall.py b/ignite/metrics/recall.py
index 77afdb8a1d1b..b570951e291f 100644
--- a/ignite/metrics/recall.py
+++ b/ignite/metrics/recall.py
@@ -221,17 +221,14 @@ def update(self, output: Sequence[torch.Tensor]) -> None:
         _, y, correct = self._prepare_output(output)
 
         if self._average == "samples":
-
             actual_positives = y.sum(dim=1)
             true_positives = correct.sum(dim=1)
             self._numerator += torch.sum(true_positives / (actual_positives + self.eps))
             self._denominator += y.size(0)
         elif self._average == "micro":
-
             self._denominator += y.sum()
             self._numerator += correct.sum()
         else:  # _average in [False, 'macro', 'weighted']
-
             self._denominator += y.sum(dim=0)
             self._numerator += correct.sum(dim=0)
 
diff --git a/ignite/metrics/running_average.py b/ignite/metrics/running_average.py
index 468838a9908c..9b3b4efb4f3f 100644
--- a/ignite/metrics/running_average.py
+++ b/ignite/metrics/running_average.py
@@ -1,10 +1,11 @@
-from typing import Callable, cast, Optional, Sequence, Union
+import warnings
+from typing import Any, Callable, cast, Optional, Union
 
 import torch
 
 import ignite.distributed as idist
 from ignite.engine import Engine, Events
-from ignite.metrics.metric import EpochWise, Metric, MetricUsage, reinit__is_reduced, sync_all_reduce
+from ignite.metrics.metric import Metric, MetricUsage, reinit__is_reduced, RunningBatchWise, SingleEpochRunningBatchWise
 
 __all__ = ["RunningAverage"]
 
@@ -18,8 +19,10 @@ class RunningAverage(Metric):
         alpha: running average decay factor, default 0.98
         output_transform: a function to use to transform the output if `src` is None and
             corresponds the output of process function. Otherwise it should be None.
-        epoch_bound: whether the running average should be reset after each epoch (defaults
-            to True).
+        epoch_bound: whether the running average should be reset after each epoch. It is deprecated in favor of
+            ``usage`` argument in :meth:`attach` method. Setting ``epoch_bound`` to ``True`` is equivalent to
+            ``usage=SingleEpochRunningBatchWise()`` and setting it to ``False`` is equivalent to
+            ``usage=RunningBatchWise()`` in the :meth:`attach` method. Default None.
         device: specifies which device updates are accumulated on. Should be
             None when ``src`` is an instance of :class:`~ignite.metrics.metric.Metric`, as the running average will
             use the ``src``'s device. Otherwise, defaults to CPU. Only applicable when the computed value
@@ -84,13 +87,14 @@ def log_running_avg_metrics():
     """
 
     required_output_keys = None
+    _state_dict_all_req_keys = ("_value", "src")
 
     def __init__(
         self,
         src: Optional[Metric] = None,
         alpha: float = 0.98,
         output_transform: Optional[Callable] = None,
-        epoch_bound: bool = True,
+        epoch_bound: Optional[bool] = None,
         device: Optional[Union[str, torch.device]] = None,
     ):
         if not (isinstance(src, Metric) or src is None):
@@ -101,11 +105,13 @@ def __init__(
         if isinstance(src, Metric):
             if output_transform is not None:
                 raise ValueError("Argument output_transform should be None if src is a Metric.")
+
+            def output_transform(x: Any) -> Any:
+                return x
+
             if device is not None:
                 raise ValueError("Argument device should be None if src is a Metric.")
-            self.src = src
-            self._get_src_value = self._get_metric_value
-            setattr(self, "iteration_completed", self._metric_iteration_completed)
+            self.src: Union[Metric, None] = src
             device = src._device
         else:
             if output_transform is None:
@@ -113,56 +119,105 @@ def __init__(
                     "Argument output_transform should not be None if src corresponds "
                     "to the output of process function."
                 )
-            self._get_src_value = self._get_output_value
-            setattr(self, "update", self._output_update)
+            self.src = None
             if device is None:
                 device = torch.device("cpu")
 
-        self.alpha = alpha
+        if epoch_bound is not None:
+            warnings.warn(
+                "`epoch_bound` is deprecated and will be removed in the future. Consider using `usage` argument of"
+                "`attach` method instead. `epoch_bound=True` is equivalent with `usage=SingleEpochRunningBatchWise()`"
+                " and `epoch_bound=False` is equivalent with `usage=RunningBatchWise()`."
+            )
         self.epoch_bound = epoch_bound
-        super(RunningAverage, self).__init__(output_transform=output_transform, device=device)  # type: ignore[arg-type]
+        self.alpha = alpha
+        super(RunningAverage, self).__init__(output_transform=output_transform, device=device)
 
     @reinit__is_reduced
     def reset(self) -> None:
         self._value: Optional[Union[float, torch.Tensor]] = None
+        if isinstance(self.src, Metric):
+            self.src.reset()
 
     @reinit__is_reduced
-    def update(self, output: Sequence) -> None:
-        # Implement abstract method
-        pass
+    def update(self, output: Union[torch.Tensor, float]) -> None:
+        if self.src is None:
+            output = output.detach().to(self._device, copy=True) if isinstance(output, torch.Tensor) else output
+            value = idist.all_reduce(output) / idist.get_world_size()
+        else:
+            value = self.src.compute()
+            self.src.reset()
 
-    def compute(self) -> Union[torch.Tensor, float]:
         if self._value is None:
-            self._value = self._get_src_value()
+            self._value = value
         else:
-            self._value = self._value * self.alpha + (1.0 - self.alpha) * self._get_src_value()
-
-        return self._value
+            self._value = self._value * self.alpha + (1.0 - self.alpha) * value
 
-    def attach(self, engine: Engine, name: str, _usage: Union[str, MetricUsage] = EpochWise()) -> None:
-        if self.epoch_bound:
-            # restart average every epoch
-            engine.add_event_handler(Events.EPOCH_STARTED, self.started)
-        # compute metric
-        engine.add_event_handler(Events.ITERATION_COMPLETED, self.iteration_completed)
-        # apply running average
-        engine.add_event_handler(Events.ITERATION_COMPLETED, self.completed, name)
-
-    def _get_metric_value(self) -> Union[torch.Tensor, float]:
-        return self.src.compute()
-
-    @sync_all_reduce("src")
-    def _get_output_value(self) -> Union[torch.Tensor, float]:
-        # we need to compute average instead of sum produced by @sync_all_reduce("src")
-        output = cast(Union[torch.Tensor, float], self.src) / idist.get_world_size()
-        return output
-
-    def _metric_iteration_completed(self, engine: Engine) -> None:
-        self.src.started(engine)
-        self.src.iteration_completed(engine)
-
-    @reinit__is_reduced
-    def _output_update(self, output: Union[torch.Tensor, float]) -> None:
-        if isinstance(output, torch.Tensor):
-            output = output.detach().to(self._device, copy=True)
-        self.src = output  # type: ignore[assignment]
+    def compute(self) -> Union[torch.Tensor, float]:
+        return cast(Union[torch.Tensor, float], self._value)
+
+    def attach(self, engine: Engine, name: str, usage: Union[str, MetricUsage] = RunningBatchWise()) -> None:
+        r"""
+        Attach the metric to the ``engine`` using the events determined by the ``usage``.
+
+        Args:
+            engine: the engine to get attached to.
+            name: the name under which the metric is inserted into the ``engine.state.metrics`` dictionary.
+            usage: the usage determining on which events the metric is reset, updated and computed. It should be an
+                instance of the :class:`~ignite.metrics.metric.MetricUsage`\ s in the following table.
+
+                ======================================================= ===========================================
+                ``usage`` **class**                                     **Description**
+                ======================================================= ===========================================
+                :class:`~.metrics.metric.RunningBatchWise`              Running average of the ``src`` metric or
+                                                                        ``engine.state.output`` is computed across
+                                                                        batches. In the former case, on each batch,
+                                                                        ``src`` is reset, updated and computed then
+                                                                        its value is retrieved. Default.
+                :class:`~.metrics.metric.SingleEpochRunningBatchWise`   Same as above but the running average is
+                                                                        computed across batches in an epoch so it
+                                                                        is reset at the end of the epoch.
+                :class:`~.metrics.metric.RunningEpochWise`              Running average of the ``src`` metric or
+                                                                        ``engine.state.output`` is computed across
+                                                                        epochs. In the former case, ``src`` works
+                                                                        as if it was attached in a
+                                                                        :class:`~ignite.metrics.metric.EpochWise`
+                                                                        manner and its computed value is retrieved
+                                                                        at the end of the epoch. The latter case
+                                                                        doesn't make much sense for this usage as
+                                                                        the ``engine.state.output`` of the last
+                                                                        batch is retrieved then.
+                ======================================================= ===========================================
+
+        ``RunningAverage`` retrieves ``engine.state.output`` at ``usage.ITERATION_COMPLETED`` if the ``src`` is not
+        given and it's computed and updated using ``src``, by manually calling its ``compute`` method, or
+        ``engine.state.output`` at ``usage.COMPLETED`` event.
+        Also if ``src`` is given, it is updated at ``usage.ITERATION_COMPLETED``, but its reset event is determined by
+        ``usage`` type. If ``isinstance(usage, BatchWise)`` holds true, ``src`` is reset on ``BatchWise().STARTED``,
+        otherwise on ``EpochWise().STARTED`` if ``isinstance(usage, EpochWise)``.
+
+        .. versionchanged:: 0.5.1
+            Added `usage` argument
+        """
+        usage = self._check_usage(usage)
+        if self.epoch_bound is not None:
+            usage = SingleEpochRunningBatchWise() if self.epoch_bound else RunningBatchWise()
+
+        if isinstance(self.src, Metric) and not engine.has_event_handler(
+            self.src.iteration_completed, Events.ITERATION_COMPLETED
+        ):
+            engine.add_event_handler(Events.ITERATION_COMPLETED, self.src.iteration_completed)
+
+        super().attach(engine, name, usage)
+
+    def detach(self, engine: Engine, usage: Union[str, MetricUsage] = RunningBatchWise()) -> None:
+        usage = self._check_usage(usage)
+        if self.epoch_bound is not None:
+            usage = SingleEpochRunningBatchWise() if self.epoch_bound else RunningBatchWise()
+
+        if isinstance(self.src, Metric) and engine.has_event_handler(
+            self.src.iteration_completed, Events.ITERATION_COMPLETED
+        ):
+            engine.remove_event_handler(self.src.iteration_completed, Events.ITERATION_COMPLETED)
+
+        super().detach(engine, usage)
diff --git a/ignite/metrics/ssim.py b/ignite/metrics/ssim.py
index 805024c0fd23..6824c0b3f374 100644
--- a/ignite/metrics/ssim.py
+++ b/ignite/metrics/ssim.py
@@ -1,4 +1,5 @@
-from typing import Callable, Sequence, Union
+import warnings
+from typing import Callable, Optional, Sequence, Union
 
 import torch
 import torch.nn.functional as F
@@ -11,9 +12,12 @@
 
 class SSIM(Metric):
     """
-    Computes Structual Similarity Index Measure
+    Computes Structural Similarity Index Measure
 
-    - ``update`` must receive output of the form ``(y_pred, y)``.
+    - ``update`` must receive output of the form ``(y_pred, y)``. They have to be of the same type.
+        Valid :class:`torch.dtype` are the following:
+        - on CPU: `torch.float32`, `torch.float64`.
+        - on CUDA: `torch.float16`, `torch.bfloat16`, `torch.float32`, `torch.float64`.
 
     Args:
         data_range: Range of the image. Typically, ``1.0`` or ``255``.
@@ -60,6 +64,8 @@ class SSIM(Metric):
     .. versionadded:: 0.4.2
     """
 
+    _state_dict_all_req_keys = ("_sum_of_ssim", "_num_examples", "_kernel")
+
     def __init__(
         self,
         data_range: Union[int, float],
@@ -93,27 +99,27 @@ def __init__(
 
         super(SSIM, self).__init__(output_transform=output_transform, device=device)
         self.gaussian = gaussian
+        self.data_range = data_range
         self.c1 = (k1 * data_range) ** 2
         self.c2 = (k2 * data_range) ** 2
         self.pad_h = (self.kernel_size[0] - 1) // 2
         self.pad_w = (self.kernel_size[1] - 1) // 2
-        self._kernel = self._gaussian_or_uniform_kernel(kernel_size=self.kernel_size, sigma=self.sigma)
+        self._kernel_2d = self._gaussian_or_uniform_kernel(kernel_size=self.kernel_size, sigma=self.sigma)
+        self._kernel: Optional[torch.Tensor] = None
 
     @reinit__is_reduced
     def reset(self) -> None:
         self._sum_of_ssim = torch.tensor(0.0, dtype=torch.float64, device=self._device)
         self._num_examples = 0
-        self._kernel = self._gaussian_or_uniform_kernel(kernel_size=self.kernel_size, sigma=self.sigma)
 
     def _uniform(self, kernel_size: int) -> torch.Tensor:
-        max, min = 2.5, -2.5
-        ksize_half = (kernel_size - 1) * 0.5
-        kernel = torch.linspace(-ksize_half, ksize_half, steps=kernel_size, device=self._device)
-        for i, j in enumerate(kernel):
-            if min <= j <= max:
-                kernel[i] = 1 / (max - min)
-            else:
-                kernel[i] = 0
+        kernel = torch.zeros(kernel_size)
+
+        start_uniform_index = max(kernel_size // 2 - 2, 0)
+        end_uniform_index = min(kernel_size // 2 + 3, kernel_size)
+
+        min_, max_ = -2.5, 2.5
+        kernel[start_uniform_index:end_uniform_index] = 1 / (max_ - min_)
 
         return kernel.unsqueeze(dim=0)  # (1, kernel_size)
 
@@ -152,15 +158,37 @@ def update(self, output: Sequence[torch.Tensor]) -> None:
                 f"Expected y_pred and y to have BxCxHxW shape. Got y_pred: {y_pred.shape} and y: {y.shape}."
             )
 
-        channel = y_pred.size(1)
-        if len(self._kernel.shape) < 4:
-            self._kernel = self._kernel.expand(channel, 1, -1, -1).to(device=y_pred.device)
+        # converts potential integer tensor to fp
+        if not y.is_floating_point():
+            y = y.float()
+        if not y_pred.is_floating_point():
+            y_pred = y_pred.float()
+
+        nb_channel = y_pred.size(1)
+        if self._kernel is None or self._kernel.shape[0] != nb_channel:
+            self._kernel = self._kernel_2d.expand(nb_channel, 1, -1, -1)
+
+        if y_pred.device != self._kernel.device:
+            if self._kernel.device == torch.device("cpu"):
+                self._kernel = self._kernel.to(device=y_pred.device)
+
+            elif y_pred.device == torch.device("cpu"):
+                warnings.warn(
+                    "y_pred tensor is on cpu device but previous computation was on another device: "
+                    f"{self._kernel.device}. To avoid having a performance hit, please ensure that all "
+                    "y and y_pred tensors are on the same device.",
+                )
+                y_pred = y_pred.to(device=self._kernel.device)
+                y = y.to(device=self._kernel.device)
 
         y_pred = F.pad(y_pred, [self.pad_w, self.pad_w, self.pad_h, self.pad_h], mode="reflect")
         y = F.pad(y, [self.pad_w, self.pad_w, self.pad_h, self.pad_h], mode="reflect")
 
+        if y_pred.dtype != self._kernel.dtype:
+            self._kernel = self._kernel.to(dtype=y_pred.dtype)
+
         input_list = [y_pred, y, y_pred * y_pred, y * y, y_pred * y]
-        outputs = F.conv2d(torch.cat(input_list), self._kernel, groups=channel)
+        outputs = F.conv2d(torch.cat(input_list), self._kernel, groups=nb_channel)
         batch_size = y_pred.size(0)
         output_list = [outputs[x * batch_size : (x + 1) * batch_size] for x in range(len(input_list))]
 
@@ -178,7 +206,7 @@ def update(self, output: Sequence[torch.Tensor]) -> None:
         b2 = sigma_pred_sq + sigma_target_sq + self.c2
 
         ssim_idx = (a1 * a2) / (b1 * b2)
-        self._sum_of_ssim += torch.mean(ssim_idx, (1, 2, 3), dtype=torch.float64).sum().to(self._device)
+        self._sum_of_ssim += torch.mean(ssim_idx, (1, 2, 3), dtype=torch.float64).sum().to(device=self._device)
 
         self._num_examples += y.shape[0]
 
diff --git a/ignite/metrics/top_k_categorical_accuracy.py b/ignite/metrics/top_k_categorical_accuracy.py
index 8f4515201976..87da4c868731 100644
--- a/ignite/metrics/top_k_categorical_accuracy.py
+++ b/ignite/metrics/top_k_categorical_accuracy.py
@@ -73,6 +73,8 @@ def one_hot_to_binary_output_transform(output):
             0.75
     """
 
+    _state_dict_all_req_keys = ("_num_correct", "_num_examples")
+
     def __init__(
         self,
         k: int = 5,
diff --git a/ignite/utils.py b/ignite/utils.py
index fffe2c1b3a9a..078e16663f68 100644
--- a/ignite/utils.py
+++ b/ignite/utils.py
@@ -6,7 +6,7 @@
 import shutil
 import warnings
 from pathlib import Path
-from typing import Any, Callable, cast, Dict, Optional, TextIO, Tuple, Type, TypeVar, Union
+from typing import Any, Callable, cast, Dict, List, Optional, TextIO, Tuple, Type, TypeVar, Union
 
 import torch
 
@@ -78,6 +78,66 @@ def apply_to_type(
     raise TypeError((f"x must contain {input_type}, dicts or lists; found {type(x)}"))
 
 
+def _tree_map(
+    func: Callable, x: Union[Any, collections.Sequence, collections.Mapping], key: Optional[Union[int, str]] = None
+) -> Union[Any, collections.Sequence, collections.Mapping]:
+    if isinstance(x, collections.Mapping):  # rebuild the mapping; every value is mapped with its own key
+        return cast(Callable, type(x))({k: _tree_map(func, sample, key=k) for k, sample in x.items()})
+    if isinstance(x, tuple) and hasattr(x, "_fields"):  # namedtuple: fields are mapped without a key
+        return cast(Callable, type(x))(*(_tree_map(func, sample) for sample in x))
+    if isinstance(x, collections.Sequence) and not isinstance(x, (str, bytes)):  # str/bytes are leaves
+        return cast(Callable, type(x))([_tree_map(func, sample, key=i) for i, sample in enumerate(x)])
+    return func(x, key=key)  # leaf: called with the key/index it has in its parent (None if it has none)
+
+
+class _CollectionItem:
+    """Handle on the slot ``collection[key]`` of a dict or list, used to read or replace that value in place."""
+
+    types_as_collection_item: Tuple = (int, float, torch.Tensor)
+
+    def __init__(self, collection: Union[Dict, List], key: Union[int, str]) -> None:
+        if not isinstance(collection, (dict, list)):
+            raise TypeError(
+                f"Input type is expected to be a mapping or list, but got {type(collection)} for input key '{key}'."
+            )
+        if isinstance(collection, list) and isinstance(key, str):
+            raise ValueError("Key should be int for collection of type list")
+        self.collection = collection
+        self.key = key
+
+    def load_value(self, value: Any) -> None:
+        self.collection[self.key] = value  # type: ignore[index]
+
+    def value(self) -> Any:
+        return self.collection[self.key]  # type: ignore[index]
+
+    @staticmethod
+    def wrap(object: Union[Dict, List], key: Union[int, str], value: Any) -> Union[Any, "_CollectionItem"]:
+        # Leaf values (None, int, float, tensor) get a handle on their slot; anything else is returned unchanged.
+        if value is None or isinstance(value, _CollectionItem.types_as_collection_item):
+            return _CollectionItem(object, key)
+        return value
+
+
+def _tree_apply2(
+    func: Callable, x: Union[Any, List, Dict], y: Union[Any, collections.Sequence, collections.Mapping]
+) -> None:
+    """Descend ``x`` and ``y`` together; call ``func(x, y)`` on pairs that are not dict/Mapping or list/Sequence."""
+    if isinstance(x, dict) and isinstance(y, collections.Mapping):
+        for name, x_value in x.items():
+            if name not in y:
+                raise ValueError(f"Key '{name}' from x is not found in y: {y.keys()}")
+            _tree_apply2(func, _CollectionItem.wrap(x, name, x_value), y[name])
+        return
+    if isinstance(x, list) and isinstance(y, collections.Sequence):
+        if len(x) != len(y):
+            raise ValueError(f"Size of y: {len(y)} does not match the size of x: '{len(x)}'")
+        for index, (x_item, y_item) in enumerate(zip(x, y)):
+            _tree_apply2(func, _CollectionItem.wrap(x, index, x_item), y_item)
+        return
+    return func(x, y)
+
+
 def to_onehot(indices: torch.Tensor, num_classes: int) -> torch.Tensor:
     """Convert a tensor of indices of any shape `(N, ...)` to a
     tensor of one-hot indicators of shape `(N, num_classes, ...)` and of type uint8. Output's device is equal to the
@@ -182,13 +242,11 @@ def setup_logger(
 
     # Remove previous handlers
     if distributed_rank > 0 or reset:
-
         if logger.hasHandlers():
             for h in list(logger.handlers):
                 logger.removeHandler(h)
 
     if distributed_rank > 0:
-
         # Add null handler to avoid multiple parallel messages
         logger.addHandler(logging.NullHandler())
 
@@ -254,7 +312,6 @@ def manual_seed(seed: int) -> None:
 def deprecated(
     deprecated_in: str, removed_in: str = "", reasons: Tuple[str, ...] = (), raise_exception: bool = False
 ) -> Callable:
-
     F = TypeVar("F", bound=Callable[..., Any])
 
     def decorator(func: F) -> F:
diff --git a/mypy.ini b/mypy.ini
index 489b3a3fd28c..bf91c5787738 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -77,3 +77,6 @@ ignore_missing_imports = True
 
 [mypy-torchvision.*]
 ignore_missing_imports = True
+
+[mypy-ignite.contrib.handlers.custom_events]
+ignore_errors = True
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 182a4057bc17..cc5db446522b 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -12,12 +12,8 @@ tqdm
 scikit-learn
 matplotlib
 tensorboardX
-visdom==0.2.3
-# temporary fix for
-# ImportError: cannot import name 'soft_unicode' from 'markupsafe'
-markupsafe==2.0.1
+visdom
 polyaxon
-polyaxon-client
 wandb
 mlflow
 neptune-client>=0.16.17
@@ -31,4 +27,4 @@ nltk
 # Examples dependencies
 pandas
 gymnasium
-mkl
+mkl;platform_machine=="x86_64"
diff --git a/setup.cfg b/setup.cfg
index 73947364b717..9a6e4158fa7b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [metadata]
-license_file = LICENSE
+license_files = LICENSE
 
 [pycodestyle]
 exclude = .eggs,*.egg,build,docs/*,.git,versioneer.py,*/conf.py
diff --git a/setup.py b/setup.py
index 15fdb830a875..29b02c6fac6a 100644
--- a/setup.py
+++ b/setup.py
@@ -30,8 +30,8 @@ def find_version(*file_paths):
     # Metadata
     name="pytorch-ignite",
     version=VERSION,
-    author="PyTorch Core Team",
-    author_email="soumith@pytorch.org",
+    author="PyTorch-Ignite Team",
+    author_email="contact@pytorch-ignite.ai",
     url="https://github.com/pytorch/ignite",
     description="A lightweight library to help with training neural networks in PyTorch.",
     long_description_content_type="text/markdown",
diff --git a/tests/ignite/base/test_mixins.py b/tests/ignite/base/test_mixins.py
index a929e4c7eb89..0f3a39811fbb 100644
--- a/tests/ignite/base/test_mixins.py
+++ b/tests/ignite/base/test_mixins.py
@@ -10,6 +10,5 @@ def test_state_dict():
 
 
 def test_load_state_dict():
-
     s = Serializable()
     s.load_state_dict({})
diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py
index 978dfade2fbf..caf92e6e7ad2 100644
--- a/tests/ignite/conftest.py
+++ b/tests/ignite/conftest.py
@@ -83,7 +83,6 @@ def local_rank(worker_id):
 
 @pytest.fixture(scope="module")
 def world_size():
-
     remove_env_var = False
 
     if "WORLD_SIZE" not in os.environ:
@@ -98,14 +97,12 @@ def world_size():
 
 @pytest.fixture()
 def clean_env():
-
     for k in ["RANK", "LOCAL_RANK", "WORLD_SIZE"]:
         if k in os.environ:
             del os.environ[k]
 
 
 def _create_dist_context(dist_info, lrank):
-
     dist.init_process_group(**dist_info)
     dist.barrier()
     if torch.cuda.is_available():
@@ -115,7 +112,6 @@ def _create_dist_context(dist_info, lrank):
 
 
 def _destroy_dist_context():
-
     if dist.get_rank() == 0:
         # To support Python 3.7; Otherwise we could do `.unlink(missing_ok=True)`
         try:
@@ -145,7 +141,6 @@ def _find_free_port():
 
 
 def _setup_free_port(local_rank):
-
     port_file = "/tmp/free_port"
 
     if local_rank == 0:
@@ -169,7 +164,6 @@ def _setup_free_port(local_rank):
 
 @pytest.fixture()
 def distributed_context_single_node_nccl(local_rank, world_size):
-
     free_port = _setup_free_port(local_rank)
 
     dist_info = {
@@ -184,7 +178,6 @@ def distributed_context_single_node_nccl(local_rank, world_size):
 
 @pytest.fixture()
 def distributed_context_single_node_gloo(local_rank, world_size):
-
     from datetime import timedelta
 
     if sys.platform.startswith("win"):
@@ -212,7 +205,6 @@ def distributed_context_single_node_gloo(local_rank, world_size):
 
 @pytest.fixture()
 def multi_node_conf(local_rank):
-
     assert "node_id" in os.environ
     assert "nnodes" in os.environ
     assert "nproc_per_node" in os.environ
@@ -229,7 +221,6 @@ def multi_node_conf(local_rank):
 
 
 def _create_mnodes_dist_context(dist_info, mnodes_conf):
-
     dist.init_process_group(**dist_info)
     dist.barrier()
     if torch.cuda.is_available():
@@ -249,7 +240,6 @@ def _destroy_mnodes_dist_context():
 
 @pytest.fixture()
 def distributed_context_multi_node_gloo(multi_node_conf):
-
     assert "MASTER_ADDR" in os.environ
     assert "MASTER_PORT" in os.environ
 
@@ -265,7 +255,6 @@ def distributed_context_multi_node_gloo(multi_node_conf):
 
 @pytest.fixture()
 def distributed_context_multi_node_nccl(multi_node_conf):
-
     assert "MASTER_ADDR" in os.environ
     assert "MASTER_PORT" in os.environ
 
@@ -289,7 +278,6 @@ def _xla_template_worker_task(index, fn, args):
 
 
 def _xla_execute(fn, args, nprocs):
-
     import torch_xla.distributed.xla_multiprocessing as xmp
 
     spawn_kwargs = {}
@@ -409,6 +397,7 @@ def gloo_hvd_executor():
             ],
         ),
     ],
+    scope="class",
 )
 def distributed(request, local_rank, world_size):
     if request.param in ("nccl", "gloo_cpu", "gloo"):
diff --git a/tests/ignite/contrib/conftest.py b/tests/ignite/contrib/conftest.py
index 0f63292239e5..9c9b15d8699e 100644
--- a/tests/ignite/contrib/conftest.py
+++ b/tests/ignite/contrib/conftest.py
@@ -25,7 +25,6 @@ def no_site_packages(request):
 
 @pytest.fixture()
 def visdom_offline_logfile(dirname):
-
     log_file = dirname / "logs.visdom"
     yield log_file
 
@@ -41,7 +40,6 @@ def visdom_server():
     global vd_hostname, vd_port, vd_server_process
 
     if vd_server_process is None:
-
         import subprocess
         import time
 
@@ -73,7 +71,6 @@ def visdom_server():
 
 @pytest.fixture()
 def visdom_server_stop():
-
     yield None
 
     import time
diff --git a/tests/ignite/contrib/engines/test_common.py b/tests/ignite/contrib/engines/test_common.py
index 4749d5db1086..d25cb33caceb 100644
--- a/tests/ignite/contrib/engines/test_common.py
+++ b/tests/ignite/contrib/engines/test_common.py
@@ -164,6 +164,9 @@ def test_asserts_setup_common_training_handlers():
         )
         trainer.run([1])
 
+    with pytest.warns(UserWarning, match=r"Argument device is unused and deprecated"):
+        setup_common_training_handlers(trainer, device="cpu")
+
 
 def test_no_warning_with_train_sampler(recwarn):
     from torch.utils.data import RandomSampler
diff --git a/tests/ignite/contrib/handlers/test_base_logger.py b/tests/ignite/contrib/handlers/test_base_logger.py
index 8ec6b832f26b..1e63e490da2e 100644
--- a/tests/ignite/contrib/handlers/test_base_logger.py
+++ b/tests/ignite/contrib/handlers/test_base_logger.py
@@ -1,9 +1,11 @@
+import math
 from typing import Any, Union
 from unittest.mock import call, MagicMock
 
 import pytest
 import torch
 
+from ignite.contrib.handlers import CustomPeriodicEvent
 from ignite.contrib.handlers.base_logger import (
     BaseLogger,
     BaseOptimizerParamsHandler,
@@ -49,7 +51,6 @@ def __call__(self, engine: Engine, logger: Any, event_name: Union[str, Events])
 
 
 def test_base_output_handler_wrong_setup():
-
     with pytest.raises(TypeError, match="metric_names should be either a list or equal 'all'"):
         DummyOutputHandler("tag", metric_names="abc", output_transform=None)
 
@@ -67,7 +68,6 @@ def test_base_output_handler_wrong_setup():
 
 
 def test_base_output_handler_setup_output_metrics():
-
     engine = Engine(lambda engine, batch: None)
     true_metrics = {"a": 0, "b": 1}
     engine.state = State(metrics=true_metrics)
@@ -183,7 +183,6 @@ def test_opt_params_handler_on_non_torch_optimizers():
     ],
 )
 def test_attach(event, n_calls, kwargs):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -218,7 +217,6 @@ def update_fn(engine, batch):
 
 
 def test_attach_wrong_event_name():
-
     trainer = Engine(lambda b, e: None)
     logger = DummyLogger()
     mock_log_handler = MagicMock()
@@ -238,7 +236,6 @@ def test_attach_on_custom_event():
     data = list(range(150))
 
     def _test(event, n_calls, cpe):
-
         losses = torch.rand(n_epochs * len(data))
         losses_iter = iter(losses)
 
@@ -259,6 +256,33 @@ def update_fn(engine, batch):
         mock_log_handler.assert_called_with(trainer, logger, event)
         assert mock_log_handler.call_count == n_calls
 
+    with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"):
+        n_iterations = 10
+        cpe1 = CustomPeriodicEvent(n_iterations=n_iterations)
+        n = len(data) * n_epochs / n_iterations
+        nf = math.floor(n)
+        ns = nf + 1 if nf < n else nf
+        _test(cpe1.Events.ITERATIONS_10_STARTED, ns, cpe1)
+        _test(cpe1.Events.ITERATIONS_10_COMPLETED, nf, cpe1)
+
+    with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"):
+        n_iterations = 15
+        cpe2 = CustomPeriodicEvent(n_iterations=n_iterations)
+        n = len(data) * n_epochs / n_iterations
+        nf = math.floor(n)
+        ns = nf + 1 if nf < n else nf
+        _test(cpe2.Events.ITERATIONS_15_STARTED, ns, cpe2)
+        _test(cpe2.Events.ITERATIONS_15_COMPLETED, nf, cpe2)
+
+    with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"):
+        n_custom_epochs = 2
+        cpe3 = CustomPeriodicEvent(n_epochs=n_custom_epochs)
+        n = n_epochs / n_custom_epochs
+        nf = math.floor(n)
+        ns = nf + 1 if nf < n else nf
+        _test(cpe3.Events.EPOCHS_2_STARTED, ns, cpe3)
+        _test(cpe3.Events.EPOCHS_2_COMPLETED, nf, cpe3)
+
 
 @pytest.mark.parametrize(
     "event, n_calls",
@@ -273,7 +297,6 @@ def update_fn(engine, batch):
     ],
 )
 def test_as_context_manager(event, n_calls):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -313,13 +336,11 @@ def update_fn(engine, batch):
 
 
 def test_base_weights_handler_wrong_setup():
-
     with pytest.raises(TypeError, match="Argument model should be of type torch.nn.Module"):
         DummyWeightsHandler(None)
 
 
 def test_base_weights_scalar_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     with pytest.raises(TypeError, match="Argument reduction should be callable"):
         DummyWeightsScalarHandler(model, reduction=123)
diff --git a/tests/ignite/contrib/handlers/test_clearml_logger.py b/tests/ignite/contrib/handlers/test_clearml_logger.py
index 2e4968f40fa5..9f29d2ba8ebb 100644
--- a/tests/ignite/contrib/handlers/test_clearml_logger.py
+++ b/tests/ignite/contrib/handlers/test_clearml_logger.py
@@ -44,7 +44,6 @@ def test_no_clearml():
 
 
 def test_optimizer_params_handler_wrong_setup():
-
     with pytest.raises(TypeError):
         OptimizerParamsHandler(optimizer=None)
 
@@ -58,7 +57,6 @@ def test_optimizer_params_handler_wrong_setup():
 
 
 def test_optimizer_params():
-
     optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01)
     wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr")
     mock_logger = MagicMock(spec=ClearMLLogger)
@@ -81,7 +79,6 @@ def test_optimizer_params():
 
 
 def test_output_handler_with_wrong_logger_type():
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
 
     mock_logger = MagicMock()
@@ -91,7 +88,6 @@ def test_output_handler_with_wrong_logger_type():
 
 
 def test_output_handler_output_transform(dirname):
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
     mock_logger = MagicMock(spec=ClearMLLogger)
     mock_logger.clearml_logger = MagicMock()
@@ -118,7 +114,6 @@ def test_output_handler_output_transform(dirname):
 
 
 def test_output_handler_metric_names(dirname):
-
     wrapper = OutputHandler("tag", metric_names=["a", "b"])
     mock_logger = MagicMock(spec=ClearMLLogger)
     mock_logger.clearml_logger = MagicMock()
@@ -216,7 +211,6 @@ def test_output_handler_metric_names(dirname):
 
 
 def test_output_handler_both(dirname):
-
     wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x})
     mock_logger = MagicMock(spec=ClearMLLogger)
     mock_logger.clearml_logger = MagicMock()
@@ -257,7 +251,6 @@ def global_step_transform(*args, **kwargs):
 
 
 def test_output_handler_with_global_step_from_engine():
-
     mock_another_engine = MagicMock()
     mock_another_engine.state = State()
     mock_another_engine.state.epoch = 10
@@ -340,7 +333,6 @@ def global_step_transform(*args, **kwargs):
 
 
 def test_weights_scalar_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     wrapper = WeightsScalarHandler(model)
     mock_logger = MagicMock()
@@ -350,7 +342,6 @@ def test_weights_scalar_handler_wrong_setup():
 
 
 def test_weights_scalar_handler(dummy_model_factory):
-
     model = dummy_model_factory(with_grads=True, with_frozen_layer=False)
 
     # define test wrapper to test with and without optional tag
@@ -429,7 +420,6 @@ def weight_selector(n, _):
 
 
 def test_weights_hist_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     wrapper = WeightsHistHandler(model)
     mock_logger = MagicMock()
@@ -439,7 +429,6 @@ def test_weights_hist_handler_wrong_setup():
 
 
 def test_weights_hist_handler(dummy_model_factory):
-
     model = dummy_model_factory(with_grads=True, with_frozen_layer=False)
 
     # define test wrapper to test with and without optional tag
@@ -518,7 +507,6 @@ def weight_selector(n, _):
 
 
 def test_grads_scalar_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     wrapper = GradsScalarHandler(model)
     mock_logger = MagicMock()
@@ -612,7 +600,6 @@ def weight_selector(n, _):
 
 
 def test_grads_hist_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     wrapper = GradsHistHandler(model)
     mock_logger = MagicMock()
@@ -700,7 +687,6 @@ def weight_selector(n, _):
 
 
 def test_integration(dirname):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -728,7 +714,6 @@ def dummy_handler(engine, logger, event_name):
 
 
 def test_integration_as_context_manager(dirname):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -741,7 +726,6 @@ def update_fn(engine, batch):
     with pytest.warns(UserWarning, match="ClearMLSaver: running in bypass mode"):
         ClearMLLogger.set_bypass_mode(True)
         with ClearMLLogger(output_uri=dirname) as clearml_logger:
-
             trainer = Engine(update_fn)
 
             def dummy_handler(engine, logger, event_name):
@@ -755,7 +739,6 @@ def dummy_handler(engine, logger, event_name):
 
 
 def test_clearml_logger_getattr_method(dirname):
-
     with pytest.warns(UserWarning, match="ClearMLSaver: running in bypass mode"):
         ClearMLLogger.set_bypass_mode(True)
 
@@ -777,7 +760,6 @@ def test_clearml_logger_getattr_method(dirname):
 
 
 def test_clearml_logger_get_task_bypass(dirname):
-
     with pytest.warns(UserWarning, match="ClearMLSaver: running in bypass mode"):
         ClearMLLogger.set_bypass_mode(True)
 
@@ -873,7 +855,6 @@ def test_clearml_saver_callbacks():
     n_saved = 2
 
     for i, (filename, metadata) in enumerate(zip(filenames, metadata_list)):
-
         mock_model_info.upload_filename = filename
 
         if i >= n_saved:
@@ -927,7 +908,6 @@ def forward(self, x):
 
 
 def _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=False):
-
     if idist.get_rank() == 0:
         clearml.Task.current_task = MagicMock(spec=clearml.Task)
         clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock()
@@ -1016,7 +996,6 @@ def update_fn(engine, batch):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_save_model_optimizer_lr_scheduler_with_state_dict(device)
     _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=True)
@@ -1026,7 +1005,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_save_model_optimizer_lr_scheduler_with_state_dict(device)
     _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=True)
diff --git a/tests/ignite/contrib/handlers/test_custom_events.py b/tests/ignite/contrib/handlers/test_custom_events.py
new file mode 100644
index 000000000000..90decc1e2983
--- /dev/null
+++ b/tests/ignite/contrib/handlers/test_custom_events.py
@@ -0,0 +1,132 @@
+import math
+
+import pytest
+
+from ignite.contrib.handlers.custom_events import CustomPeriodicEvent
+from ignite.engine import Engine
+
+
+def test_bad_input():
+    """Invalid, missing or conflicting ``n_iterations`` / ``n_epochs`` arguments must raise the expected error."""
+    # (expected exception, expected message pattern, constructor keyword arguments)
+    invalid_cases = [
+        (TypeError, "Argument n_iterations should be an integer", {"n_iterations": "a"}),
+        (ValueError, "Argument n_iterations should be positive", {"n_iterations": 0}),
+        (TypeError, "Argument n_iterations should be an integer", {"n_iterations": 10.0}),
+        (TypeError, "Argument n_epochs should be an integer", {"n_epochs": "a"}),
+        (ValueError, "Argument n_epochs should be positive", {"n_epochs": 0}),
+        (TypeError, "Argument n_epochs should be an integer", {"n_epochs": 10.0}),
+        (ValueError, "Either n_iterations or n_epochs should be defined", {}),
+        (ValueError, "Either n_iterations or n_epochs should be defined", {"n_iterations": 1, "n_epochs": 2}),
+    ]
+
+    with pytest.warns(DeprecationWarning, match=r"CustomPeriodicEvent is deprecated"):
+        for exc_type, message, kwargs in invalid_cases:
+            with pytest.raises(exc_type, match=message):
+                CustomPeriodicEvent(**kwargs)
+
+
+def test_new_events():
+    """Attaching must expose the generated events on ``cpe.Events`` and register them on the engine."""
+    engine = Engine(lambda *args, **kwargs: None)
+
+    with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"):
+        iter_cpe = CustomPeriodicEvent(n_iterations=5)
+        iter_cpe.attach(engine)
+
+        assert hasattr(iter_cpe, "Events")
+        assert hasattr(iter_cpe.Events, "ITERATIONS_5_STARTED")
+        assert hasattr(iter_cpe.Events, "ITERATIONS_5_COMPLETED")
+
+        # The freshly attached pair is expected at the tail of the engine's allowed events.
+        assert engine._allowed_events[-2] == iter_cpe.Events.ITERATIONS_5_STARTED
+        assert engine._allowed_events[-1] == iter_cpe.Events.ITERATIONS_5_COMPLETED
+
+    with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"):
+        epoch_cpe = CustomPeriodicEvent(n_epochs=5)
+        epoch_cpe.attach(engine)
+
+        assert hasattr(epoch_cpe, "Events")
+        assert hasattr(epoch_cpe.Events, "EPOCHS_5_STARTED")
+        assert hasattr(epoch_cpe.Events, "EPOCHS_5_COMPLETED")
+
+        assert engine._allowed_events[-2] == epoch_cpe.Events.EPOCHS_5_STARTED
+        assert engine._allowed_events[-1] == epoch_cpe.Events.EPOCHS_5_COMPLETED
+
+
+def test_integration_iterations():
+    # For several period / epoch-length combinations, check how often the periodic events fire and what they see.
+    def _test(n_iterations, max_epochs, n_iters_per_epoch):
+        engine = Engine(lambda *args, **kwargs: None)
+        with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"):
+            cpe = CustomPeriodicEvent(n_iterations=n_iterations)
+            cpe.attach(engine)
+        data = list(range(n_iters_per_epoch))
+
+        # One-element lists so that the nested handlers below can update the counters in place.
+        custom_period = [0]
+        n_calls_iter_started = [0]
+        n_calls_iter_completed = [0]
+
+        event_started = getattr(cpe.Events, f"ITERATIONS_{n_iterations}_STARTED")
+        event_completed = getattr(cpe.Events, f"ITERATIONS_{n_iterations}_COMPLETED")
+
+        @engine.on(event_started)
+        def on_my_event_started(engine):
+            assert (engine.state.iteration - 1) % n_iterations == 0
+            custom_period[0] += 1
+            custom_iter = getattr(engine.state, f"iterations_{n_iterations}")
+            assert custom_iter == custom_period[0]
+            n_calls_iter_started[0] += 1
+
+        @engine.on(event_completed)
+        def on_my_event_ended(engine):
+            assert engine.state.iteration % n_iterations == 0
+            custom_iter = getattr(engine.state, f"iterations_{n_iterations}")
+            assert custom_iter == custom_period[0]
+            n_calls_iter_completed[0] += 1
+
+        engine.run(data, max_epochs=max_epochs)
+
+        n = len(data) * max_epochs / n_iterations
+        nf = math.floor(n)
+        assert custom_period[0] == n_calls_iter_started[0]
+        # A trailing partial period fires STARTED once more than COMPLETED. Keep the parentheses:
+        # `a == b if c else d` parses as `(a == b) if c else d`, which reduces the else-branch to `assert nf`.
+        assert n_calls_iter_started[0] == (nf + 1 if nf < n else nf)
+        assert n_calls_iter_completed[0] == nf
+
+    _test(3, 5, 16)
+    _test(4, 5, 16)
+    _test(5, 5, 16)
+    _test(300, 50, 1000)
+
+
+def test_integration_epochs():
+    """Over a 10-epoch run, check ``state.epochs_3`` whenever a 3-epoch period starts or completes."""
+    n_epochs = 3
+    engine = Engine(lambda *args, **kwargs: None)
+    with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"):
+        cpe = CustomPeriodicEvent(n_epochs=n_epochs)
+        cpe.attach(engine)
+    data = list(range(16))
+
+    # Value that `engine.state.epochs_3` is expected to hold; advanced whenever a period completes.
+    expected_period = 1
+
+    @engine.on(cpe.Events.EPOCHS_3_STARTED)
+    def check_period_started(engine):
+        assert (engine.state.epoch - 1) % n_epochs == 0
+        assert engine.state.epochs_3 == expected_period
+
+    @engine.on(cpe.Events.EPOCHS_3_COMPLETED)
+    def check_period_ended(engine):
+        nonlocal expected_period
+        assert engine.state.epoch % n_epochs == 0
+        assert engine.state.epochs_3 == expected_period
+        expected_period += 1
+
+    engine.run(data, max_epochs=10)
+
+    # Periods complete after epochs 3, 6 and 9: three increments on top of the initial 1.
+    assert expected_period == 4
diff --git a/tests/ignite/contrib/handlers/test_mlflow_logger.py b/tests/ignite/contrib/handlers/test_mlflow_logger.py
index 4b8966881673..04bed3e7b912 100644
--- a/tests/ignite/contrib/handlers/test_mlflow_logger.py
+++ b/tests/ignite/contrib/handlers/test_mlflow_logger.py
@@ -14,7 +14,6 @@
 
 
 def test_output_handler_with_wrong_logger_type():
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
 
     mock_logger = MagicMock()
@@ -24,7 +23,6 @@ def test_output_handler_with_wrong_logger_type():
 
 
 def test_output_handler_output_transform():
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
     mock_logger = MagicMock(spec=MLflowLogger)
     mock_logger.log_metrics = MagicMock()
@@ -47,7 +45,6 @@ def test_output_handler_output_transform():
 
 
 def test_output_handler_metric_names():
-
     wrapper = OutputHandler("tag", metric_names=["a", "b", "c"])
     mock_logger = MagicMock(spec=MLflowLogger)
     mock_logger.log_metrics = MagicMock()
@@ -94,7 +91,6 @@ def test_output_handler_metric_names():
 
 
 def test_output_handler_both():
-
     wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x})
     mock_logger = MagicMock(spec=MLflowLogger)
     mock_logger.log_metrics = MagicMock()
@@ -145,7 +141,6 @@ def global_step_transform(*args, **kwargs):
 
 
 def test_output_handler_with_global_step_from_engine():
-
     mock_another_engine = MagicMock()
     mock_another_engine.state = State()
     mock_another_engine.state.epoch = 10
@@ -201,7 +196,6 @@ def test_output_handler_state_attrs():
 
 
 def test_optimizer_params_handler_wrong_setup():
-
     with pytest.raises(TypeError):
         OptimizerParamsHandler(optimizer=None)
 
@@ -215,7 +209,6 @@ def test_optimizer_params_handler_wrong_setup():
 
 
 def test_optimizer_params():
-
     optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01)
     wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr")
     mock_logger = MagicMock(spec=MLflowLogger)
@@ -237,7 +230,6 @@ def test_optimizer_params():
 
 @pytest.mark.skipif(sys.platform.startswith("win"), reason="Skip on Windows")
 def test_integration(dirname):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -279,7 +271,6 @@ def dummy_handler(engine, logger, event_name):
 
 @pytest.mark.skipif(sys.platform.startswith("win"), reason="Skip on Windows")
 def test_integration_as_context_manager(dirname):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -292,7 +283,6 @@ def update_fn(engine, batch):
     true_values = []
 
     with MLflowLogger(str(dirname / "mlruns")) as mlflow_logger:
-
         trainer = Engine(update_fn)
 
         def dummy_handler(engine, logger, event_name):
@@ -324,7 +314,6 @@ def test_mlflow_bad_metric_name_handling(dirname):
 
     true_values = [123.0, 23.4, 333.4]
     with MLflowLogger(str(dirname / "mlruns")) as mlflow_logger:
-
         active_run = mlflow.active_run()
 
         handler = OutputHandler(tag="training", metric_names="all")
@@ -332,7 +321,6 @@ def test_mlflow_bad_metric_name_handling(dirname):
         engine.state = State(metrics={"metric:0 in %": 123.0, "metric 0": 1000.0})
 
         with pytest.warns(UserWarning, match=r"MLflowLogger output_handler encountered an invalid metric name"):
-
             engine.state.epoch = 1
             handler(engine, mlflow_logger, event_name=Events.EPOCH_COMPLETED)
 
@@ -352,6 +340,5 @@ def test_mlflow_bad_metric_name_handling(dirname):
 
 @pytest.mark.parametrize("no_site_packages", ["mlflow"], indirect=True)
 def test_no_mlflow_client(no_site_packages):
-
     with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires mlflow to be installed."):
         MLflowLogger()
diff --git a/tests/ignite/contrib/handlers/test_neptune_logger.py b/tests/ignite/contrib/handlers/test_neptune_logger.py
index 4a428b14eff0..84d91c75577e 100644
--- a/tests/ignite/contrib/handlers/test_neptune_logger.py
+++ b/tests/ignite/contrib/handlers/test_neptune_logger.py
@@ -488,7 +488,6 @@ def dummy_handler(engine, logger, event_name):
 
 
 def test_neptune_saver_serializable(dirname):
-
     mock_logger = MagicMock(spec=NeptuneLogger)
     mock_logger.upload = MagicMock()
     model = torch.nn.Module()
@@ -503,7 +502,6 @@ def test_neptune_saver_serializable(dirname):
 
 @pytest.mark.parametrize("model, serializable", [(lambda x: x, False), (torch.nn.Module().to("cpu"), True)])
 def test_neptune_saver(model, serializable):
-
     mock_logger = MagicMock(spec=NeptuneLogger)
     mock_logger.upload = MagicMock()
 
diff --git a/tests/ignite/contrib/handlers/test_polyaxon_logger.py b/tests/ignite/contrib/handlers/test_polyaxon_logger.py
index 940a3c838d96..1d025da036ca 100644
--- a/tests/ignite/contrib/handlers/test_polyaxon_logger.py
+++ b/tests/ignite/contrib/handlers/test_polyaxon_logger.py
@@ -16,7 +16,6 @@
 
 
 def test_output_handler_with_wrong_logger_type():
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
 
     mock_logger = MagicMock()
@@ -26,7 +25,6 @@ def test_output_handler_with_wrong_logger_type():
 
 
 def test_output_handler_output_transform():
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
     mock_logger = MagicMock(spec=PolyaxonLogger)
     mock_logger.log_metrics = MagicMock()
@@ -49,7 +47,6 @@ def test_output_handler_output_transform():
 
 
 def test_output_handler_metric_names():
-
     wrapper = OutputHandler("tag", metric_names=["a", "b", "c"])
     mock_logger = MagicMock(spec=PolyaxonLogger)
     mock_logger.log_metrics = MagicMock()
@@ -110,7 +107,6 @@ def test_output_handler_metric_names():
 
 
 def test_output_handler_both():
-
     wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x})
     mock_logger = MagicMock(spec=PolyaxonLogger)
     mock_logger.log_metrics = MagicMock()
@@ -161,7 +157,6 @@ def global_step_transform(*args, **kwargs):
 
 
 def test_output_handler_with_global_step_from_engine():
-
     mock_another_engine = MagicMock()
     mock_another_engine.state = State()
     mock_another_engine.state.epoch = 10
@@ -217,7 +212,6 @@ def test_output_handler_state_attrs():
 
 
 def test_optimizer_params_handler_wrong_setup():
-
     with pytest.raises(TypeError):
         OptimizerParamsHandler(optimizer=None)
 
@@ -231,7 +225,6 @@ def test_optimizer_params_handler_wrong_setup():
 
 
 def test_optimizer_params():
-
     optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01)
     wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr")
     mock_logger = MagicMock(spec=PolyaxonLogger)
@@ -252,7 +245,6 @@ def test_optimizer_params():
 
 
 def test_integration():
-
     n_epochs = 5
     data = list(range(50))
 
@@ -277,7 +269,6 @@ def dummy_handler(engine, logger, event_name):
 
 
 def test_integration_as_context_manager():
-
     n_epochs = 5
     data = list(range(50))
 
@@ -288,7 +279,6 @@ def update_fn(engine, batch):
         return next(losses_iter)
 
     with PolyaxonLogger() as plx_logger:
-
         trainer = Engine(update_fn)
 
         def dummy_handler(engine, logger, event_name):
@@ -302,6 +292,5 @@ def dummy_handler(engine, logger, event_name):
 
 @pytest.mark.parametrize("no_site_packages", ["polyaxon"], indirect=True)
 def test_no_polyaxon_client(no_site_packages):
-
     with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires polyaxon"):
         PolyaxonLogger()
diff --git a/tests/ignite/contrib/handlers/test_tensorboard_logger.py b/tests/ignite/contrib/handlers/test_tensorboard_logger.py
index 60c8a1f4483c..7effd41f046a 100644
--- a/tests/ignite/contrib/handlers/test_tensorboard_logger.py
+++ b/tests/ignite/contrib/handlers/test_tensorboard_logger.py
@@ -19,7 +19,6 @@
 
 
 def test_optimizer_params_handler_wrong_setup():
-
     with pytest.raises(TypeError):
         OptimizerParamsHandler(optimizer=None)
 
@@ -44,7 +43,6 @@ def test_getattr_method():
 
 
 def test_optimizer_params():
-
     optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01)
     wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr")
     mock_logger = MagicMock(spec=TensorboardLogger)
@@ -65,7 +63,6 @@ def test_optimizer_params():
 
 
 def test_output_handler_with_wrong_logger_type():
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
 
     mock_logger = MagicMock()
@@ -75,7 +72,6 @@ def test_output_handler_with_wrong_logger_type():
 
 
 def test_output_handler_output_transform():
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
     mock_logger = MagicMock(spec=TensorboardLogger)
     mock_logger.writer = MagicMock()
@@ -98,7 +94,6 @@ def test_output_handler_output_transform():
 
 
 def test_output_handler_metric_names():
-
     wrapper = OutputHandler("tag", metric_names=["a", "b"])
     mock_logger = MagicMock(spec=TensorboardLogger)
     mock_logger.writer = MagicMock()
@@ -176,7 +171,6 @@ def test_output_handler_metric_names():
 
 
 def test_output_handler_both():
-
     wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x})
     mock_logger = MagicMock(spec=TensorboardLogger)
     mock_logger.writer = MagicMock()
@@ -212,7 +206,6 @@ def global_step_transform(*args, **kwargs):
 
 
 def test_output_handler_with_global_step_from_engine():
-
     mock_another_engine = MagicMock()
     mock_another_engine.state = State()
     mock_another_engine.state.epoch = 10
@@ -267,7 +260,6 @@ def global_step_transform(*args, **kwargs):
 
 
 def test_weights_scalar_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     wrapper = WeightsScalarHandler(model)
     mock_logger = MagicMock()
@@ -277,7 +269,6 @@ def test_weights_scalar_handler_wrong_setup():
 
 
 def test_weights_scalar_handler(dummy_model_factory):
-
     model = dummy_model_factory(with_grads=True, with_frozen_layer=False)
 
     # define test wrapper to test with and without optional tag
@@ -310,7 +301,6 @@ def _test(tag=None):
 
 
 def test_weights_scalar_handler_whitelist(dummy_model_factory):
-
     model = dummy_model_factory()
 
     wrapper = WeightsScalarHandler(model, whitelist=["fc2.weight"])
@@ -355,7 +345,6 @@ def weight_selector(n, _):
 
 
 def test_weights_hist_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     wrapper = WeightsHistHandler(model)
     mock_logger = MagicMock()
@@ -365,7 +354,6 @@ def test_weights_hist_handler_wrong_setup():
 
 
 def test_weights_hist_handler(dummy_model_factory):
-
     model = dummy_model_factory(with_grads=True, with_frozen_layer=False)
 
     # define test wrapper to test with and without optional tag
@@ -442,7 +430,6 @@ def weight_selector(n, _):
 
 
 def test_grads_scalar_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     wrapper = GradsScalarHandler(model)
     mock_logger = MagicMock()
@@ -530,7 +517,6 @@ def weight_selector(n, _):
 
 
 def test_grads_hist_handler_wrong_setup():
-
     model = MagicMock(spec=torch.nn.Module)
     wrapper = GradsHistHandler(model)
     mock_logger = MagicMock()
@@ -616,7 +602,6 @@ def weight_selector(n, _):
 
 
 def test_integration(dirname):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -646,7 +631,6 @@ def dummy_handler(engine, logger, event_name):
 
 
 def test_integration_as_context_manager(dirname):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -657,7 +641,6 @@ def update_fn(engine, batch):
         return next(losses_iter)
 
     with TensorboardLogger(log_dir=dirname) as tb_logger:
-
         trainer = Engine(update_fn)
 
         def dummy_handler(engine, logger, event_name):
diff --git a/tests/ignite/contrib/handlers/test_tqdm_logger.py b/tests/ignite/contrib/handlers/test_tqdm_logger.py
index 23068e85b3b1..81522b1d0a67 100644
--- a/tests/ignite/contrib/handlers/test_tqdm_logger.py
+++ b/tests/ignite/contrib/handlers/test_tqdm_logger.py
@@ -9,7 +9,7 @@
 import torch
 from packaging.version import Version
 
-from ignite.contrib.handlers import ProgressBar
+from ignite.contrib.handlers import CustomPeriodicEvent, ProgressBar
 from ignite.engine import Engine, Events
 from ignite.handlers import TerminateOnNan
 from ignite.metrics import RunningAverage
@@ -41,7 +41,6 @@ def test_pbar_errors():
 
 
 def test_pbar(capsys):
-
     n_epochs = 2
     loader = [1, 2]
     engine = Engine(update_fn)
@@ -143,7 +142,6 @@ def print_iter(_):
 
 
 def test_pbar_with_metric(capsys):
-
     n_iters = 2
     data = list(range(n_iters))
     loss_values = iter(range(n_iters))
@@ -174,7 +172,6 @@ def step(engine, batch):
 
 
 def test_pbar_with_all_metric(capsys):
-
     n_iters = 2
     data = list(range(n_iters))
     loss_values = iter(range(n_iters))
@@ -208,7 +205,6 @@ def step(engine, batch):
 
 
 def test_pbar_with_state_attrs(capsys):
-
     n_iters = 2
     data = list(range(n_iters))
     loss_values = iter(range(n_iters))
@@ -246,7 +242,6 @@ def step(engine, batch):
 
 
 def test_pbar_no_metric_names(capsys):
-
     n_epochs = 2
     loader = [1, 2]
     engine = Engine(update_fn)
@@ -412,7 +407,6 @@ def update_fn(engine, batch):
 
 
 def test_pbar_on_epochs(capsys):
-
     n_epochs = 10
     loader = [1, 2, 3, 4, 5]
     engine = Engine(update_fn)
@@ -452,7 +446,6 @@ def test_pbar_with_max_epochs_set_to_one(capsys):
 
 
 def test_pbar_wrong_events_order():
-
     engine = Engine(update_fn)
     pbar = ProgressBar()
 
@@ -475,6 +468,16 @@ def test_pbar_wrong_events_order():
         pbar.attach(engine, event_name=Events.ITERATION_STARTED, closing_event_name=Events.EPOCH_COMPLETED(every=10))
 
 
+def test_pbar_on_custom_events(capsys):
+    engine = Engine(update_fn)
+    pbar = ProgressBar()
+    with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"):
+        cpe = CustomPeriodicEvent(n_iterations=15)
+    # cpe is never attached to engine here, so attach() is expected to reject its event with ValueError.
+    with pytest.raises(ValueError, match=r"not in allowed events for this engine"):
+        pbar.attach(engine, event_name=cpe.Events.ITERATIONS_15_COMPLETED, closing_event_name=Events.EPOCH_COMPLETED)
+
+
 def test_pbar_with_nan_input():
     def update(engine, batch):
         x = batch
@@ -504,7 +507,6 @@ def create_engine():
 
 
 def test_pbar_on_callable_events(capsys):
-
     n_epochs = 1
     loader = list(range(100))
     engine = Engine(update_fn)
@@ -539,7 +541,6 @@ def test_tqdm_logger_epoch_length(capsys):
 
 
 def test_tqdm_logger_iter_without_epoch_length(capsys):
-
     size = 11
 
     def finite_size_data_iter(size):
diff --git a/tests/ignite/contrib/handlers/test_visdom_logger.py b/tests/ignite/contrib/handlers/test_visdom_logger.py
index 1b980ffbac40..39db6558bd4f 100644
--- a/tests/ignite/contrib/handlers/test_visdom_logger.py
+++ b/tests/ignite/contrib/handlers/test_visdom_logger.py
@@ -17,7 +17,6 @@
 
 
 def test_optimizer_params_handler_wrong_setup():
-
     with pytest.raises(TypeError):
         OptimizerParamsHandler(optimizer=None)
 
@@ -31,7 +30,6 @@ def test_optimizer_params_handler_wrong_setup():
 
 
 def test_optimizer_params():
-
     optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01)
     wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr")
     mock_logger = MagicMock(spec=VisdomLogger)
@@ -79,7 +77,6 @@ def test_optimizer_params():
 
 
 def test_output_handler_with_wrong_logger_type():
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
 
     mock_logger = MagicMock()
@@ -89,7 +86,6 @@ def test_output_handler_with_wrong_logger_type():
 
 
 def test_output_handler_output_transform(dirname):
-
     wrapper = OutputHandler("tag", output_transform=lambda x: x)
     mock_logger = MagicMock(spec=VisdomLogger)
     mock_logger.vis = MagicMock()
@@ -137,7 +133,6 @@ def test_output_handler_output_transform(dirname):
 
 
 def test_output_handler_metric_names(dirname):
-
     wrapper = OutputHandler("tag", metric_names=["a", "b"])
     mock_logger = MagicMock(spec=VisdomLogger)
     mock_logger.vis = MagicMock()
@@ -314,7 +309,6 @@ def test_output_handler_metric_names(dirname):
 
 
 def test_output_handler_both(dirname):
-
     wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x})
     mock_logger = MagicMock(spec=VisdomLogger)
     mock_logger.vis = MagicMock()
@@ -543,7 +537,6 @@ def global_step_transform(*args, **kwargs):
 
 
 def test_output_handler_with_global_step_from_engine():
-
     mock_another_engine = MagicMock()
     mock_another_engine.state = State()
     mock_another_engine.state.epoch = 10
@@ -605,7 +598,6 @@ def test_output_handler_with_global_step_from_engine():
 
 
 def test_weights_scalar_handler_wrong_setup():
-
     with pytest.raises(TypeError, match="Argument model should be of type torch.nn.Module"):
         WeightsScalarHandler(None)
 
@@ -770,7 +762,6 @@ def norm(x):
 
 
 def test_grads_scalar_handler_wrong_setup():
-
     with pytest.raises(TypeError, match="Argument model should be of type torch.nn.Module"):
         GradsScalarHandler(None)
 
@@ -852,7 +843,6 @@ def _test(tag=None):
 
 @pytest.mark.skipif(sys.platform.startswith("win"), reason="Skip on Windows")
 def test_integration_no_server():
-
     with pytest.raises(ConnectionError, match="Error connecting to Visdom server"):
         VisdomLogger()
 
@@ -958,7 +948,6 @@ def update_fn(engine, batch):
 
 @pytest.mark.skipif(sys.platform.startswith("win"), reason="Skip on Windows")
 def test_integration_with_executor_as_context_manager(visdom_server, visdom_server_stop):
-
     n_epochs = 5
     data = list(range(50))
 
@@ -969,7 +958,6 @@ def update_fn(engine, batch):
         return next(losses_iter)
 
     with VisdomLogger(server=visdom_server[0], port=visdom_server[1], num_workers=1) as vd_logger:
-
         # close all windows in 'main' environment
         vd_logger.vis.close()
 
@@ -994,7 +982,6 @@ def update_fn(engine, batch):
 
 @pytest.mark.parametrize("no_site_packages", ["visdom"], indirect=True)
 def test_no_visdom(no_site_packages):
-
     with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires visdom package"):
         VisdomLogger()
 
diff --git a/tests/ignite/contrib/handlers/test_wandb_logger.py b/tests/ignite/contrib/handlers/test_wandb_logger.py
index 102d057281c3..821035568381 100644
--- a/tests/ignite/contrib/handlers/test_wandb_logger.py
+++ b/tests/ignite/contrib/handlers/test_wandb_logger.py
@@ -208,7 +208,6 @@ def global_step_transform(*args, **kwargs):
 
 
 def test_output_handler_with_global_step_from_engine():
-
     mock_another_engine = MagicMock()
     mock_another_engine.state = State()
     mock_another_engine.state.epoch = 10
@@ -283,7 +282,6 @@ def test_wandb_close():
 
 @pytest.mark.parametrize("no_site_packages", ["wandb"], indirect=True)
 def test_no_wandb_client(no_site_packages):
-
     with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires wandb to be installed."):
         WandBLogger()
 
diff --git a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py
index 912e3d22560a..93e2546aa820 100644
--- a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py
+++ b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py
@@ -132,7 +132,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
     canberra = DistanceMetric.get_metric("canberra")
 
@@ -186,7 +185,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -195,7 +193,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -205,7 +202,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -217,7 +213,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -227,7 +222,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py
index a0bc1b30b05d..ef9784697c51 100644
--- a/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py
@@ -102,7 +102,6 @@ def get_test_cases():
 
 
 def _test_distrib_compute(device):
-
     rank = idist.get_rank()
 
     def _test(metric_device):
@@ -135,7 +134,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -192,7 +190,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -201,7 +198,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -222,7 +218,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -232,7 +227,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
 
diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py
index 252313da4e86..105e7fe4aac1 100644
--- a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py
+++ b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py
@@ -142,7 +142,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device, tol=1e-5):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -199,7 +198,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -208,7 +206,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -218,7 +215,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -230,7 +226,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -240,7 +235,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py
index 841d14584229..e9d6e42ccf7a 100644
--- a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py
@@ -142,7 +142,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -197,7 +196,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -206,7 +204,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -216,7 +213,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -237,7 +233,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_geometric_mean_relative_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_geometric_mean_relative_absolute_error.py
index 5a03a0dfbb87..ccc7c28de2a5 100644
--- a/tests/ignite/contrib/metrics/regression/test_geometric_mean_relative_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_geometric_mean_relative_absolute_error.py
@@ -46,7 +46,6 @@ def test_compute():
 
 
 def test_integration():
-
     y_pred = torch.rand(size=(100,))
     y = torch.rand(size=(100,))
 
@@ -77,7 +76,6 @@ def update_fn(engine, batch):
 
 
 def _test_distrib_compute(device):
-
     rank = idist.get_rank()
 
     def _test(metric_device):
@@ -107,7 +105,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
     torch.manual_seed(12)
 
@@ -161,7 +158,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -170,7 +166,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -191,7 +186,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -201,7 +195,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
 
diff --git a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py
index ae7606f18372..5b5090d90807 100644
--- a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py
+++ b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py
@@ -132,7 +132,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     manhattan = DistanceMetric.get_metric("manhattan")
@@ -187,7 +186,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -196,7 +194,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -206,7 +203,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -218,7 +214,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -228,7 +223,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py
index 87d94cde4c85..fe6ba11bb007 100644
--- a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py
@@ -130,7 +130,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -183,7 +182,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -192,7 +190,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -202,7 +199,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -214,7 +210,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -224,7 +219,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py
index 56bf6f84628d..81b1fbbbe05a 100644
--- a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py
@@ -152,7 +152,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -207,7 +206,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -216,7 +214,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -226,7 +223,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -238,7 +234,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -248,7 +243,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_mean_error.py b/tests/ignite/contrib/metrics/regression/test_mean_error.py
index 0a8894621da3..39f90f011833 100644
--- a/tests/ignite/contrib/metrics/regression/test_mean_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_mean_error.py
@@ -99,7 +99,6 @@ def get_test_cases():
 
 
 def _test_distrib_compute(device):
-
     rank = idist.get_rank()
 
     def _test(metric_device):
@@ -131,7 +130,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device, tol=1e-5):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -186,7 +184,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -195,7 +192,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -216,7 +212,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -226,7 +221,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
 
diff --git a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py
index ccfa4650821a..7177b01e8c17 100644
--- a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py
+++ b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py
@@ -146,7 +146,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -201,7 +200,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -210,7 +208,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -220,7 +217,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -232,7 +228,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -242,7 +237,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py
index e638abbfa8fc..615d90fbeb1d 100644
--- a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py
@@ -35,7 +35,6 @@ def test_wrong_input_shapes():
 
 
 def test_median_absolute_error():
-
     # See https://github.com/torch/torch7/pull/182
     # For even number of elements, PyTorch returns middle element
     # NumPy returns average of middle elements
@@ -57,7 +56,6 @@ def test_median_absolute_error():
 
 
 def test_median_absolute_error_2():
-
     np.random.seed(1)
     size = 105
     np_y_pred = np.random.rand(size, 1)
@@ -80,7 +78,6 @@ def test_median_absolute_error_2():
 
 
 def test_integration_median_absolute_error():
-
     np.random.seed(1)
     size = 105
     np_y_pred = np.random.rand(size, 1)
@@ -192,7 +189,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -201,7 +197,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -211,7 +206,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -223,7 +217,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -233,7 +226,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py
index ea6ec1b67c42..2973a28f193c 100644
--- a/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py
@@ -35,7 +35,6 @@ def test_wrong_input_shapes():
 
 
 def test_median_absolute_percentage_error():
-
     # See https://github.com/torch/torch7/pull/182
     # For even number of elements, PyTorch returns middle element
     # NumPy returns average of middle elements
@@ -57,7 +56,6 @@ def test_median_absolute_percentage_error():
 
 
 def test_median_absolute_percentage_error_2():
-
     np.random.seed(1)
     size = 105
     np_y_pred = np.random.rand(size, 1)
@@ -80,7 +78,6 @@ def test_median_absolute_percentage_error_2():
 
 
 def test_integration_median_absolute_percentage_error():
-
     np.random.seed(1)
     size = 105
     np_y_pred = np.random.rand(size, 1)
@@ -193,7 +190,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -202,7 +198,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -212,7 +207,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -224,7 +218,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -234,7 +227,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py
index 92a6e0591300..a43c46c307e3 100644
--- a/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py
+++ b/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py
@@ -35,7 +35,6 @@ def test_wrong_input_shapes():
 
 
 def test_median_relative_absolute_error():
-
     # See https://github.com/torch/torch7/pull/182
     # For even number of elements, PyTorch returns middle element
     # NumPy returns average of middle elements
@@ -57,7 +56,6 @@ def test_median_relative_absolute_error():
 
 
 def test_median_relative_absolute_error_2():
-
     np.random.seed(1)
     size = 105
     np_y_pred = np.random.rand(size, 1)
@@ -80,7 +78,6 @@ def test_median_relative_absolute_error_2():
 
 
 def test_integration_median_relative_absolute_error_with_output_transform():
-
     np.random.seed(1)
     size = 105
     np_y_pred = np.random.rand(size, 1)
@@ -193,7 +190,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -202,7 +198,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -212,7 +207,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -224,7 +218,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -234,7 +227,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_r2_score.py b/tests/ignite/contrib/metrics/regression/test_r2_score.py
index 95e131d1d33e..86113c4b532d 100644
--- a/tests/ignite/contrib/metrics/regression/test_r2_score.py
+++ b/tests/ignite/contrib/metrics/regression/test_r2_score.py
@@ -28,7 +28,6 @@ def test_wrong_input_shapes():
 
 
 def test_r2_score():
-
     size = 51
     np_y_pred = np.random.rand(size)
     np_y = np.random.rand(size)
@@ -44,7 +43,6 @@ def test_r2_score():
 
 
 def test_r2_score_2():
-
     np.random.seed(1)
     size = 105
     np_y_pred = np.random.rand(size, 1)
@@ -66,7 +64,6 @@ def test_r2_score_2():
 
 
 def test_integration_r2_score():
-
     np.random.seed(1)
     size = 105
     np_y_pred = np.random.rand(size, 1)
@@ -121,7 +118,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -173,7 +169,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -182,7 +177,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -192,7 +186,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -204,7 +197,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -214,7 +206,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py b/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py
index 4d92a611bd2c..bb615adb086c 100644
--- a/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py
+++ b/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py
@@ -115,7 +115,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -168,7 +167,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -177,7 +175,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -187,7 +184,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -199,7 +195,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -209,7 +204,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/test_average_precision.py b/tests/ignite/contrib/metrics/test_average_precision.py
index 22ff66533dc7..7a943ae855e4 100644
--- a/tests/ignite/contrib/metrics/test_average_precision.py
+++ b/tests/ignite/contrib/metrics/test_average_precision.py
@@ -63,102 +63,89 @@ def test_check_shape():
         ap._check_shape((torch.rand(4, 3), torch.rand(4, 3, 1)))
 
 
-def test_binary_and_multilabel_inputs():
+@pytest.fixture(params=[item for item in range(8)])
+def test_data_binary_and_multilabel(request):
+    return [
+        # Binary input data of shape (N,) or (N, 1)
+        (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 1),
+        (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 1),
+        # updated batches
+        (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16),
+        (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16),
+        # Binary input data of shape (N, L)
+        (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 1),
+        (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 1),
+        # updated batches
+        (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 16),
+        (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 16),
+    ][request.param]
+
+
+@pytest.mark.parametrize("n_times", range(5))
+def test_binary_and_multilabel_inputs(n_times, test_data_binary_and_multilabel):
+    y_pred, y, batch_size = test_data_binary_and_multilabel
     ap = AveragePrecision()
+    ap.reset()
+    if batch_size > 1:
+        n_iters = y.shape[0] // batch_size + 1
+        for i in range(n_iters):
+            idx = i * batch_size
+            ap.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))
+    else:
+        ap.update((y_pred, y))
 
-    def _test(y_pred, y, batch_size):
-        ap.reset()
-        if batch_size > 1:
-            n_iters = y.shape[0] // batch_size + 1
-            for i in range(n_iters):
-                idx = i * batch_size
-                ap.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))
-        else:
-            ap.update((y_pred, y))
-
-        np_y = y.numpy()
-        np_y_pred = y_pred.numpy()
-
-        res = ap.compute()
-        assert isinstance(res, float)
-        assert average_precision_score(np_y, np_y_pred) == pytest.approx(res)
-
-    def get_test_cases():
-
-        test_cases = [
-            # Binary input data of shape (N,) or (N, 1)
-            (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 1),
-            (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 1),
-            # updated batches
-            (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16),
-            (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16),
-            # Binary input data of shape (N, L)
-            (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 1),
-            (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 1),
-            # updated batches
-            (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 16),
-            (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 16),
-        ]
-
-        return test_cases
+    np_y = y.numpy()
+    np_y_pred = y_pred.numpy()
 
-    for _ in range(5):
-        # check multiple random inputs as random exact occurencies are rare
-        test_cases = get_test_cases()
-        for y_pred, y, batch_size in test_cases:
-            _test(y_pred, y, batch_size)
+    res = ap.compute()
+    assert isinstance(res, float)
+    assert average_precision_score(np_y, np_y_pred) == pytest.approx(res)
 
 
-def test_integration_binary_and_mulitlabel_inputs():
-    def _test(y_pred, y, batch_size):
-        def update_fn(engine, batch):
-            idx = (engine.state.iteration - 1) * batch_size
-            y_true_batch = np_y[idx : idx + batch_size]
-            y_pred_batch = np_y_pred[idx : idx + batch_size]
-            return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
+@pytest.fixture(params=[item for item in range(4)])
+def test_data_integration_binary_and_multilabel(request):
+    return [
+        # Binary input data of shape (N,) or (N, 1)
+        (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 10),
+        (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 10),
+        # Binary input data of shape (N, L)
+        (torch.randint(0, 2, size=(100, 3)).long(), torch.randint(0, 2, size=(100, 3)).long(), 10),
+        (torch.randint(0, 2, size=(100, 4)).long(), torch.randint(0, 2, size=(100, 4)).long(), 10),
+    ][request.param]
 
-        engine = Engine(update_fn)
 
-        ap_metric = AveragePrecision()
-        ap_metric.attach(engine, "ap")
+@pytest.mark.parametrize("n_times", range(5))
+def test_integration_binary_and_mulitlabel_inputs(n_times, test_data_integration_binary_and_multilabel):
+    y_pred, y, batch_size = test_data_integration_binary_and_multilabel
 
-        np_y = y.numpy()
-        np_y_pred = y_pred.numpy()
+    def update_fn(engine, batch):
+        idx = (engine.state.iteration - 1) * batch_size
+        y_true_batch = np_y[idx : idx + batch_size]
+        y_pred_batch = np_y_pred[idx : idx + batch_size]
+        return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
 
-        np_ap = average_precision_score(np_y, np_y_pred)
+    engine = Engine(update_fn)
 
-        data = list(range(y_pred.shape[0] // batch_size))
-        ap = engine.run(data, max_epochs=1).metrics["ap"]
+    ap_metric = AveragePrecision()
+    ap_metric.attach(engine, "ap")
 
-        assert isinstance(ap, float)
-        assert np_ap == pytest.approx(ap)
+    np_y = y.numpy()
+    np_y_pred = y_pred.numpy()
 
-    def get_test_cases():
+    np_ap = average_precision_score(np_y, np_y_pred)
 
-        test_cases = [
-            # Binary input data of shape (N,) or (N, 1)
-            (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 10),
-            (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 10),
-            # Binary input data of shape (N, L)
-            (torch.randint(0, 2, size=(100, 3)).long(), torch.randint(0, 2, size=(100, 3)).long(), 10),
-            (torch.randint(0, 2, size=(100, 4)).long(), torch.randint(0, 2, size=(100, 4)).long(), 10),
-        ]
-        return test_cases
+    data = list(range(y_pred.shape[0] // batch_size))
+    ap = engine.run(data, max_epochs=1).metrics["ap"]
 
-    for _ in range(5):
-        # check multiple random inputs as random exact occurencies are rare
-        test_cases = get_test_cases()
-        for y_pred, y, batch_size in test_cases:
-            _test(y_pred, y, batch_size)
+    assert isinstance(ap, float)
+    assert np_ap == pytest.approx(ap)
 
 
 def _test_distrib_binary_and_multilabel_inputs(device):
-
     rank = idist.get_rank()
     torch.manual_seed(12)
 
     def _test(y_pred, y, batch_size, metric_device):
-
         metric_device = torch.device(metric_device)
         ap = AveragePrecision(device=metric_device)
         torch.manual_seed(10 + rank)
@@ -185,7 +172,6 @@ def _test(y_pred, y, batch_size, metric_device):
         assert average_precision_score(np_y, np_y_pred) == pytest.approx(res)
 
     def get_test_cases():
-
         test_cases = [
             # Binary input data of shape (N,) or (N, 1)
             (torch.randint(0, 2, size=(10,)).long(), torch.randint(0, 2, size=(10,)).long(), 1),
@@ -213,7 +199,6 @@ def get_test_cases():
 
 
 def _test_distrib_integration_binary_input(device):
-
     rank = idist.get_rank()
     n_iters = 80
     batch_size = 16
@@ -283,7 +268,6 @@ def update_fn(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
@@ -292,7 +276,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
@@ -302,7 +285,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -314,7 +296,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
@@ -324,7 +305,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
@@ -334,7 +314,6 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
diff --git a/tests/ignite/contrib/metrics/test_cohen_kappa.py b/tests/ignite/contrib/metrics/test_cohen_kappa.py
index ea199cd624d1..fa73a84cdfae 100644
--- a/tests/ignite/contrib/metrics/test_cohen_kappa.py
+++ b/tests/ignite/contrib/metrics/test_cohen_kappa.py
@@ -71,44 +71,38 @@ def test_cohen_kappa_wrong_weights_type():
         ck = CohenKappa(weights="dd")
 
 
-@pytest.mark.parametrize("weights", [None, "linear", "quadratic"])
-def test_binary_input(weights):
+@pytest.fixture(params=range(4))
+def test_data_binary(request):
+    return [
+        # Binary input data of shape (N,) or (N, 1)
+        (torch.randint(0, 2, size=(10,)).long(), torch.randint(0, 2, size=(10,)).long(), 1),
+        (torch.randint(0, 2, size=(10, 1)).long(), torch.randint(0, 2, size=(10, 1)).long(), 1),
+        # updated batches
+        (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16),
+        (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16),
+    ][request.param]
 
-    ck = CohenKappa(weights)
 
-    def _test(y_pred, y, batch_size):
-        ck.reset()
-        if batch_size > 1:
-            n_iters = y.shape[0] // batch_size + 1
-            for i in range(n_iters):
-                idx = i * batch_size
-                ck.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))
-        else:
-            ck.update((y_pred, y))
+@pytest.mark.parametrize("n_times", range(5))
+@pytest.mark.parametrize("weights", [None, "linear", "quadratic"])
+def test_binary_input(n_times, weights, test_data_binary):
+    y_pred, y, batch_size = test_data_binary
+    ck = CohenKappa(weights)
+    ck.reset()
+    if batch_size > 1:
+        n_iters = y.shape[0] // batch_size + 1
+        for i in range(n_iters):
+            idx = i * batch_size
+            ck.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))
+    else:
+        ck.update((y_pred, y))
 
-        np_y = y.numpy()
-        np_y_pred = y_pred.numpy()
+    np_y = y.numpy()
+    np_y_pred = y_pred.numpy()
 
-        res = ck.compute()
-        assert isinstance(res, float)
-        assert cohen_kappa_score(np_y, np_y_pred, weights=weights) == pytest.approx(res)
-
-    def get_test_cases():
-        test_cases = [
-            # Binary input data of shape (N,) or (N, 1)
-            (torch.randint(0, 2, size=(10,)).long(), torch.randint(0, 2, size=(10,)).long(), 1),
-            (torch.randint(0, 2, size=(10, 1)).long(), torch.randint(0, 2, size=(10, 1)).long(), 1),
-            # updated batches
-            (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16),
-            (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16),
-        ]
-        return test_cases
-
-    for _ in range(5):
-        # check multiple random inputs as random exact occurencies are rare
-        test_cases = get_test_cases()
-        for y_pred, y, batch_size in test_cases:
-            _test(y_pred, y, batch_size)
+    res = ck.compute()
+    assert isinstance(res, float)
+    assert cohen_kappa_score(np_y, np_y_pred, weights=weights) == pytest.approx(res)
 
 
 def test_multilabel_inputs():
@@ -130,52 +124,47 @@ def test_multilabel_inputs():
         ck.compute()
 
 
-@pytest.mark.parametrize("weights", [None, "linear", "quadratic"])
-def test_integration_binary_input(weights):
-    def _test(y_pred, y, batch_size):
-        def update_fn(engine, batch):
-            idx = (engine.state.iteration - 1) * batch_size
-            y_true_batch = np_y[idx : idx + batch_size]
-            y_pred_batch = np_y_pred[idx : idx + batch_size]
-            return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
+@pytest.fixture(params=range(2))
+def test_data_integration_binary(request):
+    return [
+        # Binary input data of shape (N,) or (N, 1)
+        (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 10),
+        (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 10),
+    ][request.param]
 
-        engine = Engine(update_fn)
 
-        ck_metric = CohenKappa(weights=weights)
-        ck_metric.attach(engine, "ck")
+@pytest.mark.parametrize("n_times", range(5))
+@pytest.mark.parametrize("weights", [None, "linear", "quadratic"])
+def test_integration_binary_input(n_times, weights, test_data_integration_binary):
+    y_pred, y, batch_size = test_data_integration_binary
 
-        np_y = y.numpy()
-        np_y_pred = y_pred.numpy()
+    def update_fn(engine, batch):
+        idx = (engine.state.iteration - 1) * batch_size
+        y_true_batch = np_y[idx : idx + batch_size]
+        y_pred_batch = np_y_pred[idx : idx + batch_size]
+        return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
 
-        np_ck = cohen_kappa_score(np_y, np_y_pred, weights=weights)
+    engine = Engine(update_fn)
 
-        data = list(range(y_pred.shape[0] // batch_size))
-        ck = engine.run(data, max_epochs=1).metrics["ck"]
+    ck_metric = CohenKappa(weights=weights)
+    ck_metric.attach(engine, "ck")
 
-        assert isinstance(ck, float)
-        assert np_ck == pytest.approx(ck)
+    np_y = y.numpy()
+    np_y_pred = y_pred.numpy()
 
-    def get_test_cases():
-        test_cases = [
-            # Binary input data of shape (N,) or (N, 1)
-            (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 10),
-            (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 10),
-        ]
-        return test_cases
+    np_ck = cohen_kappa_score(np_y, np_y_pred, weights=weights)
 
-    for _ in range(5):
-        # check multiple random inputs as random exact occurencies are rare
-        test_cases = get_test_cases()
-        for y_pred, y, batch_size in test_cases:
-            _test(y_pred, y, batch_size)
+    data = list(range(y_pred.shape[0] // batch_size))
+    ck = engine.run(data, max_epochs=1).metrics["ck"]
 
+    assert isinstance(ck, float)
+    assert np_ck == pytest.approx(ck)
 
-def _test_distrib_binary_input(device):
 
+def _test_distrib_binary_input(device):
     rank = idist.get_rank()
 
     def _test(y_pred, y, batch_size, metric_device):
-
         metric_device = torch.device(metric_device)
         ck = CohenKappa(device=metric_device)
 
@@ -220,7 +209,6 @@ def get_test_cases():
 
 
 def _test_distrib_integration_binary_input(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -274,7 +262,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
@@ -283,7 +270,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
@@ -293,7 +279,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -305,7 +290,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
@@ -315,7 +299,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
@@ -325,14 +308,12 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
-
     device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
 
 
 def _test_distrib_xla_nprocs(index):
-
     device = idist.device()
     _test_distrib_binary_input(device)
     _test_distrib_integration_binary_input(device)
diff --git a/tests/ignite/contrib/metrics/test_precision_recall_curve.py b/tests/ignite/contrib/metrics/test_precision_recall_curve.py
index 6ad007747ed4..1eaf8ddc8b3c 100644
--- a/tests/ignite/contrib/metrics/test_precision_recall_curve.py
+++ b/tests/ignite/contrib/metrics/test_precision_recall_curve.py
@@ -141,7 +141,6 @@ def _test_distrib_compute(device):
     rank = idist.get_rank()
 
     def _test(y_pred, y, batch_size, metric_device):
-
         metric_device = torch.device(metric_device)
         prc = PrecisionRecallCurve(device=metric_device)
 
@@ -191,7 +190,6 @@ def get_test_cases():
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -249,7 +247,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -258,7 +255,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -268,7 +264,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -280,7 +275,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
@@ -290,7 +284,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute(device)
     _test_distrib_integration(device)
diff --git a/tests/ignite/contrib/metrics/test_roc_auc.py b/tests/ignite/contrib/metrics/test_roc_auc.py
index a29fc73dd668..dcc14aaba301 100644
--- a/tests/ignite/contrib/metrics/test_roc_auc.py
+++ b/tests/ignite/contrib/metrics/test_roc_auc.py
@@ -64,49 +64,43 @@ def test_check_shape():
         roc_auc._check_shape((torch.rand(4, 3), torch.rand(4, 3, 1)))
 
 
-def test_binary_and_multilabel_inputs():
-
+@pytest.fixture(params=range(8))
+def test_data_binary_and_multilabel(request):
+    return [
+        # Binary input data of shape (N,) or (N, 1)
+        (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 1),
+        (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 1),
+        # updated batches
+        (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16),
+        (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16),
+        # Binary input data of shape (N, L)
+        (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 1),
+        (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 1),
+        # updated batches
+        (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 16),
+        (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 16),
+    ][request.param]
+
+
+@pytest.mark.parametrize("n_times", range(5))
+def test_binary_and_multilabel_inputs(n_times, test_data_binary_and_multilabel):
+    y_pred, y, batch_size = test_data_binary_and_multilabel
     roc_auc = ROC_AUC()
+    roc_auc.reset()
+    if batch_size > 1:
+        n_iters = y.shape[0] // batch_size + 1
+        for i in range(n_iters):
+            idx = i * batch_size
+            roc_auc.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))
+    else:
+        roc_auc.update((y_pred, y))
 
-    def _test(y_pred, y, batch_size):
-        roc_auc.reset()
-        if batch_size > 1:
-            n_iters = y.shape[0] // batch_size + 1
-            for i in range(n_iters):
-                idx = i * batch_size
-                roc_auc.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size]))
-        else:
-            roc_auc.update((y_pred, y))
+    np_y = y.numpy()
+    np_y_pred = y_pred.numpy()
 
-        np_y = y.numpy()
-        np_y_pred = y_pred.numpy()
-
-        res = roc_auc.compute()
-        assert isinstance(res, float)
-        assert roc_auc_score(np_y, np_y_pred) == pytest.approx(res)
-
-    def get_test_cases():
-        test_cases = [
-            # Binary input data of shape (N,) or (N, 1)
-            (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 1),
-            (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 1),
-            # updated batches
-            (torch.randint(0, 2, size=(50,)).long(), torch.randint(0, 2, size=(50,)).long(), 16),
-            (torch.randint(0, 2, size=(50, 1)).long(), torch.randint(0, 2, size=(50, 1)).long(), 16),
-            # Binary input data of shape (N, L)
-            (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 1),
-            (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 1),
-            # updated batches
-            (torch.randint(0, 2, size=(50, 4)).long(), torch.randint(0, 2, size=(50, 4)).long(), 16),
-            (torch.randint(0, 2, size=(50, 7)).long(), torch.randint(0, 2, size=(50, 7)).long(), 16),
-        ]
-        return test_cases
-
-    for _ in range(5):
-        test_cases = get_test_cases()
-        # check multiple random inputs as random exact occurencies are rare
-        for y_pred, y, batch_size in test_cases:
-            _test(y_pred, y, batch_size)
+    res = roc_auc.compute()
+    assert isinstance(res, float)
+    assert roc_auc_score(np_y, np_y_pred) == pytest.approx(res)
 
 
 def test_check_compute_fn():
@@ -125,50 +119,46 @@ def test_check_compute_fn():
     em.update(output)
 
 
-def test_integration_binary_and_multilabel_inputs():
-    def _test(y_pred, y, batch_size):
-        def update_fn(engine, batch):
-            idx = (engine.state.iteration - 1) * batch_size
-            y_true_batch = np_y[idx : idx + batch_size]
-            y_pred_batch = np_y_pred[idx : idx + batch_size]
-            return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
+@pytest.fixture(params=range(4))
+def test_data_integration_binary_and_multilabel(request):
+    return [
+        # Binary input data of shape (N,) or (N, 1)
+        (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 10),
+        (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 10),
+        # Binary input data of shape (N, L)
+        (torch.randint(0, 2, size=(100, 3)).long(), torch.randint(0, 2, size=(100, 3)).long(), 10),
+        (torch.randint(0, 2, size=(100, 4)).long(), torch.randint(0, 2, size=(100, 4)).long(), 10),
+    ][request.param]
 
-        engine = Engine(update_fn)
 
-        roc_auc_metric = ROC_AUC()
-        roc_auc_metric.attach(engine, "roc_auc")
+@pytest.mark.parametrize("n_times", range(5))
+def test_integration_binary_and_multilabel_inputs(n_times, test_data_integration_binary_and_multilabel):
+    y_pred, y, batch_size = test_data_integration_binary_and_multilabel
 
-        np_y = y.numpy()
-        np_y_pred = y_pred.numpy()
+    def update_fn(engine, batch):
+        idx = (engine.state.iteration - 1) * batch_size
+        y_true_batch = np_y[idx : idx + batch_size]
+        y_pred_batch = np_y_pred[idx : idx + batch_size]
+        return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
 
-        np_roc_auc = roc_auc_score(np_y, np_y_pred)
+    engine = Engine(update_fn)
 
-        data = list(range(y_pred.shape[0] // batch_size))
-        roc_auc = engine.run(data, max_epochs=1).metrics["roc_auc"]
+    roc_auc_metric = ROC_AUC()
+    roc_auc_metric.attach(engine, "roc_auc")
 
-        assert isinstance(roc_auc, float)
-        assert np_roc_auc == pytest.approx(roc_auc)
+    np_y = y.numpy()
+    np_y_pred = y_pred.numpy()
 
-    def get_test_cases():
-        test_cases = [
-            # Binary input data of shape (N,) or (N, 1)
-            (torch.randint(0, 2, size=(100,)).long(), torch.randint(0, 2, size=(100,)).long(), 10),
-            (torch.randint(0, 2, size=(100, 1)).long(), torch.randint(0, 2, size=(100, 1)).long(), 10),
-            # Binary input data of shape (N, L)
-            (torch.randint(0, 2, size=(100, 3)).long(), torch.randint(0, 2, size=(100, 3)).long(), 10),
-            (torch.randint(0, 2, size=(100, 4)).long(), torch.randint(0, 2, size=(100, 4)).long(), 10),
-        ]
-        return test_cases
+    np_roc_auc = roc_auc_score(np_y, np_y_pred)
 
-    for _ in range(5):
-        # check multiple random inputs as random exact occurencies are rare
-        test_cases = get_test_cases()
-        for y_pred, y, batch_size in test_cases:
-            _test(y_pred, y, batch_size)
+    data = list(range(y_pred.shape[0] // batch_size))
+    roc_auc = engine.run(data, max_epochs=1).metrics["roc_auc"]
 
+    assert isinstance(roc_auc, float)
+    assert np_roc_auc == pytest.approx(roc_auc)
 
-def _test_distrib_binary_and_multilabel_inputs(device):
 
+def _test_distrib_binary_and_multilabel_inputs(device):
     rank = idist.get_rank()
 
     def _test(y_pred, y, batch_size, metric_device):
@@ -222,7 +212,6 @@ def get_test_cases():
 
 
 def _test_distrib_integration_binary_input(device):
-
     rank = idist.get_rank()
     n_iters = 80
     batch_size = 16
@@ -292,7 +281,6 @@ def update_fn(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
@@ -301,7 +289,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
@@ -311,7 +298,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -323,7 +309,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
@@ -333,7 +318,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
@@ -343,14 +327,12 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
 
 
 def _test_distrib_xla_nprocs(index):
-
     device = idist.device()
     _test_distrib_binary_and_multilabel_inputs(device)
     _test_distrib_integration_binary_input(device)
diff --git a/tests/ignite/distributed/check_idist_parallel.py b/tests/ignite/distributed/check_idist_parallel.py
index def9e798a24b..04e294cbfbeb 100644
--- a/tests/ignite/distributed/check_idist_parallel.py
+++ b/tests/ignite/distributed/check_idist_parallel.py
@@ -6,7 +6,6 @@
 
 
 def training(local_rank, config, **kwargs):
-
     import time
 
     time.sleep(idist.get_rank() * 0.1)
diff --git a/tests/ignite/distributed/comp_models/test_horovod.py b/tests/ignite/distributed/comp_models/test_horovod.py
index e795f887ed62..264813cd584b 100644
--- a/tests/ignite/distributed/comp_models/test_horovod.py
+++ b/tests/ignite/distributed/comp_models/test_horovod.py
@@ -18,7 +18,6 @@ def test__hvd_dist_model():
 
 
 def _assert_model(model, true_conf):
-
     if "cuda" in true_conf["device"]:
         assert model.device() == torch.device(f"{true_conf['device']}:{true_conf['local_rank']}")
     else:
@@ -33,7 +32,6 @@ def _assert_model(model, true_conf):
 
 
 def _test__hvd_dist_model_create_from_backend_no_dist(backend, true_device):
-
     model = _HorovodDistModel.create_from_backend(backend=backend)
 
     assert hvd.rank() > -1
@@ -54,7 +52,6 @@ def _test__hvd_dist_model_create_from_backend_no_dist(backend, true_device):
 
 
 def _test__hvd_dist_model_create_from_backend_dist(backend, true_device):
-
     model = _HorovodDistModel.create_from_backend(backend=backend)
 
     assert hvd.rank() > -1
@@ -79,7 +76,6 @@ def _test__hvd_dist_model_create_from_backend_dist(backend, true_device):
 
 
 def _test__hvd_dist_model_create_from_context_no_dist(true_backend, true_device):
-
     with pytest.raises(ValueError, match=r"Horovod has not been initialized"):
         hvd.rank()
 
@@ -105,7 +101,6 @@ def _test__hvd_dist_model_create_from_context_no_dist(true_backend, true_device)
 
 
 def _test__hvd_dist_model_create_from_context_dist(true_backend, true_device):
-
     assert _HorovodDistModel.create_from_context() is None
 
     hvd.init()
@@ -169,7 +164,6 @@ def test__hvd_dist_model_create_dist_cuda_2(gloo_hvd_executor):
 
 
 def _test__hvd_dist_model_warning_index_less_localrank():
-
     assert torch.cuda.is_available()
     assert _HorovodDistModel.create_from_context() is None
 
@@ -190,7 +184,7 @@ def _test__hvd_dist_model_warning_index_less_localrank():
 @pytest.mark.distributed
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs")
 def test__hvd_dist_model_warning_index_less_localrank(gloo_hvd_executor):
-    gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), num_proc=torch.cuda.device_count())
+    gloo_hvd_executor(_test__hvd_dist_model_warning_index_less_localrank, (), np=torch.cuda.device_count())
 
 
 def _test_dist_spawn_fn(local_rank, backend, world_size, device):
diff --git a/tests/ignite/distributed/comp_models/test_native.py b/tests/ignite/distributed/comp_models/test_native.py
index 875fd78431e2..09e4d3054601 100644
--- a/tests/ignite/distributed/comp_models/test_native.py
+++ b/tests/ignite/distributed/comp_models/test_native.py
@@ -95,7 +95,6 @@ def test__native_dist_model():
 @pytest.mark.skipif(not dist.is_nccl_available(), reason="Skip if nccl not available")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test__native_nccl_but_no_gpu(mock_gpu_is_not_available):
-
     with pytest.raises(RuntimeError, match=r"Nccl backend is required but no cuda capable devices"):
         _NativeDistModel(backend="nccl")
 
@@ -152,7 +151,6 @@ def test__native_dist_model_create_from_backend_bad_slurm_config():
 
 
 def _assert_model(model, true_conf):
-
     assert model.device() == torch.device(true_conf["device"])
     assert model.get_local_rank() == true_conf["local_rank"]
     assert model.get_rank() == true_conf["rank"]
@@ -188,7 +186,6 @@ def _test__native_dist_model_create_from_backend_no_dist(backend, true_device):
 
 
 def _test__native_dist_model_create_from_backend_dist(init_method, local_rank, rank, world_size, backend, true_device):
-
     import os
     from datetime import timedelta
 
@@ -234,7 +231,6 @@ def _test__native_dist_model_create_from_backend_dist(init_method, local_rank, r
 
 
 def _test__native_dist_model_create_from_backend_slurm(local_rank, rank, world_size, backend, true_device):
-
     import os
     from datetime import timedelta
 
@@ -292,7 +288,6 @@ def _test__native_dist_model_create_from_backend_slurm(local_rank, rank, world_s
 
 
 def _test__native_dist_model_create_from_context_no_local_rank():
-
     if "LOCAL_RANK" in os.environ:
         del os.environ["LOCAL_RANK"]
 
@@ -321,7 +316,6 @@ def _test__native_dist_model_create_from_context_env_local_rank(true_conf):
 
 
 def _test__native_dist_model_create_from_context_set_local_rank(true_conf):
-
     from ignite.distributed.comp_models.base import ComputationModel
 
     lrank = None
@@ -341,7 +335,6 @@ def _test__native_dist_model_create_from_context_set_local_rank(true_conf):
 
 
 def _test__native_dist_model_create_from_context_no_dist(true_backend, true_device):
-
     assert _NativeDistModel.create_from_context() is None
 
     dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=1, rank=0)
@@ -366,7 +359,6 @@ def _test__native_dist_model_create_from_context_no_dist(true_backend, true_devi
 
 
 def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_size, true_backend, true_device):
-
     assert _NativeDistModel.create_from_context() is None
 
     dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=world_size, rank=rank)
@@ -422,7 +414,6 @@ def test__native_dist_model_create_dist_gloo_1(init_method, get_fixed_dirname, l
 
 @pytest.mark.distributed
 def test__native_dist_model_create_dist_gloo_2(local_rank, world_size):
-
     device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu")
     _test__native_dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", device)
     _test__native_dist_model_create_from_backend_slurm(local_rank, local_rank, world_size, "gloo", device)
@@ -454,7 +445,6 @@ def test__native_dist_model_create_dist_nccl_2(local_rank, world_size):
 @pytest.mark.distributed
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Skip if less than 2 GPUs")
 def test__native_dist_model_warning_index_less_localrank(local_rank, world_size):
-
     assert _NativeDistModel.create_from_context() is None
 
     dist.init_process_group("nccl", "tcp://0.0.0.0:2222", world_size=world_size, rank=local_rank)
diff --git a/tests/ignite/distributed/comp_models/test_xla.py b/tests/ignite/distributed/comp_models/test_xla.py
index 001a7741b596..6352895833d0 100644
--- a/tests/ignite/distributed/comp_models/test_xla.py
+++ b/tests/ignite/distributed/comp_models/test_xla.py
@@ -59,7 +59,6 @@ def test__xla_dist_model_spawn_n_procs():
 
 
 def _assert_model(model, true_conf):
-
     assert model.device() == true_conf["device"]
     assert model.get_local_rank() == true_conf["local_rank"]
     assert model.get_rank() == true_conf["rank"]
diff --git a/tests/ignite/distributed/test_auto.py b/tests/ignite/distributed/test_auto.py
index 60c1c12d5788..cde9892b8dec 100644
--- a/tests/ignite/distributed/test_auto.py
+++ b/tests/ignite/distributed/test_auto.py
@@ -180,7 +180,6 @@ def _test_auto_model_optimizer(ws, device):
 
 
 def test_auto_methods_no_dist():
-
     _test_auto_dataloader(1, 1, batch_size=1)
     _test_auto_dataloader(1, 1, batch_size=10, num_workers=2)
     _test_auto_dataloader(1, 1, batch_size=10, sampler_name="WeightedRandomSampler")
@@ -192,7 +191,6 @@ def test_auto_methods_no_dist():
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_auto_methods_gloo(distributed_context_single_node_gloo):
-
     ws = distributed_context_single_node_gloo["world_size"]
     _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
     _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2)
@@ -217,7 +215,6 @@ def test_auto_methods_gloo(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_auto_methods_nccl(distributed_context_single_node_nccl):
-
     ws = distributed_context_single_node_nccl["world_size"]
     _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1)
     _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=10)
@@ -236,7 +233,6 @@ def test_auto_methods_nccl(distributed_context_single_node_nccl):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_auto_methods_hvd(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -249,10 +245,8 @@ def test_auto_methods_hvd(gloo_hvd_executor):
 
 
 def _test_auto_methods_xla(index, ws):
-
     dl_type = DataLoader
     if ws > 1:
-
         from ignite.distributed.auto import _MpDeviceLoader
 
         dl_type = _MpDeviceLoader
@@ -288,7 +282,6 @@ def test_auto_methods_xla():
 
 
 def test_dist_proxy_sampler():
-
     weights = torch.ones(100)
     weights[:50] += 1
     num_samples = 200
diff --git a/tests/ignite/distributed/test_launcher.py b/tests/ignite/distributed/test_launcher.py
index e058988658a8..04e1e20b7c07 100644
--- a/tests/ignite/distributed/test_launcher.py
+++ b/tests/ignite/distributed/test_launcher.py
@@ -42,7 +42,6 @@ def exec_filepath():
 
 
 def execute(cmd, env=None):
-
     import ignite
 
     env = dict(os.environ) if env is None else env
@@ -268,7 +267,6 @@ def test_idist_parallel_no_dist():
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars")
 @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_idist_parallel_spawn_params_xla():
-
     res = idist.Parallel._setup_spawn_params(
         nproc_per_node=8, nnodes=None, node_rank=None, master_addr=None, master_port=None, start_method="fork"
     )
diff --git a/tests/ignite/distributed/utils/__init__.py b/tests/ignite/distributed/utils/__init__.py
index 65498f0afe59..7845f0cd1ce0 100644
--- a/tests/ignite/distributed/utils/__init__.py
+++ b/tests/ignite/distributed/utils/__init__.py
@@ -72,7 +72,6 @@ def _test_distrib__get_max_length(device):
 
 
 def _test_distrib_all_reduce(device):
-
     res = idist.all_reduce(10)
     assert res == 10 * idist.get_world_size()
 
@@ -120,7 +119,6 @@ def _test_distrib_all_reduce(device):
 
 
 def _test_distrib_all_reduce_group(device):
-
     if idist.get_world_size() > 1 and idist.backend() is not None:
         ranks = [0, 1]
         rank = idist.get_rank()
@@ -157,51 +155,71 @@ def _test_distrib_all_reduce_group(device):
 
 
 def _test_distrib_all_gather(device):
+    """Check ``idist.all_gather`` on an int, tensors, strings and, if ws > 1 and not xla-tpu, a nested dict."""
+    rank, ws = idist.get_rank(), idist.get_world_size()
 
     res = torch.tensor(idist.all_gather(10), device=device)
-    true_res = torch.tensor([10] * idist.get_world_size(), device=device)
+    true_res = torch.tensor([10] * ws, device=device)
     assert (res == true_res).all()
 
-    t = torch.tensor(idist.get_rank(), device=device)
+    t = torch.tensor(rank, device=device)
     res = idist.all_gather(t)
-    true_res = torch.tensor([i for i in range(idist.get_world_size())], device=device)
+    true_res = torch.tensor(list(range(ws)), device=device)
     assert (res == true_res).all()
 
     x = "test-test"
-    if idist.get_rank() == 0:
+    if rank == 0:
         x = "abc"
     res = idist.all_gather(x)
-    true_res = ["abc"] + ["test-test"] * (idist.get_world_size() - 1)
+    true_res = ["abc"] + ["test-test"] * (ws - 1)
     assert res == true_res
 
     base_x = "tests/ignite/distributed/utils/test_native.py" * 2000
     x = base_x
-    if idist.get_rank() == 0:
+    if rank == 0:
         x = "abc"
 
     res = idist.all_gather(x)
-    true_res = ["abc"] + [base_x] * (idist.get_world_size() - 1)
+    true_res = ["abc"] + [base_x] * (ws - 1)
     assert res == true_res
 
-    t = torch.arange(100, device=device).reshape(4, 25) * (idist.get_rank() + 1)
+    t = torch.arange(100, device=device).reshape(4, 25) * (rank + 1)
     in_dtype = t.dtype
     res = idist.all_gather(t)
-    assert res.shape == (idist.get_world_size() * 4, 25)
+    assert res.shape == (ws * 4, 25)
     assert res.dtype == in_dtype
-    true_res = torch.zeros(idist.get_world_size() * 4, 25, device=device)
-    for i in range(idist.get_world_size()):
+    true_res = torch.zeros(ws * 4, 25, device=device)
+    for i in range(ws):
         true_res[i * 4 : (i + 1) * 4, ...] = torch.arange(100, device=device).reshape(4, 25) * (i + 1)
     assert (res == true_res).all()
 
-    if idist.get_world_size() > 1:
-        with pytest.raises(TypeError, match=r"Unhandled input type"):
-            idist.all_reduce([0, 1, 2])
+    if ws > 1 and idist.backend() != "xla-tpu":  # xla-tpu is excluded from the nested-object check
+        t = {
+            "a": [rank + 1, rank + 2, torch.tensor(rank + 3, device=device)],
+            "b": torch.tensor([[rank + 1, rank + 2, rank + 3]], device=device),
+            "c": {"abcd": rank, "cdfg": torch.tensor(rank, dtype=torch.uint8, device=device)},
+        }
+        res = idist.all_gather(t)
+        assert isinstance(res, list) and len(res) == ws
+        for i, obj in enumerate(res):
+            assert isinstance(obj, dict)
+            assert list(obj.keys()) == ["a", "b", "c"], obj
+            expected_device = (  # non-CPU: device index equals the position i in the gathered list
+                device if torch.device(device).type == "cpu" else torch.device(f"{torch.device(device).type}:{i}")
+            )
+            expected = {
+                "a": [i + 1, i + 2, torch.tensor(i + 3, device=expected_device)],
+                "b": torch.tensor([[i + 1, i + 2, i + 3]], device=expected_device),
+                "c": {"abcd": i, "cdfg": torch.tensor(i, dtype=torch.uint8, device=expected_device)},
+            }
+            assert obj["a"] == expected["a"]
+            assert (obj["b"] == expected["b"]).all()
+            assert obj["c"] == expected["c"]
 
 
 def _test_distrib_all_gather_group(device):
-
     if idist.get_world_size() > 1:
-        ranks = [0, 1]
+        ranks = list(range(idist.get_world_size() - 1, 0, -1))  # [0, 1, 2, 3] -> [3, 2, 1]
         rank = idist.get_rank()
         bnd = idist.backend()
 
@@ -212,7 +230,10 @@ def _test_distrib_all_gather_group(device):
                 res = idist.all_gather(t, group=group)
         else:
             res = idist.all_gather(t, group=group)
-            assert torch.equal(res, torch.tensor(ranks, device=device))
+            if rank in ranks:
+                assert torch.equal(res, torch.tensor(ranks, device=device))
+            else:
+                assert res == t
 
         t = torch.tensor([rank], device=device)
         if bnd in ("horovod"):
@@ -220,9 +241,44 @@ def _test_distrib_all_gather_group(device):
                 res = idist.all_gather(t, group=ranks)
         else:
             res = idist.all_gather(t, group=ranks)
-            assert torch.equal(res, torch.tensor(ranks, device=device))
-
-        ranks = "abc"
+            if rank in ranks:
+                assert torch.equal(res, torch.tensor(ranks, device=device))
+            else:
+                assert res == t
+
+        t = {
+            "a": [rank + 1, rank + 2, torch.tensor(rank + 3, device=device)],
+            "b": torch.tensor([[rank + 1, rank + 2, rank + 3]], device=device),
+            "c": {"abcd": rank, "cdfg": torch.tensor(rank, dtype=torch.uint8, device=device)},
+        }
+        if bnd in ("xla-tpu"):
+            with pytest.raises(NotImplementedError, match=r"all_gather on object is not implemented for xla"):
+                res = idist.all_gather(t, group=ranks)
+        elif bnd in ("horovod"):
+            with pytest.raises(NotImplementedError, match=r"all_gather with group for horovod is not implemented"):
+                res = idist.all_gather(t, group=ranks)
+        else:
+            res = idist.all_gather(t, group=ranks)
+            if rank in ranks:
+                assert isinstance(res, list) and len(res) == len(ranks)
+                for i, obj in zip(ranks, res):
+                    assert isinstance(obj, dict)
+                    assert list(obj.keys()) == ["a", "b", "c"], obj
+                    expected_device = (
+                        device
+                        if torch.device(device).type == "cpu"
+                        else torch.device(f"{torch.device(device).type}:{i}")
+                    )
+                    expected = {
+                        "a": [i + 1, i + 2, torch.tensor(i + 3, device=expected_device)],
+                        "b": torch.tensor([[i + 1, i + 2, i + 3]], device=expected_device),
+                        "c": {"abcd": i, "cdfg": torch.tensor(i, dtype=torch.uint8, device=expected_device)},
+                    }
+                    assert obj["a"] == expected["a"], (obj, expected)
+                    assert (obj["b"] == expected["b"]).all(), (obj, expected)
+                    assert obj["c"] == expected["c"], (obj, expected)
+            else:
+                assert res == t
 
         if bnd in ("nccl", "gloo", "mpi"):
             with pytest.raises(ValueError, match=r"Argument group should be list of int or ProcessGroup"):
@@ -236,13 +292,11 @@ def _test_distrib_all_gather_group(device):
 
 
 def _test_distrib_broadcast(device):
-
     rank = idist.get_rank()
     ws = idist.get_world_size()
 
     def _test(data_src, data_others, safe_mode):
         for src in range(ws):
-
             data = data_src if rank == src else data_others
             res = idist.broadcast(data, src=src, safe_mode=safe_mode)
 
@@ -290,7 +344,6 @@ def _test(data_src, data_others, safe_mode):
 
 
 def _test_distrib_barrier(device):
-
     t = torch.tensor([idist.get_rank()], device=device, dtype=torch.float)
     true_res = sum([i for i in range(idist.get_world_size())])
 
@@ -303,12 +356,10 @@ def _test_distrib_barrier(device):
 
 
 def _test_distrib_new_group(device):
-
     if idist.get_world_size() > 1 and idist.backend() is not None:
         bnd = idist.backend()
         ranks = [0, 1]
         if idist.has_native_dist_support and bnd in ("nccl", "gloo", "mpi"):
-
             g1 = idist.new_group(ranks)
             g2 = dist.new_group(ranks)
 
@@ -316,7 +367,6 @@ def _test_distrib_new_group(device):
             if rank in ranks:
                 assert g1.rank() == g2.rank()
         elif idist.has_xla_support and bnd in ("xla-tpu"):
-
             assert idist.new_group(ranks) == [ranks]
         elif idist.has_hvd_support and bnd in ("horovod"):
             from horovod.common.process_sets import ProcessSet
diff --git a/tests/ignite/distributed/utils/test_horovod.py b/tests/ignite/distributed/utils/test_horovod.py
index fa6c77f81cc1..ead6ed4c330e 100644
--- a/tests/ignite/distributed/utils/test_horovod.py
+++ b/tests/ignite/distributed/utils/test_horovod.py
@@ -131,7 +131,6 @@ def _test_idist_methods_in_hvd_context(backend, device):
 @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_idist_methods_in_hvd_context(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_idist_methods_in_hvd_context, ("horovod", device), np=np)
@@ -141,7 +140,6 @@ def test_idist_methods_in_hvd_context(gloo_hvd_executor):
 @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_idist_all_reduce_hvd(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_distrib_all_reduce, (device,), np=np, do_init=True)
@@ -152,7 +150,6 @@ def test_idist_all_reduce_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_idist__model_methods_hvd(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_distrib__get_max_length, (device,), np=np, do_init=True)
@@ -162,7 +159,6 @@ def test_idist__model_methods_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_idist_all_gather_hvd(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_distrib_all_gather, (device,), np=np, do_init=True)
@@ -173,7 +169,6 @@ def test_idist_all_gather_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_idist_broadcast_hvd(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_distrib_broadcast, (device,), np=np, do_init=True)
@@ -183,7 +178,6 @@ def test_idist_broadcast_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_idist_barrier_hvd(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_distrib_barrier, (device,), np=np, do_init=True)
@@ -193,7 +187,6 @@ def test_idist_barrier_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_idist_new_group_hvd(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     np = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     gloo_hvd_executor(_test_distrib_new_group, (device,), np=np, do_init=True)
diff --git a/tests/ignite/distributed/utils/test_native.py b/tests/ignite/distributed/utils/test_native.py
index b1d885da4e40..fda3e1126ccb 100644
--- a/tests/ignite/distributed/utils/test_native.py
+++ b/tests/ignite/distributed/utils/test_native.py
@@ -3,6 +3,7 @@
 import pytest
 import torch
 import torch.distributed as dist
+from packaging.version import Version
 
 import ignite.distributed as idist
 from ignite.distributed.utils import has_native_dist_support
@@ -37,7 +38,6 @@ def _test_native_distrib_single_node_launch_tool(backend, device, local_rank, wo
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
 def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirname, local_rank, world_size):
-
     from datetime import timedelta
 
     timeout = timedelta(seconds=20)
@@ -56,7 +56,6 @@ def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirn
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
 def test_native_distrib_single_node_launch_tool_nccl(init_method, get_fixed_dirname, local_rank, world_size):
-
     if init_method == "FILE":
         init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_nccl')}/shared"
 
@@ -81,7 +80,6 @@ def _test_native_distrib_single_node_spawn(init_method, backend, device, **kwarg
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 @pytest.mark.parametrize("init_method", [None, "tcp://0.0.0.0:22334", "FILE"])
 def test_native_distrib_single_node_spawn_gloo(init_method, dirname):
-
     from datetime import timedelta
 
     timeout = timedelta(seconds=20)
@@ -190,7 +188,6 @@ def _test_idist_methods_in_native_context_set_local_rank(backend, device, local_
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 def test_idist_methods_in_native_gloo_context_set_local_rank(distributed_context_single_node_gloo):
-
     local_rank = distributed_context_single_node_gloo["local_rank"]
     device = idist.device()
     _test_idist_methods_in_native_context_set_local_rank("gloo", device, local_rank)
@@ -209,7 +206,6 @@ def test_idist_methods_in_native_nccl_context_set_local_rank(distributed_context
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_idist__model_methods_nccl(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib__get_max_length(device)
 
@@ -217,7 +213,6 @@ def test_idist__model_methods_nccl(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 def test_idist__model_methods_gloo(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib__get_max_length(device)
 
@@ -226,7 +221,6 @@ def test_idist__model_methods_gloo(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_idist_all_reduce_nccl(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_all_reduce(device)
     _test_distrib_all_reduce_group(device)
@@ -235,7 +229,6 @@ def test_idist_all_reduce_nccl(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 def test_idist_all_reduce_gloo(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_all_reduce(device)
     _test_distrib_all_reduce_group(device)
@@ -244,8 +237,8 @@ def test_idist_all_reduce_gloo(distributed_context_single_node_gloo):
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
+@pytest.mark.skipif(Version(torch.__version__) < Version("1.7.0"), reason="dist.all_gather_object is not implemented")
 def test_idist_all_gather_nccl(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_all_gather(device)
     _test_distrib_all_gather_group(device)
@@ -253,8 +246,8 @@ def test_idist_all_gather_nccl(distributed_context_single_node_nccl):
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
+@pytest.mark.skipif(Version(torch.__version__) < Version("1.7.0"), reason="dist.all_gather_object is not implemented")
 def test_idist_all_gather_gloo(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_all_gather(device)
     _test_distrib_all_gather_group(device)
@@ -264,7 +257,6 @@ def test_idist_all_gather_gloo(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_idist_broadcast_nccl(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_broadcast(device)
 
@@ -272,7 +264,6 @@ def test_idist_broadcast_nccl(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 def test_idist_broadcast_gloo(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_broadcast(device)
 
@@ -281,7 +272,6 @@ def test_idist_broadcast_gloo(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_idist_barrier_nccl(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_barrier(device)
 
@@ -289,7 +279,6 @@ def test_idist_barrier_nccl(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 def test_idist_barrier_gloo(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_barrier(device)
 
@@ -356,7 +345,6 @@ def test_idist_methods_overhead_nccl(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 def test_idist_one_rank_only_gloo(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_one_rank_only(device=device)
     _test_distrib_one_rank_only_with_engine(device=device)
@@ -366,7 +354,48 @@ def test_idist_one_rank_only_gloo(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_idist_one_rank_only_nccl(local_rank, distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_one_rank_only(device=device)
     _test_distrib_one_rank_only_with_engine(device=device)
+
+
+@pytest.mark.distributed
+@pytest.mark.parametrize("rank", range(int(os.environ.get("WORLD_SIZE", 1))))
+@pytest.mark.parametrize("local", [True, False])
+def test_one_rank_first(distributed, get_rank_zero_dirname, rank, local):
+    """Only the process whose (local) rank equals ``rank`` should find the marker file missing."""
+
+    def get_ds(file_path):
+        # Create the marker file if it is absent and report whether it already existed.
+        current_rank = idist.get_local_rank() if local else idist.get_rank()
+        if file_path.exists():
+            return f"{current_rank} readed"
+        with open(file_path, "w") as f:
+            f.write("readed")
+        return f"{current_rank} not readed"
+
+    folder = get_rank_zero_dirname()
+    file_path = folder / "res.txt"
+
+    with idist.one_rank_first(rank, local=local):
+        status = get_ds(file_path)
+
+    output = idist.all_gather(status)
+
+    # With local=True the per-node pattern is repeated once per node.
+    n_procs = idist.get_nproc_per_node() if local else idist.get_world_size()
+    n_repeats = idist.get_nnodes() if local else 1
+    expected = [f"{r} not readed" if r == rank else f"{r} readed" for r in range(n_procs)] * n_repeats
+
+    # Compared as sets, so order and multiplicity are ignored; both lists are shown if the assert fails.
+    assert set(expected) == set(output), (expected, output)
+
+
+@pytest.mark.distributed
+def test_one_rank_first_asserts():
+    """An out-of-range rank passed to ``idist.one_rank_first`` must raise ``ValueError``."""
+    bad_rank = 100
+    max_rank = idist.get_world_size() - 1
+    expected_msg = f"rank should be between 0 and {max_rank}, but given {bad_rank}"
+    with pytest.raises(ValueError, match=expected_msg), idist.one_rank_first(bad_rank):
+        pass
diff --git a/tests/ignite/distributed/utils/test_serial.py b/tests/ignite/distributed/utils/test_serial.py
index afae86b78f39..1fee2bb8ce1d 100644
--- a/tests/ignite/distributed/utils/test_serial.py
+++ b/tests/ignite/distributed/utils/test_serial.py
@@ -14,7 +14,6 @@
 
 
 def test_no_distrib(capsys):
-
     assert idist.backend() is None
     if torch.cuda.is_available():
         assert idist.device().type == "cuda"
diff --git a/tests/ignite/distributed/utils/test_xla.py b/tests/ignite/distributed/utils/test_xla.py
index 281e1ba50d81..bb109eacdea9 100644
--- a/tests/ignite/distributed/utils/test_xla.py
+++ b/tests/ignite/distributed/utils/test_xla.py
@@ -148,7 +148,6 @@ def test_idist_new_group_xla():
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_idist_all_gather_xla():
-
     device = idist.device()
     _test_distrib_all_gather(device)
     _test_distrib_all_gather_group(device)
@@ -172,7 +171,6 @@ def test_idist_all_gather_xla_in_child_proc(xmp_executor):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_idist_broadcast_xla():
-
     device = idist.device()
     _test_distrib_broadcast(device)
 
@@ -194,7 +192,6 @@ def test_idist_broadcast_xla_in_child_proc(xmp_executor):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_idist_barrier_xla():
-
     device = idist.device()
     _test_distrib_barrier(device)
 
@@ -216,7 +213,6 @@ def test_idist_barrier_xla_in_child_proc(xmp_executor):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_idist_one_rank_only_xla():
-
     device = idist.device()
     _test_distrib_one_rank_only(device=device)
     _test_distrib_one_rank_only_with_engine(device=device)
diff --git a/tests/ignite/engine/__init__.py b/tests/ignite/engine/__init__.py
index d863b60b67ef..98059e98518c 100644
--- a/tests/ignite/engine/__init__.py
+++ b/tests/ignite/engine/__init__.py
@@ -5,7 +5,6 @@
 except ImportError:
 
     class IterableDataset:
-
         pass
 
 
diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py
index fa9681df81e0..8d001a8d2cc1 100644
--- a/tests/ignite/engine/test_create_supervised.py
+++ b/tests/ignite/engine/test_create_supervised.py
@@ -31,11 +31,13 @@ def __init__(self, output_as_list=False):
         self.output_as_list = output_as_list
         self.fc = torch.nn.Linear(1, 1, bias=False)
 
-    def forward(self, x):
+    def forward(self, x, bias=None):
+        """Return ``fc(x) + bias``, as a 2-tuple when ``output_as_list``; ``bias=None`` means 0.0."""
+        bias = 0.0 if bias is None else bias
         if self.output_as_list:
-            return self.fc(x), self.fc(x)
+            return self.fc(x) + bias, self.fc(x) + bias
 
-        return self.fc(x)
+        return self.fc(x) + bias
 
 
 def _default_create_supervised_trainer(
@@ -46,6 +48,7 @@ def _default_create_supervised_trainer(
     amp_mode: str = None,
     scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
     with_model_transform: bool = False,
+    with_model_fn: bool = False,
 ):
     if with_model_transform:
 
@@ -65,8 +68,8 @@ def get_first_element(output):
     optimizer = SGD(model.parameters(), 0.1)
 
     if trace:
-        example_input = torch.randn(1)
-        model = torch.jit.trace(model, example_input)
+        example_inputs = (torch.randn(1), torch.randn(1)) if with_model_fn else torch.randn(1)
+        model = torch.jit.trace(model, example_inputs)
 
     if amp_mode == "apex" and model_device == trainer_device == "cuda":
         from apex import amp
@@ -83,6 +86,9 @@ def get_first_element(output):
         scaler=scaler,
         gradient_accumulation_steps=gradient_accumulation_steps,
         model_transform=model_transform if model_transform is not None else lambda x: x,
+        model_fn=(lambda model, x: model(x, torch.tensor([0.01], device=model_device)))
+        if with_model_fn
+        else (lambda model, x: model(x)),
     )
     assert model.fc.weight.data[0, 0].item() == approx(0.0)
     return trainer, model
@@ -96,6 +102,7 @@ def _test_create_supervised_trainer(
     amp_mode: str = None,
     scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
     with_model_transform: bool = False,
+    with_model_fn: bool = False,
 ):
     trainer, model = _default_create_supervised_trainer(
         gradient_accumulation_steps=gradient_accumulation_steps,
@@ -105,10 +112,13 @@ def _test_create_supervised_trainer(
         amp_mode=amp_mode,
         scaler=scaler,
         with_model_transform=with_model_transform,
+        with_model_fn=with_model_fn,
     )
 
     x = torch.tensor([[0.01], [0.02], [0.03], [0.04], [0.05]])
     y = torch.tensor([[0.015], [0.025], [0.035], [0.045], [0.055]])
+    if with_model_fn:
+        y += 0.01
     data = [(_x, _y) for _x, _y in zip(x, y)]
 
     theta = [0.0]
@@ -120,12 +130,14 @@ def _():
         assert model.fc.weight.grad != 0
         _x, _y = trainer.state.batch
         _x, _y = _x.to(model_device), _y.to(model_device)
-        accumulation[0] += 0.2 * _x.item() * (theta[0] * _x.item() - _y.item())
+        bias = 0.01 if with_model_fn else 0.0
+        accumulation[0] += 0.2 * _x.item() * (theta[0] * _x.item() - (_y.item() - bias))
         # value of loss should not be accumulated
+        _y_pred = model(_x, torch.tensor([bias], device=model_device)) if with_model_fn else model(_x)
         if with_model_transform:
-            loss[0] = mse_loss(model(_x)[0], _y).item()
-        else:
-            loss[0] = mse_loss(model(_x), _y).item()
+            _y_pred = _y_pred[0]
+
+        loss[0] = mse_loss(_y_pred, _y).item()
 
     @trainer.on(Events.ITERATION_COMPLETED(every=gradient_accumulation_steps))
     def _():
@@ -135,7 +147,6 @@ def _():
         accumulation[0] = loss[0] = 0.0
 
     if model_device == trainer_device or ((model_device == "cpu") ^ (trainer_device == "cpu")):
-
         state = trainer.run(data)
 
         if amp_mode == "amp":
@@ -154,7 +165,6 @@ def _():
 
 @pytest.mark.skipif(Version(torch.__version__) < Version("1.6.0"), reason="Skip if < 1.6.0")
 def test_create_supervised_training_scalar_assignment():
-
     with mock.patch("ignite.engine._check_arg") as check_arg_mock:
         check_arg_mock.return_value = None, torch.cuda.amp.GradScaler(enabled=False)
         trainer, _ = _default_create_supervised_trainer(model_device="cpu", trainer_device="cpu", scaler=True)
@@ -173,7 +183,6 @@ def _test_create_mocked_supervised_trainer(
         with mock.patch("ignite.engine.supervised_training_step_apex") as training_step_apex_mock:
             with mock.patch("ignite.engine.supervised_training_step_tpu") as training_step_tpu_mock:
                 with mock.patch("ignite.engine.supervised_training_step") as training_step_mock:
-
                     trainer, _ = _default_create_supervised_trainer(
                         model_device=model_device,
                         trainer_device=trainer_device,
@@ -221,6 +230,7 @@ def _default_create_supervised_evaluator(
     trace: bool = False,
     amp_mode: str = None,
     with_model_transform: bool = False,
+    with_model_fn: bool = False,
 ):
     if with_model_transform:
 
@@ -239,14 +249,17 @@ def get_first_element(output):
     model.fc.weight.data.zero_()
 
     if trace:
-        example_input = torch.randn(1, 1)
-        model = torch.jit.trace(model, example_input)
+        example_inputs = (torch.randn(1), torch.randn(1)) if with_model_fn else torch.randn(1)
+        model = torch.jit.trace(model, example_inputs)
 
     evaluator = create_supervised_evaluator(
         model,
         device=evaluator_device,
         amp_mode=amp_mode,
         model_transform=model_transform if model_transform is not None else lambda x: x,
+        model_fn=(lambda model, x: model(x, torch.tensor([0.01], device=model_device)))
+        if with_model_fn
+        else (lambda model, x: model(x)),
     )
 
     assert model.fc.weight.data[0, 0].item() == approx(0.0)
@@ -260,6 +273,7 @@ def _test_create_supervised_evaluator(
     trace: bool = False,
     amp_mode: str = None,
     with_model_transform: bool = False,
+    with_model_fn: bool = False,
 ):
     model, evaluator = _default_create_supervised_evaluator(
         model_device=model_device,
@@ -267,16 +281,21 @@ def _test_create_supervised_evaluator(
         trace=trace,
         amp_mode=amp_mode,
         with_model_transform=with_model_transform,
+        with_model_fn=with_model_fn,
     )
     x = torch.tensor([[1.0], [2.0]])
     y = torch.tensor([[3.0], [5.0]])
+    if with_model_fn:
+        y += 0.01
     data = [(x, y)]
 
     if model_device == evaluator_device or ((model_device == "cpu") ^ (evaluator_device == "cpu")):
         state = evaluator.run(data)
 
         y_pred, y = state.output
-
+        if with_model_fn:
+            y_pred -= 0.01
+            y -= 0.01
         assert y_pred[0, 0].item() == approx(0.0)
         assert y_pred[1, 0].item() == approx(0.0)
         assert y[0, 0].item() == approx(3.0)
@@ -325,7 +344,6 @@ def _test_create_evaluation_step_amp(
     trace: bool = False,
     amp_mode: str = None,
 ):
-
     output_transform_mock = MagicMock()
     model = DummyModel()
 
@@ -396,6 +414,7 @@ def test_create_supervised_trainer(trainer_device, trace):
     _test_create_supervised_trainer(gradient_accumulation_steps=1, trainer_device=trainer_device, trace=trace)
     _test_create_supervised_trainer(gradient_accumulation_steps=3, trainer_device=trainer_device, trace=trace)
     _test_create_supervised_trainer(with_model_transform=True, trainer_device=trainer_device, trace=trace)
+    _test_create_supervised_trainer(with_model_fn=True, trainer_device=trainer_device, trace=trace)
     _test_create_mocked_supervised_trainer(trainer_device=trainer_device, trace=trace)
 
 
@@ -580,6 +599,8 @@ def test_create_supervised_trainer_on_cuda_with_model_on_cpu():
 
 def test_create_supervised_evaluator():
     _test_create_supervised_evaluator()
+    _test_create_supervised_evaluator(with_model_transform=True)
+    _test_create_supervised_evaluator(with_model_fn=True)
     _test_mocked_supervised_evaluator()
 
     # older versions didn't have the autocast method so we skip the test for older builds
diff --git a/tests/ignite/engine/test_custom_events.py b/tests/ignite/engine/test_custom_events.py
index 3a19904a45f9..c4400396bf19 100644
--- a/tests/ignite/engine/test_custom_events.py
+++ b/tests/ignite/engine/test_custom_events.py
@@ -6,7 +6,24 @@
 
 import ignite.distributed as idist
 from ignite.engine import Engine, Events
-from ignite.engine.events import CallableEventWithFilter, EventEnum, EventsList
+from ignite.engine.events import CallableEvents, CallableEventWithFilter, EventEnum, EventsList
+
+
+def test_deprecated_callable_events_class():
+    engine = Engine(lambda engine, batch: 0)
+
+    with pytest.warns(DeprecationWarning, match=r"Class ignite\.engine\.events\.CallableEvents is deprecated"):
+
+        class CustomEvents(CallableEvents, Enum):
+            TEST_EVENT = "test_event"
+
+            def __new__(cls, value: str) -> "CallableEvents":
+                obj = CallableEvents.__new__(cls)
+                obj._value_ = value
+                return obj
+
+        with pytest.raises(TypeError, match=r"Value at \d of event_names should be a str or EventEnum"):
+            engine.register_events(*CustomEvents)
 
 
 def test_custom_events():
@@ -234,7 +251,6 @@ def ef(e, i):
     ],
 )
 def test_callable_events(event):
-
     assert isinstance(event.value, str)
 
     def foo(engine, _):
@@ -296,7 +312,6 @@ def bar(e):
 
 
 def test_remove_event_handler_on_callable_events():
-
     engine = Engine(lambda e, b: 1)
 
     def foo(e):
@@ -324,11 +339,9 @@ def bar(e):
 
 
 def _test_every_event_filter_with_engine(device="cpu"):
-
     data = torch.rand(100, 4, device=device)
 
     def _test(event_name, event_attr, every, true_num_calls):
-
         engine = Engine(lambda e, b: b)
 
         counter = [0]
@@ -382,7 +395,6 @@ def test_every_event_filter_with_engine():
     ],
 )
 def test_before_event_filter_with_engine(event_name, event_attr, before, expect_calls):
-
     data = range(100)
 
     engine = Engine(lambda e, b: 1)
@@ -410,7 +422,6 @@ def _before_event():
     ],
 )
 def test_after_event_filter_with_engine(event_name, event_attr, after, expect_calls):
-
     data = range(100)
 
     engine = Engine(lambda e, b: 1)
@@ -431,7 +442,6 @@ def _after_event():
     [(Events.ITERATION_STARTED, "iteration", 300, 100, 199), (Events.EPOCH_COMPLETED, "epoch", 4, 1, 2)],
 )
 def test_before_and_after_event_filter_with_engine(event_name, event_attr, before, after, expect_calls):
-
     data = range(100)
 
     engine = Engine(lambda e, b: 1)
@@ -452,7 +462,6 @@ def _before_and_after_event():
     [(Events.ITERATION_STARTED, "iteration", 5, 25, 8, 4), (Events.EPOCH_COMPLETED, "epoch", 2, 5, 1, 2)],
 )
 def test_every_before_and_after_event_filter_with_engine(event_name, event_attr, every, before, after, expect_calls):
-
     data = range(100)
 
     engine = Engine(lambda e, b: 1)
@@ -484,7 +493,6 @@ def _every_before_and_after_event():
     ],
 )
 def test_once_event_filter(event_name, event_attr, once, expect_calls):
-
     data = list(range(100))
 
     engine = Engine(lambda e, b: b)
@@ -508,7 +516,6 @@ def assert_(engine):
 
 
 def test_custom_event_filter_with_engine():
-
     special_events = [1, 2, 5, 7, 17, 20]
 
     def custom_event_filter(engine, event):
@@ -517,7 +524,6 @@ def custom_event_filter(engine, event):
         return False
 
     def _test(event_name, event_attr, true_num_calls):
-
         engine = Engine(lambda e, b: b)
 
         num_calls = [0]
@@ -539,7 +545,6 @@ def assert_on_special_event(engine):
 
 
 def test_callable_event_bad_behaviour():
-
     special_events = [1, 2, 5, 7, 17, 20]
 
     def custom_event_filter(engine, event):
@@ -660,7 +665,6 @@ def test_every_event_filter_with_engine_with_dataloader():
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_every_event_filter_with_engine(device)
     _test_every_event_filter_with_engine_with_dataloader(device)
@@ -670,14 +674,12 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_every_event_filter_with_engine(device)
     _test_every_event_filter_with_engine_with_dataloader(device)
 
 
 def test_event_list():
-
     e1 = Events.ITERATION_STARTED(once=1)
     e2 = Events.ITERATION_STARTED(every=3)
     e3 = Events.COMPLETED
@@ -693,7 +695,6 @@ def test_event_list():
 
 def test_list_of_events():
     def _test(event_list, true_iterations):
-
         engine = Engine(lambda e, b: b)
 
         iterations = []
diff --git a/tests/ignite/engine/test_deterministic.py b/tests/ignite/engine/test_deterministic.py
index 36a4a4371673..b2f62dfa111e 100644
--- a/tests/ignite/engine/test_deterministic.py
+++ b/tests/ignite/engine/test_deterministic.py
@@ -95,7 +95,6 @@ def test_reproducible_batch_sampler_wrong_input():
 
 
 def test_reproducible_batch_sampler():
-
     data = list(range(100))
     dataloader = DataLoader(data, batch_size=12, num_workers=0, shuffle=True, drop_last=True)
 
@@ -125,7 +124,6 @@ def test_reproducible_batch_sampler():
 
 
 def _test_keep_random_state(with_numpy):
-
     manual_seed(54)
     true_values = []
     for _ in range(5):
@@ -175,7 +173,6 @@ def test_keep_random_state_without_numpy():
 
 def test_strict_resume_from_iter():
     def _test(epoch_length=None):
-
         max_epochs = 5
         num_iters = 21
         torch.manual_seed(0)
@@ -244,7 +241,6 @@ def update_fn(_, batch):
 
 def _test_resume_random_dataloader_from_epoch(device, _setup_sampler, sampler_type=None):
     def _test(epoch_length=None):
-
         max_epochs = 5
         total_batch_size = 4
         num_iters = 21
@@ -255,7 +251,6 @@ def _test(epoch_length=None):
             epoch_length = num_iters
 
         for resume_epoch in range(1, max_epochs, 2):
-
             for num_workers in [0, 2]:
                 sampler, batch_size = _setup_sampler(sampler_type, num_iters, total_batch_size)
 
@@ -361,9 +356,7 @@ def _test(epoch_length=None):
             epoch_length = num_iters
 
         for resume_iteration in range(2, min(num_iters * max_epochs, epoch_length * max_epochs), 13):
-
             for num_workers in [0, 2]:
-
                 sampler, batch_size = _setup_sampler(sampler_type, num_iters, total_batch_size)
                 orig_dataloader = DataLoader(
                     data,
@@ -513,7 +506,6 @@ def infinite_data_iterator():
             epoch_length = num_iters
 
         for resume_iteration in range(1, min(num_iters * max_epochs, epoch_length * max_epochs), 7):
-
             seen_batchs = []
 
             def update_fn(_, batch):
@@ -557,7 +549,6 @@ def test_resume_random_data_iterator_from_iter():
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed")
     _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed")
@@ -566,7 +557,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed")
     _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed")
@@ -577,7 +567,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed")
     _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed")
@@ -587,14 +576,12 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed")
     _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed")
 
 
 def test_concepts_snippet_resume():
-
     # Commented imports required in the snippet
     # import torch
     # from torch.utils.data import DataLoader
@@ -659,7 +646,6 @@ def user_handler(_):
 def _test_gradients_on_resume(
     dirname, device, with_dropout=True, with_dataaugs=True, data_size=24, batch_size=4, save_iter=None, save_epoch=None
 ):
-
     debug = False
 
     def random_train_data_loader(size):
diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py
index 994eb49f72bb..c09d13d88ee3 100644
--- a/tests/ignite/engine/test_engine.py
+++ b/tests/ignite/engine/test_engine.py
@@ -437,7 +437,6 @@ def update_fn(_, batch):
         _test(data)
 
     def test_state_repr(self):
-
         data = [0, 1, 2, 3, 4, 5]
         max_epochs = 1
         metrics = {"accuracy": Mock()}
@@ -452,7 +451,6 @@ def test_state_repr(self):
         assert "batch" in s
 
     def test_alter_batch(self):
-
         small_shape = (1, 2, 2)
         large_shape = (1, 3, 3)
 
@@ -520,6 +518,9 @@ def test_run_asserts(self):
         with pytest.raises(ValueError, match=r"Input data has zero size. Please provide non-empty data"):
             engine.run([])
 
+        with pytest.warns(UserWarning, match="Argument seed is deprecated"):
+            engine.run([0, 1, 2, 3, 4], seed=1234)
+
     def test_state_get_event_attrib_value(self):
         state = State()
         state.iteration = 10
@@ -1026,47 +1027,6 @@ def switch_dataloader():
 
         trainer.run(data1, max_epochs=10)
 
-    def test_run_with_max_iters(self):
-        max_iters = 8
-        engine = Engine(lambda e, b: 1)
-        engine.run([0] * 20, max_iters=max_iters)
-        assert engine.state.iteration == max_iters
-        assert engine.state.max_iters == max_iters
-
-    def test_run_with_max_iters_greater_than_epoch_length(self):
-        max_iters = 73
-        engine = Engine(lambda e, b: 1)
-        engine.run([0] * 20, max_iters=max_iters)
-        assert engine.state.iteration == max_iters
-
-    def test_run_with_invalid_max_iters_and_max_epoch(self):
-        max_iters = 12
-        max_epochs = 2
-        engine = Engine(lambda e, b: 1)
-        with pytest.raises(
-            ValueError,
-            match=r"Arguments max_iters and max_epochs are mutually exclusive."
-            "Please provide only max_epochs or max_iters.",
-        ):
-            engine.run([0] * 20, max_iters=max_iters, max_epochs=max_epochs)
-
-    def test_epoch_events_fired_max_iters(self):
-        max_iters = 32
-        engine = Engine(lambda e, b: 1)
-
-        @engine.on(Events.EPOCH_COMPLETED)
-        def fired_event(engine):
-            assert engine.state.iteration % engine.state.epoch_length == 0
-
-        engine.run([0] * 10, max_iters=max_iters)
-
-    def test_is_done_with_max_iters(self):
-        state = State(iteration=100, epoch=1, max_epochs=3, epoch_length=100, max_iters=250)
-        assert not Engine._is_done(state)
-
-        state = State(iteration=250, epoch=1, max_epochs=3, epoch_length=100, max_iters=250)
-        assert Engine._is_done(state)
-
     @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
     def test_batch_is_released_before_new_one_is_loaded_on_cuda(self):
         torch.cuda.empty_cache()
diff --git a/tests/ignite/engine/test_engine_state_dict.py b/tests/ignite/engine/test_engine_state_dict.py
index ce8e5aba0d48..4ccfb7ea7720 100644
--- a/tests/ignite/engine/test_engine_state_dict.py
+++ b/tests/ignite/engine/test_engine_state_dict.py
@@ -131,7 +131,6 @@ def test_load_state_dict_integration():
 
 
 def test_load_state_dict_with_params_overriding_integration():
-
     state_dict = {"max_epochs": 100, "epoch_length": 120, "epoch": 5}
     data = range(120)
 
@@ -205,7 +204,6 @@ def save_engine(_):
 
 def test_epoch_length():
     def _test(data, max_epochs, num_iters):
-
         batch_checker = BatchChecker(data)
 
         def update_fn(_, batch):
@@ -219,7 +217,6 @@ def update_fn(_, batch):
         assert engine.state.epoch == max_epochs
 
     def _test_as_iter(data, max_epochs, num_iters):
-
         batch_checker = BatchChecker(data)
 
         def update_fn(_, batch):
diff --git a/tests/ignite/engine/test_event_handlers.py b/tests/ignite/engine/test_event_handlers.py
index d3f4625604e4..1d0d71646a87 100644
--- a/tests/ignite/engine/test_event_handlers.py
+++ b/tests/ignite/engine/test_event_handlers.py
@@ -139,7 +139,6 @@ def test_adding_multiple_event_handlers():
     ],
 )
 def test_event_removable_handle(event1, event2):
-
     # Removable handle removes event from engine.
     engine = Engine(lambda e, b: None)
     handler = create_autospec(spec=lambda x: None)
@@ -227,7 +226,6 @@ def _handler(_):
 
 
 def test_events_list_removable_handle():
-
     # Removable handle removes event from engine.
     engine = DummyEngine()
     handler = create_autospec(spec=lambda x: None)
@@ -495,7 +493,6 @@ def __call__(self, engine, e):
 
 
 def test_event_handlers_with_decoration():
-
     engine = Engine(lambda e, b: b)
 
     def decorated(fun):
diff --git a/tests/ignite/handlers/test_checkpoint.py b/tests/ignite/handlers/test_checkpoint.py
index bf48e07399d2..05f2f9fc8cce 100644
--- a/tests/ignite/handlers/test_checkpoint.py
+++ b/tests/ignite/handlers/test_checkpoint.py
@@ -45,13 +45,15 @@ def forward(self, x):
 
 
 def test_checkpoint_wrong_input():
-
     with pytest.raises(TypeError, match=r"Argument `to_save` should be a dictionary"):
         Checkpoint(12, lambda x: x, "prefix")
 
     with pytest.raises(TypeError, match=r"Argument `to_save` should be a dictionary"):
         Checkpoint([12], lambda x: x, "prefix")
 
+    with pytest.raises(TypeError, match=r"should have `state_dict`"):
+        Checkpoint({"model": {"abc": 12}}, lambda x: x, "prefix")
+
     to_save = {"model": model}
 
     with pytest.raises(
@@ -63,25 +65,34 @@ def test_checkpoint_wrong_input():
     with pytest.raises(TypeError, match=r"global_step_transform should be a function."):
         Checkpoint(to_save, lambda x: x, score_function=lambda e: 123, score_name="acc", global_step_transform=123)
 
+    with pytest.warns(UserWarning, match=r"Argument archived is deprecated"):
+        Checkpoint(to_save, lambda x: x, score_function=lambda e: 123, score_name="acc", archived=True)
+
     with pytest.raises(ValueError, match=r"Cannot have key 'checkpointer' if `include_self` is True"):
         Checkpoint({"checkpointer": model}, lambda x: x, include_self=True)
 
     class ImmutableMapping(Mapping):
+        def __init__(self, d):
+            self._dict = d
+
         def __getitem__(self, key):
-            return to_save[key]
+            return self._dict[key]
 
         def __iter__(self):
-            return iter(to_save)
+            return iter(self._dict)
 
         def __len__(self):
-            return len(to_save)
+            return len(self._dict)
 
     with pytest.raises(TypeError, match="If `include_self` is True, then `to_save` must be mutable"):
-        Checkpoint(ImmutableMapping(), lambda x: x, include_self=True)
+        Checkpoint(ImmutableMapping(to_save), lambda x: x, include_self=True)
 
+    checkpoint = Checkpoint(to_save, lambda x: x)
+    with pytest.raises(AttributeError, match="Checkpoint's `save_handler` should be of type `DiskSaver`"):
+        checkpoint.reload_objects(to_save)
 
-def test_save_handler_as_str(dirname):
 
+def test_save_handler_as_str(dirname):
     to_save = {"model": model}
 
     checkpointer = Checkpoint(to_save, save_handler=dirname)
@@ -89,7 +100,6 @@ def test_save_handler_as_str(dirname):
 
 
 def test_checkpoint_score_function_wrong_output():
-
     to_save = {"model": model}
 
     checkpointer = Checkpoint(to_save, lambda x: x, score_function=lambda e: {"1": 1}, score_name="acc")
@@ -160,7 +170,7 @@ def test_checkpoint_include_self_state_dict(to_save, obj, name):
     assert save_handler.call_count == 1
 
     fname = f"{name}_0.pt"
-    obj["checkpointer"] = OrderedDict([("saved", [(0, fname)])])
+    obj["checkpointer"] = OrderedDict([("_saved", [(0, fname)])])
 
     metadata = {"basename": name, "score_name": None, "priority": 0}
     save_handler.assert_called_with(obj, fname, metadata)
@@ -180,7 +190,7 @@ def test_checkpoint_include_self_state_dict(to_save, obj, name):
     save_handler.remove.assert_called_with(f"{name}_0.pt")
 
     fname = f"{name}_1234.pt"
-    obj["checkpointer"] = OrderedDict([("saved", [(1234, fname)])])
+    obj["checkpointer"] = OrderedDict([("_saved", [(1234, fname)])])
 
     save_handler.assert_called_with(obj, fname, metadata)
     assert save_handler.remove.call_count == 1
@@ -188,7 +198,6 @@ def test_checkpoint_include_self_state_dict(to_save, obj, name):
 
 
 def test_checkpoint_with_dp():
-
     dp_model = nn.DataParallel(model)
     to_save = {"model": dp_model}
 
@@ -550,12 +559,21 @@ def test_model_checkpoint_args_validation(dirname):
     with pytest.raises(ValueError, match=r"with extension '.pt' are already present "):
         ModelCheckpoint(nonempty, _PREFIX)
 
+    with pytest.raises(ValueError, match=r"Argument save_interval is deprecated and should be None"):
+        ModelCheckpoint(existing, _PREFIX, save_interval=42)
+
     with pytest.raises(ValueError, match=r"Directory path '\S+' is not found"):
         ModelCheckpoint(dirname / "non_existing_dir", _PREFIX, create_dir=False)
 
+    with pytest.raises(ValueError, match=r"Argument save_as_state_dict is deprecated and should be True"):
+        ModelCheckpoint(existing, _PREFIX, create_dir=False, save_as_state_dict=False)
+
     with pytest.raises(TypeError, match=r"global_step_transform should be a function"):
         ModelCheckpoint(existing, _PREFIX, create_dir=False, global_step_transform=1234)
 
+    with pytest.warns(UserWarning, match=r"Argument archived is deprecated"):
+        ModelCheckpoint(existing, _PREFIX, create_dir=False, archived=True)
+
     h = ModelCheckpoint(dirname, _PREFIX, create_dir=False)
     assert h.last_checkpoint is None
     with pytest.raises(RuntimeError, match=r"No objects to checkpoint found."):
@@ -584,7 +602,6 @@ def test_model_checkpoint_simple_recovery(dirname):
 
 @pytest.mark.parametrize("ext, require_empty", [(".txt", True), (".pt", False)])
 def test_model_checkpoint_simple_recovery_from_existing_non_empty(ext, require_empty, dirname):
-
     previous_fname = dirname / f"{_PREFIX}_obj_{1}{ext}"
     with open(previous_fname, "w") as f:
         f.write("test")
@@ -624,13 +641,11 @@ def test_model_checkpoint_invalid_save_handler(dirname):
 
 
 def test_disk_saver_atomic(dirname):
-
     model = DummyModel()
     to_save_serializable = {"model": model}
     to_save_non_serializable = {"model": lambda x: x}
 
     def _test_existence(atomic, _to_save, expected):
-
         saver = DiskSaver(dirname, atomic=atomic, create_dir=False, require_empty=False)
         fname = "test.pt"
         try:
@@ -685,7 +700,6 @@ def test_disk_saver_unknown_keyword(dirname):
 
 
 def test_last_k(dirname):
-
     h = ModelCheckpoint(dirname, _PREFIX, create_dir=False, n_saved=2)
     engine = Engine(lambda e, b: None)
     engine.state = State(epoch=0, iteration=0)
@@ -704,7 +718,6 @@ def test_last_k(dirname):
 
 
 def test_disabled_n_saved(dirname):
-
     h = ModelCheckpoint(dirname, _PREFIX, create_dir=False, n_saved=None)
     engine = Engine(lambda e, b: None)
     engine.state = State(epoch=0, iteration=0)
@@ -857,7 +870,6 @@ def test_valid_state_dict_save(dirname):
 
 
 def _test_save_model_optimizer_lr_scheduler_with_state_dict(device, dirname, just_on_zero_rank=False):
-
     torch.manual_seed(23)
 
     model = DummyModel().to(device)
@@ -873,7 +885,6 @@ def update_fn(engine, batch):
         # Probably related to https://github.com/pytorch/xla/issues/2576
         # loss = y.pow(2.0).sum()
         loss = y.sum()
-        print(loss.device, y.device, x.device)
         loss.backward()
         if idist.has_xla_support:
             import torch_xla.core.xla_model as xm
@@ -946,7 +957,6 @@ def _test_save_model_optimizer_lr_scheduler_with_validation(device, dirname, jus
     torch.manual_seed(23)
 
     def _build_objects(acc_list):
-
         model = DummyModel().to(device)
         optim = torch.optim.SGD(model.parameters(), lr=0.1)
         lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)
@@ -1075,17 +1085,19 @@ def test_save_model_optimizer_lr_scheduler_with_validation(dirname):
 
 
 def test_checkpoint_load_objects():
-
     with pytest.raises(TypeError, match=r"Argument checkpoint should be a string or a dictionary"):
         Checkpoint.load_objects({}, [])
 
     with pytest.raises(TypeError, match=r"should have `load_state_dict` method"):
         Checkpoint.load_objects({"a": None}, {"a": None})
 
+    with pytest.raises(TypeError, match=r"should have `load_state_dict` method"):
+        Checkpoint.load_objects({"a": {"b": None}}, {"a": {"b": None}})
+
     model = DummyModel()
     to_load = {"model": model, "another_model": model}
 
-    with pytest.raises(ValueError, match=r"from `to_load` is not found in the checkpoint"):
+    with pytest.raises(ValueError, match=r"Key 'model' from x is not found in y"):
         Checkpoint.load_objects(to_load, {})
 
     model = DummyModel()
@@ -1096,6 +1108,11 @@ def test_checkpoint_load_objects():
     Checkpoint.load_objects(to_load, chkpt)
     assert model.state_dict() == model2.state_dict()
 
+    chkpt = {"models": [{"model1": {"abc": model.state_dict()}}, model.state_dict()]}
+    to_load = {"models": [{"model1": {"abc": model}}, model]}
+    Checkpoint.load_objects(to_load, chkpt)
+    assert model.state_dict() == model2.state_dict()
+
 
 def test_checkpoint_load_objects_from_saved_file(dirname):
     def _get_single_obj_to_save():
@@ -1107,7 +1124,11 @@ def _get_multiple_objs_to_save():
         model = DummyModel()
         optim = torch.optim.SGD(model.parameters(), lr=0.001)
         lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)
-        to_save = {"model": model, "optimizer": optim, "lr_scheduler": lr_scheduler}
+        to_save = {
+            "model": model,
+            "optimizer": optim,
+            "lr_scheduler": lr_scheduler,
+        }
         return to_save
 
     trainer = Engine(lambda e, b: None)
@@ -1181,9 +1202,7 @@ def test_load_checkpoint_with_different_num_classes(dirname):
     with pytest.raises(RuntimeError):
         Checkpoint.load_objects(to_load_single_object, loaded_checkpoint)
 
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", category=UserWarning)
-        Checkpoint.load_objects(to_load_single_object, loaded_checkpoint, strict=False, blah="blah")
+    Checkpoint.load_objects(to_load_single_object, loaded_checkpoint, strict=False)
 
     loaded_weights = to_load_single_object["pretrained_features"].state_dict()["weight"]
 
@@ -1191,7 +1210,6 @@ def test_load_checkpoint_with_different_num_classes(dirname):
 
 
 def test_disksaver_wrong_input(dirname):
-
     with pytest.raises(ValueError, match=r"Directory path '\S+' is not found"):
         DiskSaver("/tmp/non-existing-folder", create_dir=False)
 
@@ -1244,7 +1262,6 @@ def _test_checkpoint_load_objects_ddp(device):
 
 
 def _test_checkpoint_with_ZeRO(device, dirname, local_rank):
-
     from torch.distributed.optim import ZeroRedundancyOptimizer
 
     model = DummyModel().to(device)
@@ -1265,7 +1282,6 @@ def _test_checkpoint_with_ZeRO(device, dirname, local_rank):
     mocked_opt.consolidate_state_dict.assert_called_once_with(to=1)
 
     if local_rank == 1:
-
         loaded_state_dict = torch.load(dirname / "checkpoint_0.pt", map_location=device)["optim"]
         state_dict = opt.state_dict()
         assert loaded_state_dict == state_dict
@@ -1274,7 +1290,6 @@ def _test_checkpoint_with_ZeRO(device, dirname, local_rank):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo, dirname, get_rank_zero_dirname, local_rank):
-
     device = idist.device()
     rank_zero_dirname = get_rank_zero_dirname()
     _test_save_model_optimizer_lr_scheduler_with_state_dict(device, rank_zero_dirname / "1")
@@ -1292,7 +1307,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo, dirname,
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl, get_rank_zero_dirname):
-
     device = idist.device()
     dirname = get_rank_zero_dirname()
     _test_save_model_optimizer_lr_scheduler_with_state_dict(device, dirname / "1")
@@ -1305,7 +1319,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl, get_rank_zero_di
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor, get_rank_zero_dirname):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
     dirname = get_rank_zero_dirname()
@@ -1430,7 +1443,6 @@ def _test_model_checkpoint_filename_pattern_helper(
 
 @pytest.mark.parametrize("test_class", ["checkpoint", "model_checkpoint"])
 def test_checkpoint_filename_pattern(test_class, dirname):
-
     if test_class == "checkpoint":
         _test = _test_checkpoint_filename_pattern_helper
     elif test_class == "model_checkpoint":
@@ -1627,10 +1639,10 @@ def _setup_checkpoint():
 def test_checkpoint_state_dict():
     checkpointer = _setup_checkpoint()
     sd = checkpointer.state_dict()
-    assert "saved" in sd
-    assert isinstance(sd["saved"], list) and len(sd["saved"]) == len(checkpointer._saved)
+    assert "_saved" in sd
+    assert isinstance(sd["_saved"], list) and len(sd["_saved"]) == len(checkpointer._saved)
 
-    for saved_item, true_item in zip(sd["saved"], checkpointer._saved):
+    for saved_item, true_item in zip(sd["_saved"], checkpointer._saved):
         assert saved_item[0] == true_item.priority
         assert saved_item[1] == true_item.filename
 
@@ -1643,11 +1655,43 @@ def test_checkpoint_load_state_dict():
     to_save = {"model": model}
     checkpointer = Checkpoint(to_save, save_handler=save_handler, n_saved=None)
 
-    sd = {"saved": [(0, "model_0.pt"), (10, "model_10.pt"), (20, "model_20.pt")]}
+    sd = {"_saved": [(0, "model_0.pt"), (10, "model_10.pt"), (20, "model_20.pt")]}
     checkpointer.load_state_dict(sd)
     assert checkpointer._saved == true_checkpointer._saved
 
 
+@pytest.mark.parametrize(
+    "to_save",
+    [
+        {"model": DummyModel()},
+        {"model": [DummyModel(), DummyModel()]},
+        {"model": {"a": {"b": DummyModel()}}},
+    ],
+)
+def test_checkpoint__setup_checkpoint(to_save):
+    save_handler = MagicMock(spec=BaseSaveHandler)
+    checkpointer = Checkpoint(to_save, save_handler=save_handler, n_saved=2)
+    checkpoint = checkpointer._setup_checkpoint()
+
+    assert isinstance(checkpoint, dict)
+    for k, obj in to_save.items():
+        assert k in checkpoint
+        if isinstance(obj, torch.nn.Module):
+            assert checkpoint[k] == obj.state_dict()
+        elif isinstance(obj, list):
+            for c2, obj2 in zip(checkpoint[k], obj):
+                assert c2 == obj2.state_dict()
+        elif isinstance(obj, dict):
+            c2 = checkpoint[k]
+            for k2, obj2 in obj.items():
+                if isinstance(obj2, torch.nn.Module):
+                    assert c2[k2] == obj2.state_dict()
+                elif isinstance(obj2, dict):
+                    c3 = c2[k2]
+                    for k3, obj3 in obj2.items():
+                        assert c3[k3] == obj3.state_dict()
+
+
 def test_checkpoint_fixed_filename():
     model = DummyModel()
     to_save = {"model": model}
@@ -1783,7 +1827,6 @@ def score_function(_):
 
 
 def test_get_default_score_fn():
-
     with pytest.raises(ValueError, match=r"Argument score_sign should be 1 or -1"):
         Checkpoint.get_default_score_fn("acc", 2.0)
 
@@ -1823,7 +1866,6 @@ def test_load_single_object(obj_to_save, dirname):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.parametrize("atomic", [False, True])
 def test_disksaver_distrib(distributed_context_single_node_gloo, dirname, local_rank, atomic):
-
     saver = DiskSaver(dirname, atomic, save_on_rank=1)
     mocked_saver = MagicMock(wraps=saver)
 
diff --git a/tests/ignite/handlers/test_early_stopping.py b/tests/ignite/handlers/test_early_stopping.py
index 7382c7ec1b20..25fca8a9c468 100644
--- a/tests/ignite/handlers/test_early_stopping.py
+++ b/tests/ignite/handlers/test_early_stopping.py
@@ -13,7 +13,6 @@ def do_nothing_update_fn(engine, batch):
 
 
 def test_args_validation():
-
     trainer = Engine(do_nothing_update_fn)
 
     with pytest.raises(ValueError, match=r"Argument patience should be positive integer."):
@@ -30,7 +29,6 @@ def test_args_validation():
 
 
 def test_simple_early_stopping():
-
     scores = iter([1.0, 0.8, 0.88])
 
     def score_function(engine):
@@ -50,7 +48,6 @@ def score_function(engine):
 
 
 def test_state_dict():
-
     scores = iter([1.0, 0.8, 0.88])
 
     def score_function(engine):
@@ -75,7 +72,6 @@ def score_function(engine):
 
 
 def test_early_stopping_on_delta():
-
     scores = iter([1.0, 2.0, 2.01, 3.0, 3.01, 3.02])
 
     trainer = Engine(do_nothing_update_fn)
@@ -98,7 +94,6 @@ def test_early_stopping_on_delta():
 
 
 def test_early_stopping_on_last_event_delta():
-
     scores = iter([0.0, 0.3, 0.6])
 
     trainer = Engine(do_nothing_update_fn)
@@ -117,7 +112,6 @@ def test_early_stopping_on_last_event_delta():
 
 
 def test_early_stopping_on_cumulative_delta():
-
     scores = iter([0.0, 0.3, 0.6])
 
     trainer = Engine(do_nothing_update_fn)
@@ -151,7 +145,6 @@ def score_function(engine):
 
 
 def test_simple_no_early_stopping():
-
     scores = iter([1.0, 0.8, 1.2])
 
     def score_function(engine):
@@ -248,7 +241,6 @@ def evaluation(engine):
 
 
 def _test_distrib_with_engine_early_stopping(device):
-
     if device is None:
         device = idist.device()
     if isinstance(device, str):
@@ -287,7 +279,6 @@ def evaluation(engine):
 
 
 def _test_distrib_integration_engine_early_stopping(device):
-
     from ignite.metrics import Accuracy
 
     if device is None:
@@ -346,7 +337,6 @@ def evaluation(engine):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_with_engine_early_stopping(device)
     _test_distrib_integration_engine_early_stopping(device)
@@ -355,7 +345,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_with_engine_early_stopping(device)
     _test_distrib_integration_engine_early_stopping(device)
@@ -365,7 +354,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -377,7 +365,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_with_engine_early_stopping(device)
     _test_distrib_integration_engine_early_stopping(device)
@@ -387,7 +374,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_with_engine_early_stopping(device)
     _test_distrib_integration_engine_early_stopping(device)
diff --git a/tests/ignite/handlers/test_handlers.py b/tests/ignite/handlers/test_handlers.py
index f10b28e997b1..0357837fa65a 100644
--- a/tests/ignite/handlers/test_handlers.py
+++ b/tests/ignite/handlers/test_handlers.py
@@ -5,7 +5,6 @@
 
 
 def test_global_step_from_engine():
-
     iteration = 12
     epoch = 23
 
diff --git a/tests/ignite/handlers/test_lr_finder.py b/tests/ignite/handlers/test_lr_finder.py
index c966c8c3f1dd..159acd76eab5 100644
--- a/tests/ignite/handlers/test_lr_finder.py
+++ b/tests/ignite/handlers/test_lr_finder.py
@@ -159,7 +159,6 @@ def mnist_dataloader():
 
 
 def test_attach_incorrect_input_args(lr_finder, dummy_engine, model, optimizer, dataloader):
-
     with pytest.raises(TypeError, match=r"Argument to_save should be a mapping"):
         with lr_finder.attach(dummy_engine, to_save=123):
             pass
@@ -249,7 +248,6 @@ def test_with_attach(lr_finder, to_save, dummy_engine, dataloader):
 def test_wrong_values_start_lr_and_end_lr(
     lr_finder, dummy_engine, to_save, dummy_engine_mulitple_param_groups, to_save_mulitple_param_groups
 ):
-
     with pytest.raises(ValueError, match=r"start_lr must be less than end_lr"):
         with lr_finder.attach(dummy_engine, to_save=to_save, start_lr=10.0, end_lr=1.0):
             pass
@@ -322,7 +320,6 @@ def assert_output_sizes(lr_finder, dummy_engine):
 
 
 def test_num_iter_is_none(lr_finder, to_save, dummy_engine, dataloader):
-
     with pytest.warns(UserWarning, match=r"Run completed without loss diverging"):
         with lr_finder.attach(dummy_engine, to_save=to_save, diverge_th=float("inf")) as trainer_with_finder:
             trainer_with_finder.run(dataloader)
@@ -331,7 +328,6 @@ def test_num_iter_is_none(lr_finder, to_save, dummy_engine, dataloader):
 
 
 def test_num_iter_is_enough(lr_finder, to_save, dummy_engine, dataloader):
-
     with pytest.warns(UserWarning, match=r"Run completed without loss diverging"):
         with lr_finder.attach(
             dummy_engine, to_save=to_save, num_iter=50, diverge_th=float("inf")
@@ -348,7 +344,7 @@ def test_num_iter_is_not_enough(lr_finder, to_save, dummy_engine, dataloader):
             trainer_with_finder.run(dataloader)
         assert_output_sizes(lr_finder, dummy_engine)
         assert dummy_engine.state.iteration != len(dataloader)
-        assert dummy_engine.state.iteration == 150
+        assert dummy_engine.state.iteration == 150 + 1
 
 
 def test_detach_terminates(lr_finder, to_save, dummy_engine, dataloader):
@@ -409,13 +405,13 @@ def test_engine_output_type(lr_finder, dummy_engine, optimizer):
     lr_finder._history = {"lr": [], "loss": []}
     lr_finder._log_lr_and_loss(dummy_engine, output_transform=lambda x: x, smooth_f=0, diverge_th=1)
     loss = lr_finder._history["loss"][-1]
-    assert type(loss) == float
+    assert type(loss) is float
 
     dummy_engine.state.output = torch.tensor([10.0], dtype=torch.float32)
     lr_finder._history = {"lr": [], "loss": []}
     lr_finder._log_lr_and_loss(dummy_engine, output_transform=lambda x: x, smooth_f=0, diverge_th=1)
     loss = lr_finder._history["loss"][-1]
-    assert type(loss) == float
+    assert type(loss) is float
 
 
 def test_lr_suggestion_unexpected_curve(lr_finder, to_save, dummy_engine, dataloader):
@@ -460,11 +456,9 @@ def test_lr_suggestion_multiple_param_groups(lr_finder):
 
 
 def test_lr_suggestion_mnist(lr_finder, mnist_to_save, dummy_engine_mnist, mnist_dataloader):
-
     max_iters = 50
 
     with lr_finder.attach(dummy_engine_mnist, mnist_to_save, diverge_th=2, step_mode="linear") as trainer_with_finder:
-
         with trainer_with_finder.add_event_handler(
             Events.ITERATION_COMPLETED(once=max_iters), lambda _: trainer_with_finder.terminate()
         ):
@@ -476,7 +470,6 @@ def test_lr_suggestion_mnist(lr_finder, mnist_to_save, dummy_engine_mnist, mnist
 def test_apply_suggested_lr_unmatched_optimizers(
     lr_finder, mnist_to_save, dummy_engine_mnist, optimizer_multiple_param_groups, mnist_dataloader
 ):
-
     with lr_finder.attach(dummy_engine_mnist, mnist_to_save) as trainer_with_finder:
         trainer_with_finder.run(mnist_dataloader)
 
@@ -489,7 +482,6 @@ def test_apply_suggested_lr_unmatched_optimizers(
 def test_apply_suggested_lr_single_param_groups(
     lr_finder, mnist_to_save, dummy_engine_mnist, mnist_optimizer, mnist_dataloader
 ):
-
     with lr_finder.attach(dummy_engine_mnist, mnist_to_save) as trainer_with_finder:
         trainer_with_finder.run(mnist_dataloader)
 
@@ -506,7 +498,6 @@ def test_apply_suggested_lr_multiple_param_groups(
     optimizer_multiple_param_groups,
     dataloader_plot,
 ):
-
     with lr_finder.attach(dummy_engine_mulitple_param_groups, to_save_mulitple_param_groups) as trainer_with_finder:
         trainer_with_finder.run(dataloader_plot)
 
@@ -518,13 +509,11 @@ def test_apply_suggested_lr_multiple_param_groups(
 
 
 def test_no_matplotlib(no_site_packages, lr_finder):
-
     with pytest.raises(ModuleNotFoundError, match=r"This method requires matplotlib to be installed"):
         lr_finder.plot()
 
 
 def test_plot_single_param_group(dirname, lr_finder, mnist_to_save, dummy_engine_mnist, mnist_dataloader):
-
     with lr_finder.attach(dummy_engine_mnist, mnist_to_save, end_lr=20.0, smooth_f=0.04) as trainer_with_finder:
         trainer_with_finder.run(mnist_dataloader)
 
@@ -553,7 +542,6 @@ def _test(ax):
 def test_plot_multiple_param_groups(
     dirname, lr_finder, to_save_mulitple_param_groups, dummy_engine_mulitple_param_groups, dataloader_plot
 ):
-
     with lr_finder.attach(
         dummy_engine_mulitple_param_groups, to_save_mulitple_param_groups, end_lr=20.0, smooth_f=0.04
     ) as trainer_with_finder:
@@ -654,7 +642,6 @@ def forward(self, x):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(dirname, distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_log_lr_and_loss(device)
     _test_distrib_integration_mnist(dirname, device)
@@ -664,7 +651,6 @@ def test_distrib_gloo_cpu_or_gpu(dirname, distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(dirname, distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_log_lr_and_loss(device)
     _test_distrib_integration_mnist(dirname, device)
diff --git a/tests/ignite/handlers/test_param_scheduler.py b/tests/ignite/handlers/test_param_scheduler.py
index fd123efeecf8..27348c9f1e67 100644
--- a/tests/ignite/handlers/test_param_scheduler.py
+++ b/tests/ignite/handlers/test_param_scheduler.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest
 import torch
-from torch.optim.lr_scheduler import ExponentialLR, StepLR
+from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ExponentialLR, StepLR
 
 from ignite.engine import Engine, Events
 from ignite.handlers.param_scheduler import (
@@ -36,7 +36,6 @@ def get_param(self):
 
 
 def test_param_scheduler_asserts():
-
     t1 = torch.zeros([1], requires_grad=True)
     t2 = torch.zeros([1], requires_grad=True)
     optimizer = torch.optim.SGD([{"params": t1, "lr": 0.1}, {"params": t2, "lr": 0.1}])
@@ -56,8 +55,7 @@ def test_param_scheduler_asserts():
         FakeParamScheduler({}, "lr")
 
 
-def test_linear_scheduler():
-
+def test_linear_scheduler_asserts():
     with pytest.raises(TypeError, match=r"Argument optimizer should be torch.optim.Optimizer"):
         LinearCyclicalScheduler({}, "lr", 1, 0, cycle_size=0)
 
@@ -70,6 +68,11 @@ def test_linear_scheduler():
     with pytest.raises(ValueError, match=r"Argument cycle_size should be positive and larger than 1"):
         LinearCyclicalScheduler(optimizer, "lr", 1, 0, cycle_size=1)
 
+
+def test_linear_scheduler():
+    tensor = torch.zeros([1], requires_grad=True)
+    optimizer = torch.optim.SGD([tensor], lr=0.0)
+
     scheduler = LinearCyclicalScheduler(optimizer, "lr", 1, 0, 10)
     state_dict = scheduler.state_dict()
 
@@ -79,38 +82,12 @@ def save_lr(engine):
     trainer = Engine(lambda engine, batch: None)
     trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
     trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
-
+    lr_values_in_cycle = [1.0, 0.8, 0.6, 0.4, 0.2, 0.0, 0.2, 0.4, 0.6, 0.8]
     for _ in range(2):
         lrs = []
-        trainer.run([0] * 9, max_epochs=2)
+        trainer.run([0] * 10, max_epochs=2)
 
-        assert lrs == list(
-            map(
-                pytest.approx,
-                [
-                    # Cycle 1
-                    1.0,
-                    0.8,
-                    0.6,
-                    0.4,
-                    0.2,
-                    0.0,
-                    0.2,
-                    0.4,
-                    0.6,
-                    0.8,
-                    # Cycle 2
-                    1.0,
-                    0.8,
-                    0.6,
-                    0.4,
-                    0.2,
-                    0.0,
-                    0.2,
-                    0.4,  # 0.6, 0.8,
-                ],
-            )
-        )
+        assert lrs == pytest.approx([*lr_values_in_cycle, *lr_values_in_cycle])
         scheduler.load_state_dict(state_dict)
 
     optimizer = torch.optim.SGD([tensor], lr=0)
@@ -166,49 +143,6 @@ def save_lr(engine):
         )
         scheduler.load_state_dict(state_dict)
 
-    # With float cycle_size
-    optimizer = torch.optim.SGD([tensor], lr=0)
-    scheduler = LinearCyclicalScheduler(
-        optimizer, "lr", start_value=1.2, end_value=0.2, cycle_size=10.00000012, cycle_mult=1.0
-    )
-    state_dict = scheduler.state_dict()
-
-    trainer = Engine(lambda engine, batch: None)
-    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
-    trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
-
-    for _ in range(2):
-        lrs = []
-        trainer.run([0] * 9, max_epochs=2)
-        assert lrs == list(
-            map(
-                pytest.approx,
-                [
-                    # Cycle 1
-                    1.2,
-                    1.0,
-                    0.8,
-                    0.6,
-                    0.4,
-                    0.2,
-                    0.4,
-                    0.6,
-                    0.8,
-                    1.0,
-                    # Cycle 2
-                    1.2,
-                    1.0,
-                    0.8,
-                    0.6,
-                    0.4,
-                    0.2,
-                    0.4,
-                    0.6,  # 0.8, 1.0,
-                ],
-            )
-        )
-        scheduler.load_state_dict(state_dict)
-
 
 def test_linear_scheduler_cycle_size_two():
     tensor = torch.zeros([1], requires_grad=True)
@@ -241,17 +175,23 @@ def save_lr(engine):
     assert lrs == pytest.approx([v for i, v in simulated_values])
 
 
-def test_cosine_annealing_scheduler():
+@pytest.mark.parametrize("cyclic_warmup", [False, True])
+def test_cosine_annealing_scheduler(cyclic_warmup):
     tensor = torch.zeros([1], requires_grad=True)
     optimizer = torch.optim.SGD([tensor], lr=0)
 
-    scheduler = CosineAnnealingScheduler(optimizer, "lr", 0, 1, 10)
+    scheduler = CosineAnnealingScheduler(optimizer, "lr", 0, 1, 10, warmup_duration=2 if cyclic_warmup else 0)
     state_dict = scheduler.state_dict()
 
-    data = [0] * 9
+    data = [0] * (10 + int(cyclic_warmup))
     max_epochs = 2
     simulated_values = CosineAnnealingScheduler.simulate_values(
-        num_events=len(data) * max_epochs, param_name="lr", start_value=0, end_value=1, cycle_size=10
+        num_events=len(data) * max_epochs,
+        param_name="lr",
+        start_value=0,
+        end_value=1,
+        cycle_size=10,
+        warmup_duration=2 if cyclic_warmup else 0,
     )
 
     def save_lr(engine):
@@ -260,43 +200,31 @@ def save_lr(engine):
     trainer = Engine(lambda engine, batch: None)
     trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
     trainer.add_event_handler(Events.ITERATION_COMPLETED, save_lr)
+    lr_values_in_cycle = [
+        0.0,
+        0.02447174185242318,
+        0.09549150281252627,
+        0.20610737385376332,
+        0.3454915028125263,
+        0.5,
+        0.6545084971874737,
+        0.7938926261462365,
+        0.9045084971874737,
+        0.9755282581475768,
+    ]
+    lr_values_in_warmup = np.linspace(1.0, 0.0, 2 + 1)[:-1].tolist() if cyclic_warmup else []
 
     for _ in range(2):
         lrs = []
         trainer.run(data, max_epochs=max_epochs)
 
-        assert lrs == list(
-            map(
-                pytest.approx,
-                [
-                    0.0,
-                    0.02447174185242318,
-                    0.09549150281252627,
-                    0.20610737385376332,
-                    0.3454915028125263,
-                    0.5,
-                    0.6545084971874737,
-                    0.7938926261462365,
-                    0.9045084971874737,
-                    0.9755282581475768,
-                    0.0,
-                    0.02447174185242318,
-                    0.09549150281252627,
-                    0.20610737385376332,
-                    0.3454915028125263,
-                    0.5,
-                    0.6545084971874737,
-                    0.7938926261462365,  # 0.9045084971874737, 0.9755282581475768
-                ],
-            )
-        )
+        assert lrs == pytest.approx([*lr_values_in_cycle, *lr_values_in_warmup, *lr_values_in_cycle])
         scheduler.load_state_dict(state_dict)
 
         assert lrs == pytest.approx([v for i, v in simulated_values])
 
 
 def test_concat_scheduler_asserts():
-
     tensor = torch.zeros([1], requires_grad=True)
     optimizer = torch.optim.SGD([tensor], lr=0)
 
@@ -620,7 +548,6 @@ def save_lr(engine):
 
 
 def test_lr_scheduler_asserts():
-
     err_msg = r"Argument lr_scheduler should be a subclass of torch.optim.lr_scheduler.(_LRScheduler|LRScheduler)"
     with pytest.raises(TypeError, match=err_msg):
         LRScheduler(123)
@@ -638,7 +565,6 @@ def test_lr_scheduler_asserts():
     ],
 )
 def test_lr_scheduler(torch_lr_scheduler_cls, kwargs):
-
     if torch_lr_scheduler_cls is None:
         return
 
@@ -655,7 +581,7 @@ def test_lr_scheduler(torch_lr_scheduler_cls, kwargs):
     state_dict1 = scheduler1.state_dict()
 
     torch_lr_scheduler2 = torch_lr_scheduler_cls(optimizer=optimizer2, **kwargs)
-    with pytest.warns(UserWarning, match=r"the first lr value from the optimizer, otherwise it is will be skipped"):
+    with pytest.warns(UserWarning, match=r"the first lr value from the optimizer, otherwise it will be skipped"):
         scheduler2 = LRScheduler(torch_lr_scheduler2, use_legacy=True)
     state_dict2 = scheduler2.state_dict()
 
@@ -713,7 +639,6 @@ def torch_lr_scheduler_step(engine):
 
 
 def test_piecewiselinear_asserts():
-
     tensor = torch.zeros([1], requires_grad=True)
     optimizer = torch.optim.SGD([tensor], lr=0)
 
@@ -738,7 +663,6 @@ def test_piecewiselinear_asserts():
 
 @pytest.mark.parametrize("milestones_as_np_int", [True, False])
 def test_piecewiselinear(milestones_as_np_int):
-
     tensor = torch.zeros([1], requires_grad=True)
     optimizer = torch.optim.SGD([tensor], lr=0)
 
@@ -821,13 +745,11 @@ def save_lr(engine):
 
 
 def test_simulate_and_plot_values():
-
     import matplotlib
 
     matplotlib.use("Agg")
 
     def _test(scheduler_cls, **scheduler_kwargs):
-
         if scheduler_cls == LRScheduler:
             optimizer = scheduler_kwargs["lr_scheduler"].optimizer
         elif scheduler_cls == ConcatScheduler:
@@ -914,7 +836,6 @@ def save_lr(engine):
 
 
 def test_create_lr_scheduler_with_warmup_asserts():
-
     with pytest.raises(TypeError, match=r"Argument lr_scheduler should be a subclass of"):
         create_lr_scheduler_with_warmup(12, warmup_start_value=0.0, warmup_end_value=0.1, warmup_duration=10)
 
@@ -966,7 +887,6 @@ def test_create_lr_scheduler_with_warmup_asserts():
 def test_create_lr_scheduler_with_warmup(
     lr_scheduler_name, warmup_start_value, warmup_end_value, warmup_duration, warmup_end_next_value
 ):
-
     t1 = torch.zeros([1], requires_grad=True)
 
     if lr_scheduler_name == "ExponentialLR":
@@ -1091,7 +1011,6 @@ def save_lr(engine):
 
 
 def test_create_lr_scheduler_with_warmup_with_real_model(dummy_model_factory):
-
     model = dummy_model_factory(with_grads=False, with_frozen_layer=False)
     init_lr = 0.01
     optimizer = torch.optim.SGD(model.parameters(), lr=init_lr)
@@ -1118,7 +1037,6 @@ def test_create_lr_scheduler_with_warmup_with_real_model(dummy_model_factory):
 
 
 def test_param_group_scheduler_asserts():
-
     t1 = torch.zeros([1], requires_grad=True)
     t2 = torch.zeros([1], requires_grad=True)
     optimizer = torch.optim.SGD([{"params": t1, "lr": 0.1}, {"params": t2, "lr": 0.1}])
@@ -1169,7 +1087,6 @@ def test_param_group_scheduler_asserts():
 
 @pytest.mark.parametrize("param_groups_setting", ["single_optim", "multi_optim"])
 def test_param_group_scheduler(param_groups_setting):
-
     t1 = torch.zeros([1], requires_grad=True)
     t2 = torch.zeros([1], requires_grad=True)
     if param_groups_setting == "single_optim":
@@ -1234,7 +1151,6 @@ def save_lr(_, lrs):
     ],
 )
 def test_scheduler_with_param_groups(scheduler_cls, kwargs):
-
     t1 = torch.zeros([1], requires_grad=True)
     t2 = torch.zeros([1], requires_grad=True)
     optimizer = torch.optim.SGD([{"params": t1, "lr": 0.1}, {"params": t2, "lr": 0.1}])
@@ -1377,3 +1293,45 @@ def test_reduce_lr_on_plateau_scheduler_asserts():
     with pytest.raises(ValueError, match=r"Length of argument metric_values should be equal to num_events."):
         metric_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
         ReduceLROnPlateauScheduler.simulate_values(5, metric_values, 0.01)
+
+
+@pytest.mark.parametrize("warmup_end_value", [0.23, None])
+@pytest.mark.parametrize("T_0", [1, 12])
+@pytest.mark.parametrize("T_mult", [1, 3])
+def test_create_lr_scheduler_with_warmup_cosine(warmup_end_value, T_0, T_mult):
+    """Warmup followed by CosineAnnealingWarmRestarts must replay the plain cosine schedule."""
+    lr = 0.2
+    steps = 200
+    warm_steps = 50
+    warm_start = 0.023
+
+    def get_optim():
+        return torch.optim.SGD([torch.zeros([1], requires_grad=True)], lr=lr)
+
+    def get_cos_shed():
+        return CosineAnnealingWarmRestarts(optimizer, T_0=T_0, T_mult=T_mult)
+
+    # Reference run: the bare torch scheduler, with the lr recorded before every step.
+    optimizer = get_optim()
+    reference = get_cos_shed()
+    cosine_lrs = []
+    for _ in range(steps):
+        cosine_lrs.append(optimizer.param_groups[0]["lr"])
+        reference.step()
+
+    # Run under test: a fresh optimizer (picked up by get_cos_shed) driven through a linear warmup first.
+    optimizer = get_optim()
+    scheduler = create_lr_scheduler_with_warmup(
+        get_cos_shed(), warmup_start_value=warm_start, warmup_end_value=warmup_end_value, warmup_duration=warm_steps
+    )
+
+    # With warmup_end_value=None the cosine phase is expected to start one event earlier.
+    real_warm_steps = warm_steps - 1 if warmup_end_value is None else warm_steps
+    warm_lrs = []
+    for _ in range(real_warm_steps + steps):
+        scheduler(None)
+        warm_lrs.append(optimizer.param_groups[0]["lr"])
+
+    warm_target = lr if warmup_end_value is None else warmup_end_value
+    np.testing.assert_allclose(np.linspace(warm_start, warm_target, warm_steps), warm_lrs[:warm_steps])
+    assert warm_lrs[real_warm_steps:] == cosine_lrs
diff --git a/tests/ignite/handlers/test_state_param_scheduler.py b/tests/ignite/handlers/test_state_param_scheduler.py
index b249136e47c5..b907683d7e0d 100644
--- a/tests/ignite/handlers/test_state_param_scheduler.py
+++ b/tests/ignite/handlers/test_state_param_scheduler.py
@@ -141,7 +141,6 @@ def test_pwlinear_scheduler_max_value(max_epochs, milestones_values):
 
 
 def test_piecewiselinear_asserts():
-
     with pytest.raises(TypeError, match=r"Argument milestones_values should be a list or tuple"):
         PiecewiseLinearStateScheduler(param_name="linear_scheduled_param", milestones_values=None)
 
@@ -169,7 +168,7 @@ def test_exponential_scheduler(max_epochs, initial_value, gamma):
     )
     exp_state_parameter_scheduler.attach(engine, Events.EPOCH_COMPLETED)
     engine.run([0] * 8, max_epochs=max_epochs)
-    torch_testing_assert_close(getattr(engine.state, "exp_scheduled_param"), initial_value * gamma ** max_epochs)
+    torch_testing_assert_close(getattr(engine.state, "exp_scheduled_param"), initial_value * gamma**max_epochs)
 
     state_dict = exp_state_parameter_scheduler.state_dict()
     exp_state_parameter_scheduler.load_state_dict(state_dict)
@@ -222,7 +221,6 @@ def test_multistep_scheduler(max_epochs, initial_value, gamma, milestones):
 
 
 def test_custom_scheduler():
-
     engine = Engine(lambda e, b: None)
 
     class LambdaState:
@@ -264,7 +262,6 @@ def __init__(self, initial_value, gamma):
 
 @pytest.mark.parametrize("scheduler_cls, scheduler_kwargs", [config3, config4, config5, config6])
 def test_simulate_and_plot_values(scheduler_cls, scheduler_kwargs):
-
     import matplotlib
 
     matplotlib.use("Agg")
@@ -285,7 +282,6 @@ def test_simulate_and_plot_values(scheduler_cls, scheduler_kwargs):
 @pytest.mark.parametrize("save_history", [False, True])
 @pytest.mark.parametrize("scheduler_cls, scheduler_kwargs", [config3, config4, config5, config6])
 def test_simulate_values(scheduler_cls, scheduler_kwargs, save_history):
-
     max_epochs = 2
     data = [0] * 10
     scheduler_kwargs["save_history"] = save_history
@@ -293,7 +289,6 @@ def test_simulate_values(scheduler_cls, scheduler_kwargs, save_history):
 
 
 def test_torch_save_load(dirname):
-
     lambda_state_parameter_scheduler = LambdaStateScheduler(
         param_name="custom_scheduled_param", lambda_obj=LambdaState(initial_value=10, gamma=0.99), create_new=True
     )
@@ -321,7 +316,6 @@ def test_torch_save_load(dirname):
 
 
 def test_simulate_and_plot_values_no_matplotlib():
-
     with pytest.raises(ModuleNotFoundError, match=r"This method requires matplotlib to be installed."):
         with patch.dict("sys.modules", {"matplotlib.pyplot": None}):
             event = Events.EPOCH_COMPLETED
@@ -477,7 +471,6 @@ def test_param_scheduler_attach_warning():
 
 
 def test_param_scheduler_with_ema_handler():
-
     from ignite.handlers import EMAHandler
 
     model = nn.Linear(2, 1)
diff --git a/tests/ignite/handlers/test_terminate_on_nan.py b/tests/ignite/handlers/test_terminate_on_nan.py
index e231a2b48f41..c7db4745e57f 100644
--- a/tests/ignite/handlers/test_terminate_on_nan.py
+++ b/tests/ignite/handlers/test_terminate_on_nan.py
@@ -23,7 +23,6 @@
     ],
 )
 def test_terminate_on_nan_and_inf(state_output, should_terminate):
-
     torch.manual_seed(12)
 
     def update_fn(engine, batch):
@@ -41,7 +40,6 @@ def update_fn(engine, batch):
 
 
 def test_with_terminate_on_nan():
-
     torch.manual_seed(12)
 
     data = [1.0, 0.8, (torch.rand(4, 4), torch.rand(4, 4)), torch.rand(5), torch.asin(torch.randn(4, 4)), 0.0, 1.0]
@@ -58,7 +56,6 @@ def update_fn(engine, batch):
 
 
 def test_with_terminate_on_inf():
-
     torch.manual_seed(12)
 
     data = [
@@ -84,7 +81,6 @@ def update_fn(engine, batch):
 
 
 def test_without_terminate_on_nan_inf():
-
     data = [1.0, 0.8, torch.rand(4, 4), (torch.rand(5), torch.rand(5, 4)), 0.0, 1.0]
 
     def update_fn(engine, batch):
diff --git a/tests/ignite/handlers/test_time_limit.py b/tests/ignite/handlers/test_time_limit.py
index d0d9c1889b84..d82a426d259f 100644
--- a/tests/ignite/handlers/test_time_limit.py
+++ b/tests/ignite/handlers/test_time_limit.py
@@ -7,7 +7,6 @@
 
 
 def test_arg_validation():
-
     with pytest.raises(ValueError, match=r"Argument limit_sec should be a positive integer."):
         TimeLimit(limit_sec=-5)
 
diff --git a/tests/ignite/handlers/test_time_profilers.py b/tests/ignite/handlers/test_time_profilers.py
index 978f193ccaae..7029f7e99b52 100644
--- a/tests/ignite/handlers/test_time_profilers.py
+++ b/tests/ignite/handlers/test_time_profilers.py
@@ -830,7 +830,6 @@ def test_write_results_handlers_profiler(dirname):
 
 
 def test_print_results_basic_profiler(capsys):
-
     true_max_epochs = 1
     true_num_iters = 5
 
@@ -848,7 +847,6 @@ def test_print_results_basic_profiler(capsys):
 
 
 def test_print_results_handlers_profiler_handlers_profiler(capsys):
-
     true_max_epochs = 1
     true_num_iters = 5
 
diff --git a/tests/ignite/metrics/gan/test_fid.py b/tests/ignite/metrics/gan/test_fid.py
index de0b379d24cd..0da5b574340e 100644
--- a/tests/ignite/metrics/gan/test_fid.py
+++ b/tests/ignite/metrics/gan/test_fid.py
@@ -19,7 +19,6 @@ def mock_no_scipy():
 
 
 def test_no_scipy(mock_no_scipy):
-
     with pytest.raises(ModuleNotFoundError, match=r"This module requires scipy to be installed."):
         FID()
 
@@ -34,7 +33,6 @@ def mock_no_numpy():
 
 
 def test_no_numpy(mock_no_numpy):
-
     with pytest.raises(ModuleNotFoundError, match=r"This module requires numpy to be installed."):
         FID()
 
@@ -105,7 +103,6 @@ def test_compute_fid_sqrtm():
 
 
 def test_wrong_inputs():
-
     with pytest.raises(ValueError, match=r"Argument num_features must be greater to zero"):
         FID(num_features=-1, feature_extractor=torch.nn.Identity())
 
@@ -156,7 +153,6 @@ def test_statistics():
 
 
 def _test_distrib_integration(device):
-
     from ignite.engine import Engine
 
     rank = idist.get_rank()
@@ -218,7 +214,6 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
diff --git a/tests/ignite/metrics/gan/test_inception_score.py b/tests/ignite/metrics/gan/test_inception_score.py
index 98e2f7ecd871..cb25ae3608d2 100644
--- a/tests/ignite/metrics/gan/test_inception_score.py
+++ b/tests/ignite/metrics/gan/test_inception_score.py
@@ -9,7 +9,6 @@
 
 
 def calculate_inception_score(p_yx):
-
     p_y = torch.unsqueeze(p_yx.mean(axis=0), 0)
     kl_d = torch.kl_div(torch.log(p_y), p_yx)
 
@@ -22,7 +21,6 @@ def calculate_inception_score(p_yx):
 
 
 def test_inception_score():
-
     p_yx = torch.rand(20, 10)
     m = InceptionScore(num_features=10, feature_extractor=torch.nn.Identity())
     m.update(p_yx)
@@ -43,7 +41,6 @@ def test_device_mismatch_cuda():
 
 
 def test_wrong_inputs():
-
     with pytest.raises(ValueError, match=r"Argument num_features must be greater to zero, got:"):
         InceptionScore(num_features=-1, feature_extractor=torch.nn.Identity()).update(torch.rand(2, 0))
 
@@ -66,7 +63,6 @@ def test_wrong_inputs():
 
 
 def _test_distrib_integration(device):
-
     from ignite.engine import Engine
 
     rank = idist.get_rank()
@@ -119,7 +115,6 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
diff --git a/tests/ignite/metrics/gan/test_utils.py b/tests/ignite/metrics/gan/test_utils.py
index 50871acb6a79..534865fdc094 100644
--- a/tests/ignite/metrics/gan/test_utils.py
+++ b/tests/ignite/metrics/gan/test_utils.py
@@ -34,7 +34,6 @@ def update(self, output):
 
 
 def test_dummy_metric():
-
     with pytest.raises(ValueError, match=r"Argument num_features must be greater to zero, got:"):
         DummyInceptionMetric(num_features=-1, feature_extractor=torch.nn.Identity()).update(torch.rand(2, 0))
 
@@ -57,7 +56,6 @@ def test_dummy_metric():
 
 
 def test_inception_extractor_wrong_inputs():
-
     with pytest.raises(ValueError, match=r"Inputs should be a tensor of dim 4"):
         InceptionModel(return_features=True)(torch.rand(2))
 
diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py
index ff173ed66d0b..9de9c6de78c5 100644
--- a/tests/ignite/metrics/nlp/test_bleu.py
+++ b/tests/ignite/metrics/nlp/test_bleu.py
@@ -16,7 +16,6 @@
 
 
 def test_wrong_inputs():
-
     with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
         Bleu(ngram=0)
 
@@ -200,7 +199,6 @@ def test_n_gram_counter(candidates, references):
 
 
 def _test_macro_distrib_integration(device):
-
     from ignite.engine import Engine
 
     rank = idist.get_rank()
@@ -243,7 +241,6 @@ def _test(metric_device):
 
 
 def _test_micro_distrib_integration(device):
-
     from ignite.engine import Engine
 
     rank = idist.get_rank()
@@ -293,7 +290,6 @@ def _test(metric_device):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_macro_distrib_integration(device)
     _test_micro_distrib_integration(device)
@@ -302,7 +298,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_macro_distrib_integration(device)
     _test_micro_distrib_integration(device)
@@ -312,7 +307,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -324,7 +318,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_macro_distrib_integration(device)
     _test_micro_distrib_integration(device)
@@ -334,7 +327,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_macro_distrib_integration(device)
     _test_micro_distrib_integration(device)
diff --git a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py
index b798c5e3bbe5..c2fb75051829 100644
--- a/tests/ignite/metrics/nlp/test_rouge.py
+++ b/tests/ignite/metrics/nlp/test_rouge.py
@@ -38,7 +38,6 @@ def test_compute_ngram_scores(candidate, reference, n, expected_precision, expec
 
 
 def test_wrong_inputs():
-
     with pytest.raises(ValueError, match=r"ngram order must be greater than zero"):
         RougeN(ngram=0)
 
@@ -120,7 +119,6 @@ def test_rouge_metrics(candidates, references):
 
 
 def _test_distrib_integration(device):
-
     from ignite.engine import Engine
 
     rank = idist.get_rank()
@@ -175,7 +173,6 @@ def _test(metric_device):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -183,7 +180,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -192,7 +188,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -203,7 +198,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -212,7 +206,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
 
diff --git a/tests/ignite/metrics/test_accumulation.py b/tests/ignite/metrics/test_accumulation.py
index ce26cd3dc6ec..d4551721ee0e 100644
--- a/tests/ignite/metrics/test_accumulation.py
+++ b/tests/ignite/metrics/test_accumulation.py
@@ -15,7 +15,6 @@
 
 
 def test_variable_accumulation_wrong_inputs():
-
     with pytest.raises(TypeError, match=r"Argument op should be a callable"):
         VariableAccumulation(1)
 
@@ -29,7 +28,6 @@ def test_variable_accumulation_wrong_inputs():
 
 
 def test_variable_accumulation_mean_variable():
-
     mean_var = VariableAccumulation(lambda a, x: a + x)
     y_true = torch.rand(100)
 
@@ -61,7 +59,6 @@ def test_variable_accumulation_mean_variable():
 
 
 def test_average():
-
     with pytest.raises(NotComputableError):
         v = Average()
         v.compute()
@@ -102,7 +99,6 @@ def _mean(y_true):
 
 
 def test_geom_average():
-
     with pytest.raises(NotComputableError):
         v = GeometricAverage()
         v.compute()
@@ -136,13 +132,11 @@ def test_geom_average():
 @pytest.mark.parametrize("metric_cls, true_result_fn", [(Average, _mean), (GeometricAverage, _geom_mean)])
 @pytest.mark.parametrize("shape", [[100, 12], [100]])
 def test_integration(metric_cls, true_result_fn, shape):
-
     assert len(shape) > 0 and len(shape) < 3
 
     custom_variable = 10.0 + 5.0 * torch.rand(shape)
 
     def update_fn(engine, batch):
-
         output = custom_variable[engine.state.iteration - 1]
         output = output.item() if output.ndimension() < 1 else output
         return 0, output
@@ -158,6 +152,16 @@ def update_fn(engine, batch):
         np.array(state.metrics["agg_custom_var"]), true_result_fn(custom_variable), decimal=5
     )
 
+    metric_state = custom_var_mean.state_dict()
+    saved_num_examples = custom_var_mean.num_examples
+    saved_accumulator = custom_var_mean.accumulator
+    custom_var_mean.reset()
+    assert custom_var_mean.num_examples == 0
+    assert custom_var_mean.accumulator == 0
+    custom_var_mean.load_state_dict(metric_state)
+    assert custom_var_mean.num_examples == saved_num_examples
+    assert (custom_var_mean.accumulator == saved_accumulator).all()
+
 
 def test_compute_mean_std():
     n = 8
@@ -174,7 +178,7 @@ def compute_mean_std(engine, batch):
         _b, _c = batch.shape[:2]
         data = batch.reshape(_b, _c, -1).to(dtype=torch.float64)
         _mean = torch.mean(data, dim=-1)
-        _mean2 = torch.mean(data ** 2, dim=-1)
+        _mean2 = torch.mean(data**2, dim=-1)
         return {"mean": _mean, "mean^2": _mean2}
 
     compute_engine = Engine(compute_mean_std)
@@ -327,7 +331,6 @@ def _dist_geom_mean(y_true):
 
 def _test_distrib_integration(device):
     def _test(metric_cls, shape, true_result_fn, metric_device, tol=1e-5):
-
         size = 100
         custom_variable = 10.0 + 5.0 * torch.rand(size, *shape, dtype=torch.float64)
         custom_variable = custom_variable.to(device)
@@ -373,12 +376,10 @@ def update_fn(engine, batch):
 
 
 def _test_distrib_accumulator_device(device):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
     for metric_device in metric_devices:
-
         m = VariableAccumulation(lambda a, x: x, device=metric_device)
         assert m._device == metric_device
         assert (
@@ -423,7 +424,6 @@ def _test_apex_average(device, amp_mode, opt_level):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_variable_accumulation(device)
     _test_distrib_average(device)
@@ -435,7 +435,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_variable_accumulation(device)
     _test_distrib_average(device)
@@ -448,7 +447,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = idist.device()
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -463,7 +461,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
-
     device = idist.device()
     _test_distrib_variable_accumulation(device)
     _test_distrib_average(device)
@@ -505,7 +502,6 @@ def test_apex_average_on_cuda():
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_variable_accumulation(device)
     _test_distrib_average(device)
@@ -518,7 +514,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_variable_accumulation(device)
     _test_distrib_average(device)
diff --git a/tests/ignite/metrics/test_accuracy.py b/tests/ignite/metrics/test_accuracy.py
index de827b4b0733..0b7fee3bf605 100644
--- a/tests/ignite/metrics/test_accuracy.py
+++ b/tests/ignite/metrics/test_accuracy.py
@@ -362,7 +362,6 @@ def _test(metric_device):
 
 
 def _test_distrib_integration_multiclass(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -406,6 +405,14 @@ def update(engine, i):
 
         assert pytest.approx(res) == true_res
 
+        metric_state = acc.state_dict()
+        saved__num_correct = acc._num_correct
+        saved__num_examples = acc._num_examples
+        acc.reset()
+        acc.load_state_dict(metric_state)
+        assert acc._num_examples == saved__num_examples
+        assert (acc._num_correct == saved__num_correct).all()
+
     metric_devices = ["cpu"]
     if device.type != "xla":
         metric_devices.append(idist.device())
@@ -416,7 +423,6 @@ def update(engine, i):
 
 
 def _test_distrib_integration_multilabel(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -470,12 +476,10 @@ def update(engine, i):
 
 
 def _test_distrib_accumulator_device(device):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
     for metric_device in metric_devices:
-
         acc = Accuracy(device=metric_device)
         assert acc._device == metric_device
         assert (
@@ -492,7 +496,6 @@ def _test_distrib_accumulator_device(device):
 
 
 def _test_distrib_integration_list_of_tensors_or_numbers(device):
-
     rank = idist.get_rank()
 
     def _test(n_epochs, metric_device):
@@ -548,7 +551,6 @@ def update(_, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_multilabel_input_NHW(device)
     _test_distrib_integration_multiclass(device)
@@ -560,7 +562,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_multilabel_input_NHW(device)
     _test_distrib_integration_multiclass(device)
@@ -573,7 +574,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -617,7 +617,6 @@ def test_distrib_xla_nprocs(xmp_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_multilabel_input_NHW(device)
     _test_distrib_integration_multiclass(device)
@@ -630,7 +629,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_multilabel_input_NHW(device)
     _test_distrib_integration_multiclass(device)
diff --git a/tests/ignite/metrics/test_classification_report.py b/tests/ignite/metrics/test_classification_report.py
index aa2238824bc4..b132daf13304 100644
--- a/tests/ignite/metrics/test_classification_report.py
+++ b/tests/ignite/metrics/test_classification_report.py
@@ -10,11 +10,9 @@
 
 
 def _test_integration_multiclass(device, output_dict):
-
     rank = idist.get_rank()
 
     def _test(metric_device, n_classes, labels=None):
-
         classification_report = ClassificationReport(device=metric_device, output_dict=output_dict, labels=labels)
         n_iters = 80
         batch_size = 16
@@ -62,6 +60,22 @@ def update(engine, i):
         assert sklearn_result["macro avg"]["recall"] == pytest.approx(res["macro avg"]["recall"])
         assert sklearn_result["macro avg"]["f1-score"] == pytest.approx(res["macro avg"]["f1-score"])
 
+        metric_state = classification_report.state_dict()
+        classification_report.reset()
+        classification_report.load_state_dict(metric_state)
+        res2 = classification_report.compute()
+        if not output_dict:
+            res2 = json.loads(res2)
+
+        for i in range(n_classes):
+            label_i = labels[i] if labels else str(i)
+            assert res2[label_i]["precision"] == res[label_i]["precision"]
+            assert res2[label_i]["f1-score"] == res[label_i]["f1-score"]
+            assert res2[label_i]["recall"] == res[label_i]["recall"]
+        assert res2["macro avg"]["precision"] == res["macro avg"]["precision"]
+        assert res2["macro avg"]["recall"] == res["macro avg"]["recall"]
+        assert res2["macro avg"]["f1-score"] == res["macro avg"]["f1-score"]
+
     for i in range(5):
         torch.manual_seed(12 + rank + i)
         # check multiple random inputs as random exact occurencies are rare
@@ -78,11 +92,9 @@ def update(engine, i):
 
 
 def _test_integration_multilabel(device, output_dict):
-
     rank = idist.get_rank()
 
     def _test(metric_device, n_epochs, labels=None):
-
         classification_report = ClassificationReport(device=metric_device, output_dict=output_dict, is_multilabel=True)
 
         n_iters = 10
@@ -125,7 +137,6 @@ def update(engine, i):
         sklearn_result = sklearn_classification_report(np_y_true, np_y_preds, output_dict=True, zero_division=1)
 
         for i in range(n_classes):
-            torch.manual_seed(12 + rank + i)
             label_i = labels[i] if labels else str(i)
             assert sklearn_result[str(i)]["precision"] == pytest.approx(res[label_i]["precision"])
             assert sklearn_result[str(i)]["f1-score"] == pytest.approx(res[label_i]["f1-score"])
@@ -134,7 +145,8 @@ def update(engine, i):
         assert sklearn_result["macro avg"]["recall"] == pytest.approx(res["macro avg"]["recall"])
         assert sklearn_result["macro avg"]["f1-score"] == pytest.approx(res["macro avg"]["f1-score"])
 
-    for _ in range(3):
+    for i in range(3):
+        torch.manual_seed(12 + rank + i)
         # check multiple random inputs as random exact occurencies are rare
         metric_devices = ["cpu"]
         if device.type != "xla":
@@ -150,7 +162,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_integration_multiclass(device, True)
     _test_integration_multiclass(device, False)
@@ -161,7 +172,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(local_rank, distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_integration_multiclass(device, True)
     _test_integration_multiclass(device, False)
@@ -173,7 +183,6 @@ def test_distrib_gloo_cpu_or_gpu(local_rank, distributed_context_single_node_glo
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -184,7 +193,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 
 
 def _test_distrib_xla_nprocs(index):
-
     device = idist.device()
     _test_integration_multiclass(device, True)
     _test_integration_multiclass(device, False)
@@ -212,7 +220,6 @@ def to_numpy_multilabel(y):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_integration_multiclass(device, True)
     _test_integration_multiclass(device, False)
@@ -224,7 +231,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_integration_multiclass(device, True)
     _test_integration_multiclass(device, False)
diff --git a/tests/ignite/metrics/test_confusion_matrix.py b/tests/ignite/metrics/test_confusion_matrix.py
index 2b777126fdab..ddb4509567d7 100644
--- a/tests/ignite/metrics/test_confusion_matrix.py
+++ b/tests/ignite/metrics/test_confusion_matrix.py
@@ -182,7 +182,6 @@ def test_multiclass_images():
 
 
 def test_iou_wrong_input():
-
     with pytest.raises(TypeError, match="Argument cm should be instance of ConfusionMatrix"):
         IoU(None)
 
@@ -202,7 +201,6 @@ def test_iou_wrong_input():
 
 @pytest.mark.parametrize("average", [None, "samples"])
 def test_iou(average):
-
     y_true, y_pred = get_y_true_y_pred()
     th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
 
@@ -241,7 +239,6 @@ def test_iou(average):
 
 
 def test_miou():
-
     y_true, y_pred = get_y_true_y_pred()
     th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
 
@@ -278,7 +275,6 @@ def test_miou():
 
 
 def test_cm_accuracy():
-
     y_true, y_pred = get_y_true_y_pred()
     th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
 
@@ -297,7 +293,6 @@ def test_cm_accuracy():
 
 
 def test_cm_precision():
-
     y_true, y_pred = np.random.randint(0, 10, size=(1000,)), np.random.randint(0, 10, size=(1000,))
     th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
 
@@ -328,7 +323,6 @@ def test_cm_precision():
 
 
 def test_cm_recall():
-
     y_true, y_pred = np.random.randint(0, 10, size=(1000,)), np.random.randint(0, 10, size=(1000,))
     th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
 
@@ -393,7 +387,6 @@ def test_cm_with_average():
 
 
 def test_dice_coefficient_wrong_input():
-
     with pytest.raises(TypeError, match="Argument cm should be instance of ConfusionMatrix"):
         DiceCoefficient(None)
 
@@ -412,7 +405,6 @@ def test_dice_coefficient_wrong_input():
 
 
 def test_dice_coefficient():
-
     y_true, y_pred = get_y_true_y_pred()
     th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
 
@@ -517,12 +509,10 @@ def _test(metric_device):
 
 
 def _test_distrib_accumulator_device(device):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
     for metric_device in metric_devices:
-
         cm = ConfusionMatrix(num_classes=3, device=metric_device)
         assert cm._device == metric_device
         assert (
@@ -540,7 +530,6 @@ def _test_distrib_accumulator_device(device):
 
 @pytest.mark.parametrize("average", [None, "samples"])
 def test_jaccard_index(average):
-
     y_true, y_pred = get_y_true_y_pred()
     th_y_true, th_y_logits = compute_th_y_true_y_logits(y_true, y_pred)
 
@@ -582,7 +571,6 @@ def test_jaccard_index(average):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_multiclass_images(device)
     _test_distrib_accumulator_device(device)
@@ -591,7 +579,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_multiclass_images(device)
     _test_distrib_accumulator_device(device)
@@ -601,7 +588,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -636,7 +622,6 @@ def test_distrib_xla_nprocs(xmp_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_multiclass_images(device)
     _test_distrib_accumulator_device(device)
@@ -646,7 +631,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_multiclass_images(device)
     _test_distrib_accumulator_device(device)
diff --git a/tests/ignite/metrics/test_epoch_metric.py b/tests/ignite/metrics/test_epoch_metric.py
index dc4d6b4cd6c0..d82168266b1d 100644
--- a/tests/ignite/metrics/test_epoch_metric.py
+++ b/tests/ignite/metrics/test_epoch_metric.py
@@ -8,7 +8,6 @@
 
 
 def test_epoch_metric_wrong_setup_or_input():
-
     # Wrong compute function
     with pytest.raises(TypeError, match=r"Argument compute_fn should be callable."):
         EpochMetric(12345)
@@ -152,7 +151,6 @@ def compute_fn(y_preds, y_targets):
 
 
 def test_distrib_integration(distributed):
-
     device = idist.device() if idist.device().type != "xla" else "cpu"
     rank = idist.get_rank()
     torch.manual_seed(40 + rank)
diff --git a/tests/ignite/metrics/test_fbeta.py b/tests/ignite/metrics/test_fbeta.py
index c6301470d53c..62a793f4f53c 100644
--- a/tests/ignite/metrics/test_fbeta.py
+++ b/tests/ignite/metrics/test_fbeta.py
@@ -13,7 +13,6 @@
 
 
 def test_wrong_inputs():
-
     with pytest.raises(ValueError, match=r"Beta should be a positive integer"):
         Fbeta(0.0)
 
@@ -50,7 +49,6 @@ def _output_transform(output):
     ],
 )
 def test_integration(p, r, average, output_transform):
-
     np.random.seed(1)
 
     n_iters = 10
@@ -89,7 +87,6 @@ def update_fn(engine, batch):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     def _test(p, r, average, n_epochs, metric_device):
@@ -149,7 +146,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -157,7 +153,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -166,7 +161,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -198,7 +192,6 @@ def test_distrib_xla_nprocs(xmp_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -207,6 +200,5 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
diff --git a/tests/ignite/metrics/test_frequency.py b/tests/ignite/metrics/test_frequency.py
index f1d0d2666bf2..ee053997ebf6 100644
--- a/tests/ignite/metrics/test_frequency.py
+++ b/tests/ignite/metrics/test_frequency.py
@@ -28,7 +28,6 @@ def test_nondistributed_average():
 
 
 def _test_frequency_with_engine(workers=None, lower_bound_factor=0.8, upper_bound_factor=1.1, every=1):
-
     if workers is None:
         workers = idist.get_world_size()
 
@@ -90,7 +89,6 @@ def test_frequency_with_engine_distributed_with_every(distributed_context_single
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
     gloo_hvd_executor(_test_frequency_with_engine, (None, 0.8, 1), np=nproc, do_init=True)
diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py
index cc369371c10c..19cc68cd45cc 100644
--- a/tests/ignite/metrics/test_loss.py
+++ b/tests/ignite/metrics/test_loss.py
@@ -26,7 +26,6 @@ def compute(self):
         pass
 
     def update(self, output):
-
         assert output == self.true_output
 
 
@@ -176,7 +175,6 @@ def _test(metric_device, y_test_1, y_test_2):
 
 
 def _test_distrib_accumulator_device(device, y_test_1):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
@@ -208,7 +206,6 @@ def test_sum_detached():
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2())
     _test_distrib_accumulator_device(device, y_test_1())
@@ -217,7 +214,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2())
     _test_distrib_accumulator_device(device, y_test_1())
@@ -227,7 +223,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -239,14 +234,12 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
-
     device = idist.device()
     _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2())
     _test_distrib_accumulator_device(device, y_test_1())
 
 
 def _test_distrib_xla_nprocs(index):
-
     device = idist.device()
     _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2())
     _test_distrib_accumulator_device(device, y_test_1())
@@ -264,7 +257,6 @@ def test_distrib_xla_nprocs(xmp_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2(), tol=1e-6)
     _test_distrib_accumulator_device(device, y_test_1())
@@ -274,7 +266,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2())
     _test_distrib_accumulator_device(device, y_test_1())
diff --git a/tests/ignite/metrics/test_mean_absolute_error.py b/tests/ignite/metrics/test_mean_absolute_error.py
index 6ab3c1b9080c..ab9b0da8810f 100644
--- a/tests/ignite/metrics/test_mean_absolute_error.py
+++ b/tests/ignite/metrics/test_mean_absolute_error.py
@@ -19,7 +19,6 @@ def test_no_update():
 
 @pytest.fixture(params=[item for item in range(4)])
 def test_case(request):
-
     return [
         (torch.randint(0, 10, size=(100, 1)), torch.randint(0, 10, size=(100, 1)), 1),
         (torch.randint(-10, 10, size=(100, 5)), torch.randint(-10, 10, size=(100, 5)), 1),
@@ -31,7 +30,6 @@ def test_case(request):
 
 @pytest.mark.parametrize("n_times", range(5))
 def test_compute(n_times, test_case):
-
     mae = MeanAbsoluteError()
 
     y_pred, y, batch_size = test_case
@@ -61,7 +59,6 @@ def _test_distrib_integration(device):
     rank = idist.get_rank()
 
     def _test(metric_device):
-
         n_iters = 80
         batch_size = 50
         torch.manual_seed(12 + rank)
@@ -99,7 +96,6 @@ def update(engine, i):
 
 
 def _test_distrib_accumulator_device(device):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
@@ -131,7 +127,6 @@ def test_accumulator_detached():
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -140,7 +135,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -150,7 +144,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -162,7 +155,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -172,7 +164,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -182,7 +173,6 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
diff --git a/tests/ignite/metrics/test_mean_pairwise_distance.py b/tests/ignite/metrics/test_mean_pairwise_distance.py
index 5aeeb537609f..0a53f48193ea 100644
--- a/tests/ignite/metrics/test_mean_pairwise_distance.py
+++ b/tests/ignite/metrics/test_mean_pairwise_distance.py
@@ -19,7 +19,6 @@ def test_zero_sample():
 
 @pytest.fixture(params=[item for item in range(4)])
 def test_case(request):
-
     return [
         (torch.randint(0, 10, size=(100, 1)), torch.randint(0, 10, size=(100, 1)), 1),
         (torch.randint(-20, 20, size=(100, 5)), torch.randint(-20, 20, size=(100, 5)), 1),
@@ -31,7 +30,6 @@ def test_case(request):
 
 @pytest.mark.parametrize("n_times", range(5))
 def test_compute(n_times, test_case):
-
     mpd = MeanPairwiseDistance()
 
     y_pred, y, batch_size = test_case
@@ -52,14 +50,12 @@ def test_compute(n_times, test_case):
 
 
 def _test_distrib_integration(device):
-
     from ignite.engine import Engine
 
     rank = idist.get_rank()
     torch.manual_seed(12 + rank)
 
     def _test(metric_device):
-
         n_iters = 100
         batch_size = 50
 
@@ -109,12 +105,10 @@ def update(engine, i):
 
 
 def _test_distrib_accumulator_device(device):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
     for metric_device in metric_devices:
-
         mpd = MeanPairwiseDistance(device=metric_device)
         for dev in [mpd._device, mpd._sum_of_distances.device]:
             assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}"
@@ -141,7 +135,6 @@ def test_accumulator_detached():
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -150,7 +143,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -160,7 +152,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -172,7 +163,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -182,7 +172,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
diff --git a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py
index d1dd17625584..7bf60889436c 100644
--- a/tests/ignite/metrics/test_mean_squared_error.py
+++ b/tests/ignite/metrics/test_mean_squared_error.py
@@ -30,7 +30,6 @@ def test_case(request):
 
 @pytest.mark.parametrize("n_times", range(5))
 def test_compute(n_times, test_case):
-
     mse = MeanSquaredError()
 
     y_pred, y, batch_size = test_case
@@ -54,7 +53,6 @@ def test_compute(n_times, test_case):
 
 
 def _test_distrib_integration(device, tol=1e-6):
-
     from ignite.engine import Engine
 
     rank = idist.get_rank()
@@ -97,12 +95,10 @@ def update(engine, i):
 
 
 def _test_distrib_accumulator_device(device):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
     for metric_device in metric_devices:
-
         device = torch.device(device)
         mse = MeanSquaredError(device=metric_device)
 
@@ -131,7 +127,6 @@ def test_accumulator_detached():
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -140,7 +135,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -150,7 +144,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -162,7 +155,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -172,7 +164,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
diff --git a/tests/ignite/metrics/test_metric.py b/tests/ignite/metrics/test_metric.py
index 10d7a715bdd8..b0ffc1df3a2d 100644
--- a/tests/ignite/metrics/test_metric.py
+++ b/tests/ignite/metrics/test_metric.py
@@ -1,5 +1,6 @@
 import numbers
 import os
+from typing import Dict, List
 from unittest.mock import MagicMock
 
 import numpy as np
@@ -10,8 +11,19 @@
 
 import ignite.distributed as idist
 from ignite.engine import Engine, Events, State
-from ignite.metrics import ConfusionMatrix, Precision, Recall
-from ignite.metrics.metric import BatchFiltered, BatchWise, EpochWise, Metric, reinit__is_reduced, sync_all_reduce
+from ignite.metrics import Accuracy, ConfusionMatrix, Precision, Recall
+from ignite.metrics.metric import (
+    BatchFiltered,
+    BatchWise,
+    EpochWise,
+    Metric,
+    reinit__is_reduced,
+    RunningBatchWise,
+    RunningEpochWise,
+    SingleEpochRunningBatchWise,
+    sync_all_reduce,
+)
+from ignite.utils import _tree_map
 
 
 class DummyMetric1(Metric):
@@ -190,19 +202,19 @@ def compute(self):
     assert m2_times_2.compute() == 200
 
     # __pow__
-    m0_pow_m1 = m0 ** m1
+    m0_pow_m1 = m0**m1
     m0.update([1, 10, 100])
     m1.update([1, 10, 100])
     assert m0_pow_m1.compute() == 1
     m0.update([2, 20, 200])
     m1.update([2, 20, 200])
-    assert m0_pow_m1.compute() == 2 ** 20
+    assert m0_pow_m1.compute() == 2**20
 
-    m2_pow_2 = m2 ** 2
+    m2_pow_2 = m2**2
     m2.update([1, 10, 100])
     assert m2_pow_2.compute() == 10000
 
-    m2_pow_2 = 0.99 ** m2
+    m2_pow_2 = 0.99**m2
     m2.update([1, 10, 100])
     assert m2_pow_2.compute() == 0.3660323412732292
 
@@ -404,7 +416,6 @@ def test_abstract_class():
 
 def test_pytorch_operators():
     def _test(composed_metric, metric_name, compute_true_value_fn):
-
         metrics = {
             metric_name: composed_metric,
         }
@@ -700,28 +711,30 @@ def _test_creating_on_xla_fails(device):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_sync_all_reduce_decorator(device)
     _test_invalid_sync_all_reduce(device)
     _test_compute_with_sync_all_reduce_doesnt_change_attributes(device)
 
+    test_state_dict()
+    test_load_state_dict()
+
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_sync_all_reduce_decorator(device)
     _test_invalid_sync_all_reduce(device)
     _test_compute_with_sync_all_reduce_doesnt_change_attributes(device)
+    test_state_dict()
+    test_load_state_dict()
 
 
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = "cpu" if not torch.cuda.is_available() else "cuda"
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -734,7 +747,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_sync_all_reduce_decorator(device)
     _test_invalid_sync_all_reduce(device)
@@ -745,7 +757,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_sync_all_reduce_decorator(device)
     _test_invalid_sync_all_reduce(device)
@@ -818,7 +829,6 @@ def update(self, output):
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="Skip if no GPU")
 def test_completed_on_cuda():
-
     # Checks https://github.com/pytorch/ignite/issues/1635#issuecomment-863026919
 
     class DummyMetric(Metric):
@@ -846,80 +856,133 @@ def test_usage_exception():
     m = DummyMetric2()
     with pytest.raises(TypeError, match=r"Unhandled usage type"):
         m.attach(engine, "dummy", usage=1)
-    with pytest.raises(ValueError, match=r"usage should be 'EpochWise.usage_name' or 'BatchWise.usage_name'"):
+    with pytest.raises(
+        ValueError,
+        match=r"usage should be '\(Running\)EpochWise.usage_name' or '\(\(SingleEpoch\)Running\)BatchWise.usage_name'",
+    ):
         m.attach(engine, "dummy", usage="fake")
 
 
-def test_epochwise_usage():
-    class MyMetric(Metric):
-        def __init__(self):
-            super(MyMetric, self).__init__()
-            self.value = []
+class DummyAccumulateInListMetric(Metric):
+    def __init__(self):
+        super(DummyAccumulateInListMetric, self).__init__()
+        self.value = []
 
-        def reset(self):
-            self.value = []
+    def reset(self):
+        self.value = []
 
-        def compute(self):
-            return self.value
+    def compute(self):
+        return self.value
 
-        def update(self, output):
-            self.value.append(output)
+    def update(self, output):
+        self.value.append(output)
 
-    def test(usage):
-        engine = Engine(lambda e, b: b)
 
-        m = MyMetric()
+@pytest.mark.parametrize("usage", ["epoch_wise", EpochWise.usage_name, EpochWise()])
+def test_epochwise_usage(usage):
+    engine = Engine(lambda e, b: b)
 
-        m.attach(engine, "ewm", usage=usage)
+    m = DummyAccumulateInListMetric()
 
-        @engine.on(Events.EPOCH_COMPLETED)
-        def _():
-            ewm = engine.state.metrics["ewm"]
-            assert len(ewm) == 3
-            assert ewm == [0, 1, 2]
+    m.attach(engine, "ewm", usage=usage)
 
-        engine.run([0, 1, 2], max_epochs=10)
-        m.detach(engine, usage=usage)
+    @engine.on(Events.EPOCH_COMPLETED)
+    def _():
+        ewm = engine.state.metrics["ewm"]
+        assert len(ewm) == 3
+        assert ewm == [0, 1, 2]
 
-    test("epoch_wise")
-    test(EpochWise.usage_name)
-    test(EpochWise())
+    engine.run([0, 1, 2], max_epochs=10)
+    m.detach(engine, usage=usage)
 
 
-def test_batchwise_usage():
-    class MyMetric(Metric):
-        def __init__(self):
-            super(MyMetric, self).__init__()
-            self.value = []
+class DummyAccumulateMetric(Metric):
+    def __init__(self):
+        super(DummyAccumulateMetric, self).__init__()
+        self.value = 0
 
-        def reset(self):
-            self.value = []
+    def reset(self):
+        self.value = 0
 
-        def compute(self):
-            return self.value
+    def compute(self):
+        return self.value
 
-        def update(self, output):
-            self.value.append(output)
+    def update(self, output):
+        self.value += output
+
+
+@pytest.mark.parametrize("usage", ["running_epoch_wise", RunningEpochWise.usage_name, RunningEpochWise()])
+def test_running_epochwise_usage(usage):
+    engine = Engine(lambda e, b: e.state.metrics["ewm"])
+
+    engine.state.metrics["ewm"] = 0
+
+    @engine.on(Events.EPOCH_STARTED)
+    def _():
+        engine.state.metrics["ewm"] += 1
 
-    def test(usage):
-        engine = Engine(lambda e, b: b)
+    m = DummyAccumulateMetric()
+    m.attach(engine, "rewm", usage=usage)
 
-        m = MyMetric()
+    @engine.on(Events.EPOCH_COMPLETED)
+    def _():
+        assert engine.state.metrics["rewm"] == sum(range(engine.state.epoch + 1))
 
-        m.attach(engine, "bwm", usage=usage)
+    engine.run([0, 1, 2], max_epochs=10)
 
-        @engine.on(Events.ITERATION_COMPLETED)
-        def _():
-            bwm = engine.state.metrics["bwm"]
-            assert len(bwm) == 1
-            assert bwm[0] == (engine.state.iteration - 1) % 3
+    m.detach(engine, usage=usage)
 
-        engine.run([0, 1, 2], max_epochs=10)
-        m.detach(engine, usage=usage)
 
-    test("batch_wise")
-    test(BatchWise.usage_name)
-    test(BatchWise())
+@pytest.mark.parametrize("usage", ["batch_wise", BatchWise.usage_name, BatchWise()])
+def test_batchwise_usage(usage):
+    engine = Engine(lambda e, b: b)
+
+    m = DummyAccumulateInListMetric()
+
+    m.attach(engine, "bwm", usage=usage)
+
+    @engine.on(Events.ITERATION_COMPLETED)
+    def _():
+        bwm = engine.state.metrics["bwm"]
+        assert len(bwm) == 1
+        assert bwm[0] == (engine.state.iteration - 1) % 3
+
+    engine.run([0, 1, 2], max_epochs=10)
+    m.detach(engine, usage=usage)
+
+
+@pytest.mark.parametrize("usage", ["running_batch_wise", RunningBatchWise.usage_name, RunningBatchWise()])
+def test_running_batchwise_usage(usage):
+    engine = Engine(lambda e, b: b)
+
+    m = DummyAccumulateMetric()
+    m.attach(engine, "rbwm", usage=usage)
+
+    @engine.on(Events.EPOCH_COMPLETED)
+    def _():
+        assert engine.state.metrics["rbwm"] == 6 * engine.state.epoch
+
+    engine.run([0, 1, 2, 3], max_epochs=10)
+
+    m.detach(engine, usage=usage)
+
+
+@pytest.mark.parametrize(
+    "usage", ["single_epoch_running_batch_wise", SingleEpochRunningBatchWise.usage_name, SingleEpochRunningBatchWise()]
+)
+def test_single_epoch_running_batchwise_usage(usage):
+    engine = Engine(lambda e, b: b)
+
+    m = DummyAccumulateMetric()
+    m.attach(engine, "rbwm", usage=usage)
+
+    @engine.on(Events.EPOCH_COMPLETED)
+    def _():
+        assert engine.state.metrics["rbwm"] == 6
+
+    engine.run([0, 1, 2, 3], max_epochs=10)
+
+    m.detach(engine, usage=usage)
 
 
 def test_batchfiltered_usage():
@@ -1069,3 +1132,284 @@ def update(self, output):
 
     with pytest.raises(ValueError, match=r"Output should have 2 items of the same length"):
         engine.run([0] * 10)
+
+
+class DummyMetric4(Metric):
+    _state_dict_all_req_keys = (
+        "dnumber",
+        "fnumber",
+        "tensor",
+        "tensor2",
+        "metric",
+        "metric_dict",
+        "metric_list",
+        "initially_none",
+    )
+
+    @staticmethod
+    def gen_expected_state(value):
+        expected_state = {
+            "dnumber": value + 1,
+            "fnumber": value + 2.234,
+            "tensor": torch.tensor(value + 2.5),
+            "tensor2": torch.tensor(value + 3.5),
+            "metric": {
+                "_num_correct": torch.tensor(value + 3),
+                "_num_examples": value + 4,
+            },
+            "metric_dict": {
+                "m1": {
+                    "_num_correct": torch.tensor(value + 5),
+                    "_num_examples": value + 6,
+                },
+                "m2": {
+                    "_numerator": torch.tensor([value + 7, value + 8]),
+                    "_denominator": torch.tensor([value + 9, value + 10]),
+                    "_weight": value,
+                    "_updated": True,
+                },
+                "n": value + 12,
+            },
+            "metric_list": [
+                {
+                    "_numerator": torch.tensor([value + 11, value + 12]),
+                    "_denominator": torch.tensor([value + 13, value + 14]),
+                    "_weight": value,
+                    "_updated": True,
+                },
+                {
+                    "_numerator": torch.tensor([value + 15, value + 16]),
+                    "_denominator": torch.tensor([value + 17, value + 18]),
+                    "_weight": value,
+                    "_updated": True,
+                },
+                value + 234,
+            ],
+            "initially_none": None,
+        }
+        return expected_state
+
+    def __init__(self, value):
+        super().reset()
+
+        self.expected_state = DummyMetric4.gen_expected_state(value)
+
+        self.dnumber = self.expected_state["dnumber"]
+        self.fnumber = self.expected_state["fnumber"]
+        self.tensor = self.expected_state["tensor"]
+        self.tensor2 = self.expected_state["tensor2"]
+
+        self.metric = Accuracy()
+        self.metric._num_correct = self.expected_state["metric"]["_num_correct"]
+        self.metric._num_examples = self.expected_state["metric"]["_num_examples"]
+
+        self.metric_dict: Dict[str, Metric] = {
+            "m1": Accuracy(),
+            "m2": Precision(),
+            "n": self.expected_state["metric_dict"]["n"],
+        }
+        self.metric_dict["m1"]._num_correct = self.expected_state["metric_dict"]["m1"]["_num_correct"]
+        self.metric_dict["m1"]._num_examples = self.expected_state["metric_dict"]["m1"]["_num_examples"]
+        self.metric_dict["m2"]._numerator = self.expected_state["metric_dict"]["m2"]["_numerator"]
+        self.metric_dict["m2"]._denominator = self.expected_state["metric_dict"]["m2"]["_denominator"]
+        self.metric_dict["m2"]._weight = self.expected_state["metric_dict"]["m2"]["_weight"]
+        self.metric_dict["m2"]._updated = self.expected_state["metric_dict"]["m2"]["_updated"]
+
+        self.metric_list: List[Metric] = [
+            Recall(),
+            Precision(),
+            self.expected_state["metric_list"][2],
+        ]
+        self.metric_list[0]._numerator = self.expected_state["metric_list"][0]["_numerator"]
+        self.metric_list[0]._denominator = self.expected_state["metric_list"][0]["_denominator"]
+        self.metric_list[0]._weight = self.expected_state["metric_list"][0]["_weight"]
+        self.metric_list[0]._updated = self.expected_state["metric_list"][0]["_updated"]
+
+        self.metric_list[1]._numerator = self.expected_state["metric_list"][1]["_numerator"]
+        self.metric_list[1]._denominator = self.expected_state["metric_list"][1]["_denominator"]
+        self.metric_list[1]._weight = self.expected_state["metric_list"][1]["_weight"]
+        self.metric_list[1]._updated = self.expected_state["metric_list"][1]["_updated"]
+
+        self.initially_none = None
+
+    def reset(self):
+        self.dnumber = -1
+        self.fnumber = -2.0
+        self.tensor = torch.tensor([-3])
+        self.tensor2 = 0
+        self.metric.reset()
+        for m in self.metric_dict.values():
+            if isinstance(m, Metric):
+                m.reset()
+        for m in self.metric_list:
+            if isinstance(m, Metric):
+                m.reset()
+        self.initially_none = None
+
+    def update(self, output):
+        pass
+
+    def compute(self):
+        pass
+
+
+def test_wrong_state_dict():
+    class WrongMetric(Metric):
+        _state_dict_all_req_keys = ("object",)
+
+        def __init__(self, value):
+            super().__init__()
+            self.object = value
+
+        def reset(self):
+            pass
+
+        def update(self, output):
+            pass
+
+        def compute(self):
+            pass
+
+    metric = WrongMetric(object())
+    with pytest.raises(TypeError, match="Found attribute of unsupported type. Currently, supported types include"):
+        metric.state_dict()
+
+    delattr(metric, "object")
+    with pytest.raises(ValueError, match="Found a value in _state_dict_all_req_keys that is not among"):
+        metric.state_dict()
+
+
+def test_wrong_load_state_dict():
+    metric = DummyMetric4(1)
+
+    with pytest.raises(TypeError, match="Argument state_dict should be a dictionary"):
+        metric.load_state_dict(123)
+
+    with pytest.raises(ValueError, match="Incorrect state_dict object. Argument state_dict should be a dictionary"):
+        metric.load_state_dict({"abc": 123})
+
+    with pytest.raises(ValueError, match="Expected a list of state_dicts of size equal world_size"):
+        metric.load_state_dict({Metric._Metric__state_dict_key_per_rank: []})
+
+
+# @pytest.mark.distributed
+# @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
+# @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
+# def test_distrib_state_dict_metric_in_metric(distributed_context_single_node_nccl):
+#     class _TestMetric(Metric):
+#         _state_dict_all_req_keys = ("metric", )
+#         def __init__(self):
+#             self.metric = Accuracy()
+
+#         def reset(self):
+#             self.metric.reset()
+
+#         def update(self, output):
+#             self.metric.update(output)
+
+#         def compute(self):
+#             return self.metric.compute()
+
+#     m = _TestMetric()
+#     m.update((
+#         torch.rand(4, 10),
+#         torch.randint(0, 10, size=(4, )),
+#     ))
+
+#     rank = idist.get_rank()
+
+#     import time
+#     time.sleep(rank * 0.1)
+
+#     print("m: ", m.state_dict())
+#     assert False
+
+
+def test_state_dict():
+    metric = DummyMetric4(1)
+    state = metric.state_dict()
+
+    assert isinstance(state, dict) and len(state) == 1 and Metric._Metric__state_dict_key_per_rank in state
+
+    rank = idist.get_rank()
+    ws = idist.get_world_size()
+
+    list_state_dicts = state[Metric._Metric__state_dict_key_per_rank]
+    assert len(list_state_dicts) == ws
+
+    state = list_state_dicts[rank]
+    expected_state = metric.expected_state
+    assert state.keys() == expected_state.keys()
+
+    # Flatten expected state and output state and compare values
+    output_flatten = []
+    expected_flatten = []
+
+    def get_func(flatten):
+        def wrapper(x, key):
+            if isinstance(x, Metric):
+                flatten.extend([(key, getattr(x, k)) for k in x._state_dict_all_req_keys])
+            else:
+                flatten.append((key, x))
+
+        return wrapper
+
+    _tree_map(get_func(expected_flatten), expected_state)
+    _tree_map(get_func(output_flatten), state)
+
+    assert len(output_flatten) == len(expected_flatten) and len(expected_flatten) > 0, (
+        expected_flatten,
+        output_flatten,
+    )
+
+    for key_output, key_expected in zip(output_flatten, expected_flatten):
+        key1, output = key_output
+        key2, expected = key_expected
+        assert key1 == key2, (key1, key2)
+        if isinstance(output, torch.Tensor):
+            assert isinstance(expected, torch.Tensor)
+            assert (output == expected).all(), (output, expected)
+        else:
+            assert output == expected, (output, expected)
+
+
+def test_load_state_dict():
+    metric = DummyMetric4(1)
+    state = metric.state_dict()
+
+    metric.reset()
+    metric.initially_none = 1
+    metric.load_state_dict(state)
+
+    rank = idist.get_rank()
+    world_size = idist.get_world_size()
+    assert len(state[Metric._Metric__state_dict_key_per_rank]) == world_size
+    expected_state = state[Metric._Metric__state_dict_key_per_rank][rank]
+
+    # Flatten expected state and output state and compare values
+    output_flatten = []
+    expected_flatten = []
+
+    def get_func(flatten):
+        def wrapper(x, **kwargs):
+            if isinstance(x, Metric):
+                flatten.extend([getattr(x, k) for k in x._state_dict_all_req_keys])
+            else:
+                flatten.append(x)
+
+        return wrapper
+
+    _tree_map(get_func(expected_flatten), expected_state)
+    _tree_map(get_func(output_flatten), {key: getattr(metric, key) for key in metric._state_dict_all_req_keys})
+
+    assert len(output_flatten) == len(expected_flatten) and len(expected_flatten) > 0, (
+        expected_flatten,
+        output_flatten,
+    )
+
+    for output, expected in zip(output_flatten, expected_flatten):
+        if isinstance(output, torch.Tensor):
+            assert isinstance(expected, torch.Tensor)
+            assert (output == expected).all(), (output, expected)
+        else:
+            assert output == expected, (output, expected)
diff --git a/tests/ignite/metrics/test_metrics_lambda.py b/tests/ignite/metrics/test_metrics_lambda.py
index b7a73158c4c3..af142d6ec82a 100644
--- a/tests/ignite/metrics/test_metrics_lambda.py
+++ b/tests/ignite/metrics/test_metrics_lambda.py
@@ -8,7 +8,7 @@
 
 import ignite.distributed as idist
 from ignite.engine import Engine
-from ignite.metrics import Metric, MetricsLambda, Precision, Recall
+from ignite.metrics import Accuracy, Metric, MetricsLambda, Precision, Recall
 
 
 class ListGatherMetric(Metric):
@@ -100,7 +100,6 @@ def fn(x, y, z, t):
 
 
 def test_metrics_lambda_update_and_attach_together():
-
     y_pred = torch.randint(0, 2, size=(15, 10, 4)).float()
     y = torch.randint(0, 2, size=(15, 10, 4)).long()
 
@@ -114,7 +113,7 @@ def update_fn(engine, batch):
     recall = Recall(average=False)
 
     def Fbeta(r, p, beta):
-        return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item()
+        return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item()
 
     F1 = MetricsLambda(Fbeta, recall, precision, 1)
 
@@ -138,7 +137,6 @@ def Fbeta(r, p, beta):
 
 
 def test_metrics_lambda_update():
-
     """
     Test if the underlying metrics are updated
     """
@@ -149,7 +147,7 @@ def test_metrics_lambda_update():
     recall = Recall(average=False)
 
     def Fbeta(r, p, beta):
-        return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item()
+        return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item()
 
     F1 = MetricsLambda(Fbeta, recall, precision, 1)
 
@@ -248,7 +246,7 @@ def update_fn(engine, batch):
     recall = Recall(average=False)
 
     def Fbeta(r, p, beta):
-        return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item()
+        return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item()
 
     F1 = MetricsLambda(Fbeta, recall, precision, 1)
 
@@ -272,9 +270,36 @@ def Fbeta(r, p, beta):
         assert precision_true == approx(precision), f"{precision_true} vs {precision}"
         assert recall_true == approx(recall), f"{recall_true} vs {recall}"
 
+    metric_state = F1.state_dict()
+    F1.reset()
+    F1.load_state_dict(metric_state)
+    f1_value = F1.compute()
+    assert f1_value == state.metrics["f1"]
+
+
+def test_load_state_dict():
+    acc = Accuracy()
+    error = 1.0 - acc
+
+    acc.update(
+        (
+            torch.randint(0, 2, size=(8,)),
+            torch.randint(0, 2, size=(8,)),
+        )
+    )
+
+    e = error.compute()
+    a = acc.compute()
+    assert 1.0 - a == e
+
+    metric_state = error.state_dict()
+    error.reset()
+    error.load_state_dict(metric_state)
+    e2 = error.compute()
+    assert e2 == e
 
-def test_state_metrics():
 
+def test_state_metrics():
     y_pred = torch.randint(0, 2, size=(15, 10, 4)).float()
     y = torch.randint(0, 2, size=(15, 10, 4)).long()
 
@@ -304,7 +329,6 @@ def data(y_pred, y):
 
 
 def test_state_metrics_ingredients_not_attached():
-
     y_pred = torch.randint(0, 2, size=(15, 10, 4)).float()
     y = torch.randint(0, 2, size=(15, 10, 4)).long()
 
@@ -333,7 +357,6 @@ def data(y_pred, y):
 
 def test_recursive_attachment():
     def _test(composed_metric, metric_name, compute_true_value_fn):
-
         metrics = {
             metric_name: composed_metric,
         }
@@ -397,7 +420,6 @@ def compute_true_somemetric(y_pred, y):
 
 
 def _test_distrib_integration(device):
-
     rank = idist.get_rank()
 
     n_iters = 10
@@ -425,7 +447,7 @@ def update_fn(engine, i):
         recall = Recall(average=False, device=metric_device)
 
         def Fbeta(r, p, beta):
-            return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item()
+            return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item()
 
         F1 = MetricsLambda(Fbeta, recall, precision, 1)
         F1.attach(evaluator, "f1")
@@ -474,7 +496,7 @@ def update(engine, i):
     recall = Recall(average=False, device=device)
 
     def Fbeta(r, p, beta):
-        return torch.mean((1 + beta ** 2) * p * r / (beta ** 2 * p + r)).item()
+        return torch.mean((1 + beta**2) * p * r / (beta**2 * p + r)).item()
 
     F1 = MetricsLambda(Fbeta, recall, precision, 1)
     F1.attach(evaluator, "f1")
@@ -499,7 +521,6 @@ def Fbeta(r, p, beta):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_metrics_on_diff_devices(device)
@@ -508,7 +529,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -517,7 +537,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -529,7 +548,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -538,7 +556,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_metrics_on_diff_devices(device)
diff --git a/tests/ignite/metrics/test_multilabel_confusion_matrix.py b/tests/ignite/metrics/test_multilabel_confusion_matrix.py
index 01c959332fb8..64893768cc4d 100644
--- a/tests/ignite/metrics/test_multilabel_confusion_matrix.py
+++ b/tests/ignite/metrics/test_multilabel_confusion_matrix.py
@@ -190,12 +190,10 @@ def _test(metric_device):
 
 
 def _test_distrib_accumulator_device(device):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
     for metric_device in metric_devices:
-
         cm = MultiLabelConfusionMatrix(num_classes=3, device=metric_device)
         assert cm._device == metric_device
         assert (
@@ -233,7 +231,6 @@ def test_simple_2D_input():
 
 
 def test_simple_ND_input():
-
     num_iters = 5
     num_samples = 100
     num_classes = 10
@@ -279,7 +276,6 @@ def test_simple_ND_input():
 
 
 def test_simple_batched():
-
     num_iters = 5
     num_samples = 100
     num_classes = 10
diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py
index f031ff16b1c7..bde62649e4ef 100644
--- a/tests/ignite/metrics/test_precision.py
+++ b/tests/ignite/metrics/test_precision.py
@@ -102,7 +102,6 @@ def ignite_average_to_scikit_average(average, data_type: str):
 
 @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted"])
 def test_binary_input(average):
-
     pr = Precision(average=average)
     assert pr._updated is False
 
@@ -131,7 +130,6 @@ def _test(y_pred, y, batch_size):
         ) == pytest.approx(pr_compute)
 
     def get_test_cases():
-
         test_cases = [
             # Binary accuracy on input of shape (N, 1) or (N, )
             (torch.randint(0, 2, size=(10,)), torch.randint(0, 2, size=(10,)), 1),
@@ -224,7 +222,6 @@ def test_multiclass_wrong_inputs():
 
 @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted"])
 def test_multiclass_input(average):
-
     pr = Precision(average=average)
     assert pr._updated is False
 
@@ -255,7 +252,6 @@ def _test(y_pred, y, batch_size):
             assert sk_compute == pytest.approx(pr_compute)
 
     def get_test_cases():
-
         test_cases = [
             # Multiclass input data of shape (N, ) and (N, C)
             (torch.rand(10, 6), torch.randint(0, 6, size=(10,)), 1),
@@ -325,7 +321,6 @@ def to_numpy_multilabel(y):
 
 @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted", "samples"])
 def test_multilabel_input(average):
-
     pr = Precision(average=average, is_multilabel=True)
     assert pr._updated is False
 
@@ -353,7 +348,6 @@ def _test(y_pred, y, batch_size):
             assert precision_score(np_y, np_y_pred, average=sk_average_parameter) == pytest.approx(pr_compute)
 
     def get_test_cases():
-
         test_cases = [
             # Multilabel input data of shape (N, C)
             (torch.randint(0, 2, size=(10, 5)), torch.randint(0, 2, size=(10, 5)), 1),
@@ -423,212 +417,210 @@ def test_incorrect_y_classes(average):
     assert pr._updated is False
 
 
-def test_distrib_integration_multiclass(distributed):
-    from ignite.engine import Engine
+@pytest.mark.usefixtures("distributed")
+class TestDistributed:
+    def test_integration_multiclass(self):
+        from ignite.engine import Engine
 
-    rank = idist.get_rank()
-    torch.manual_seed(12)
+        rank = idist.get_rank()
+        torch.manual_seed(12)
 
-    def _test(average, n_epochs, metric_device):
-        n_iters = 60
-        s = 16
-        n_classes = 7
+        def _test(average, n_epochs, metric_device):
+            n_iters = 60
+            s = 16
+            n_classes = 7
 
-        offset = n_iters * s
-        y_true = torch.randint(0, n_classes, size=(offset * idist.get_world_size(),)).to(device)
-        y_preds = torch.rand(offset * idist.get_world_size(), n_classes).to(device)
+            offset = n_iters * s
+            y_true = torch.randint(0, n_classes, size=(offset * idist.get_world_size(),)).to(device)
+            y_preds = torch.rand(offset * idist.get_world_size(), n_classes).to(device)
 
-        def update(engine, i):
-            return (
-                y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, :],
-                y_true[i * s + rank * offset : (i + 1) * s + rank * offset],
-            )
+            def update(engine, i):
+                return (
+                    y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, :],
+                    y_true[i * s + rank * offset : (i + 1) * s + rank * offset],
+                )
 
-        engine = Engine(update)
+            engine = Engine(update)
 
-        pr = Precision(average=average, device=metric_device)
-        pr.attach(engine, "pr")
-        assert pr._updated is False
-
-        data = list(range(n_iters))
-        engine.run(data=data, max_epochs=n_epochs)
-
-        assert "pr" in engine.state.metrics
-        assert pr._updated is True
-        res = engine.state.metrics["pr"]
-        if isinstance(res, torch.Tensor):
-            # Fixes https://github.com/pytorch/ignite/issues/1635#issuecomment-863026919
-            assert res.device.type == "cpu"
-            res = res.cpu().numpy()
+            pr = Precision(average=average, device=metric_device)
+            pr.attach(engine, "pr")
+            assert pr._updated is False
 
-        sk_average_parameter = ignite_average_to_scikit_average(average, "multiclass")
-        true_res = precision_score(
-            y_true.cpu().numpy(), torch.argmax(y_preds, dim=1).cpu().numpy(), average=sk_average_parameter
-        )
+            data = list(range(n_iters))
+            engine.run(data=data, max_epochs=n_epochs)
 
-        assert pytest.approx(res) == true_res
+            assert "pr" in engine.state.metrics
+            assert pr._updated is True
+            res = engine.state.metrics["pr"]
+            if isinstance(res, torch.Tensor):
+                # Fixes https://github.com/pytorch/ignite/issues/1635#issuecomment-863026919
+                assert res.device.type == "cpu"
+                res = res.cpu().numpy()
 
-    metric_devices = [torch.device("cpu")]
-    device = idist.device()
-    if device.type != "xla":
-        metric_devices.append(idist.device())
-    for _ in range(2):
-        for metric_device in metric_devices:
-            _test(average=False, n_epochs=1, metric_device=metric_device)
-            _test(average=False, n_epochs=2, metric_device=metric_device)
-            _test(average="macro", n_epochs=1, metric_device=metric_device)
-            _test(average="macro", n_epochs=2, metric_device=metric_device)
-            _test(average="weighted", n_epochs=1, metric_device=metric_device)
-            _test(average="weighted", n_epochs=2, metric_device=metric_device)
-            _test(average="micro", n_epochs=1, metric_device=metric_device)
-            _test(average="micro", n_epochs=2, metric_device=metric_device)
-
-
-def test_distrib_integration_multilabel(distributed):
-
-    from ignite.engine import Engine
-
-    rank = idist.get_rank()
-    torch.manual_seed(12)
-
-    def _test(average, n_epochs, metric_device):
-        n_iters = 60
-        s = 16
-        n_classes = 7
-
-        offset = n_iters * s
-        y_true = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device)
-        y_preds = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device)
-
-        def update(engine, i):
-            return (
-                y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, ...],
-                y_true[i * s + rank * offset : (i + 1) * s + rank * offset, ...],
+            sk_average_parameter = ignite_average_to_scikit_average(average, "multiclass")
+            true_res = precision_score(
+                y_true.cpu().numpy(), torch.argmax(y_preds, dim=1).cpu().numpy(), average=sk_average_parameter
             )
 
-        engine = Engine(update)
-
-        pr = Precision(average=average, is_multilabel=True, device=metric_device)
-        pr.attach(engine, "pr")
-        assert pr._updated is False
-
-        data = list(range(n_iters))
-        engine.run(data=data, max_epochs=n_epochs)
-
-        assert "pr" in engine.state.metrics
-        assert pr._updated is True
-        res = engine.state.metrics["pr"]
-        res2 = pr.compute()
-        if isinstance(res, torch.Tensor):
-            res = res.cpu().numpy()
-            res2 = res2.cpu().numpy()
-            assert (res == res2).all()
-        else:
-            assert res == res2
-
-        np_y_preds = to_numpy_multilabel(y_preds)
-        np_y_true = to_numpy_multilabel(y_true)
-        assert pr._type == "multilabel"
-        sk_average_parameter = ignite_average_to_scikit_average(average, "multilabel")
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", category=UndefinedMetricWarning)
-            assert precision_score(np_y_true, np_y_preds, average=sk_average_parameter) == pytest.approx(res)
-
-    metric_devices = ["cpu"]
-    device = idist.device()
-    if device.type != "xla":
-        metric_devices.append(idist.device())
-    for _ in range(2):
-        for metric_device in metric_devices:
-            _test(average=False, n_epochs=1, metric_device=metric_device)
-            _test(average=False, n_epochs=2, metric_device=metric_device)
-            _test(average="macro", n_epochs=1, metric_device=metric_device)
-            _test(average="macro", n_epochs=2, metric_device=metric_device)
-            _test(average="micro", n_epochs=1, metric_device=metric_device)
-            _test(average="micro", n_epochs=2, metric_device=metric_device)
-            _test(average="weighted", n_epochs=1, metric_device=metric_device)
-            _test(average="weighted", n_epochs=2, metric_device=metric_device)
-            _test(average="samples", n_epochs=1, metric_device=metric_device)
-            _test(average="samples", n_epochs=2, metric_device=metric_device)
-
-
-def test_distrib_accumulator_device(distributed):
-    # Binary accuracy on input of shape (N, 1) or (N, )
-
-    def _test(average, metric_device):
-        pr = Precision(average=average, device=metric_device)
-        assert pr._device == metric_device
-        assert pr._updated is False
-        # Since the shape of the accumulated amount isn't known before the first update
-        # call, the internal variables aren't tensors on the right device yet.
-
-        y_pred = torch.randint(0, 2, size=(10,))
-        y = torch.randint(0, 2, size=(10,)).long()
-        pr.update((y_pred, y))
-
-        assert pr._updated is True
+            assert pytest.approx(res) == true_res
+
+        metric_devices = [torch.device("cpu")]
+        device = idist.device()
+        if device.type != "xla":
+            metric_devices.append(idist.device())
+        for _ in range(2):
+            for metric_device in metric_devices:
+                _test(average=False, n_epochs=1, metric_device=metric_device)
+                _test(average=False, n_epochs=2, metric_device=metric_device)
+                _test(average="macro", n_epochs=1, metric_device=metric_device)
+                _test(average="macro", n_epochs=2, metric_device=metric_device)
+                _test(average="weighted", n_epochs=1, metric_device=metric_device)
+                _test(average="weighted", n_epochs=2, metric_device=metric_device)
+                _test(average="micro", n_epochs=1, metric_device=metric_device)
+                _test(average="micro", n_epochs=2, metric_device=metric_device)
+
+    def test_integration_multilabel(self):
+        from ignite.engine import Engine
+
+        rank = idist.get_rank()
+        torch.manual_seed(12)
+
+        def _test(average, n_epochs, metric_device):
+            n_iters = 60
+            s = 16
+            n_classes = 7
+
+            offset = n_iters * s
+            y_true = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device)
+            y_preds = torch.randint(0, 2, size=(offset * idist.get_world_size(), n_classes, 6, 8)).to(device)
+
+            def update(engine, i):
+                return (
+                    y_preds[i * s + rank * offset : (i + 1) * s + rank * offset, ...],
+                    y_true[i * s + rank * offset : (i + 1) * s + rank * offset, ...],
+                )
+
+            engine = Engine(update)
+
+            pr = Precision(average=average, is_multilabel=True, device=metric_device)
+            pr.attach(engine, "pr")
+            assert pr._updated is False
+
+            data = list(range(n_iters))
+            engine.run(data=data, max_epochs=n_epochs)
+
+            assert "pr" in engine.state.metrics
+            assert pr._updated is True
+            res = engine.state.metrics["pr"]
+            res2 = pr.compute()
+            if isinstance(res, torch.Tensor):
+                res = res.cpu().numpy()
+                res2 = res2.cpu().numpy()
+                assert (res == res2).all()
+            else:
+                assert res == res2
+
+            np_y_preds = to_numpy_multilabel(y_preds)
+            np_y_true = to_numpy_multilabel(y_true)
+            assert pr._type == "multilabel"
+            sk_average_parameter = ignite_average_to_scikit_average(average, "multilabel")
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", category=UndefinedMetricWarning)
+                assert precision_score(np_y_true, np_y_preds, average=sk_average_parameter) == pytest.approx(res)
+
+        metric_devices = ["cpu"]
+        device = idist.device()
+        if device.type != "xla":
+            metric_devices.append(idist.device())
+        for _ in range(2):
+            for metric_device in metric_devices:
+                _test(average=False, n_epochs=1, metric_device=metric_device)
+                _test(average=False, n_epochs=2, metric_device=metric_device)
+                _test(average="macro", n_epochs=1, metric_device=metric_device)
+                _test(average="macro", n_epochs=2, metric_device=metric_device)
+                _test(average="micro", n_epochs=1, metric_device=metric_device)
+                _test(average="micro", n_epochs=2, metric_device=metric_device)
+                _test(average="weighted", n_epochs=1, metric_device=metric_device)
+                _test(average="weighted", n_epochs=2, metric_device=metric_device)
+                _test(average="samples", n_epochs=1, metric_device=metric_device)
+                _test(average="samples", n_epochs=2, metric_device=metric_device)
+
+    def test_accumulator_device(self):
+        # Binary input of shape (N, ): after `update`, Precision accumulators must live on `metric_device`
+
+        def _test(average, metric_device):
+            pr = Precision(average=average, device=metric_device)
+            assert pr._device == metric_device
+            assert pr._updated is False
+            # Since the shape of the accumulated amount isn't known before the first update
+            # call, the internal variables aren't tensors on the right device yet.
+
+            y_pred = torch.randint(0, 2, size=(10,))
+            y = torch.randint(0, 2, size=(10,)).long()
+            pr.update((y_pred, y))
 
-        assert (
-            pr._numerator.device == metric_device
-        ), f"{type(pr._numerator.device)}:{pr._numerator.device} vs {type(metric_device)}:{metric_device}"
+            assert pr._updated is True
 
-        if average != "samples":
-            # For average='samples', `_denominator` is of type `int` so it has not `device` member.
             assert (
-                pr._denominator.device == metric_device
-            ), f"{type(pr._denominator.device)}:{pr._denominator.device} vs {type(metric_device)}:{metric_device}"
-
-        if average == "weighted":
-            assert pr._weight.device == metric_device, f"{type(pr._weight.device)}:{pr._weight.device} vs "
-            f"{type(metric_device)}:{metric_device}"
-
-    metric_devices = [torch.device("cpu")]
-    device = idist.device()
-    if device.type != "xla":
-        metric_devices.append(idist.device())
-    for metric_device in metric_devices:
-        _test(False, metric_device=metric_device)
-        _test("macro", metric_device=metric_device)
-        _test("micro", metric_device=metric_device)
-        _test("weighted", metric_device=metric_device)
-
-
-def test_distrib_multilabel_accumulator_device(distributed):
-    # Multiclass input data of shape (N, ) and (N, C)
+                pr._numerator.device == metric_device
+            ), f"{type(pr._numerator.device)}:{pr._numerator.device} vs {type(metric_device)}:{metric_device}"
+
+            if average != "samples":
+                # For average='samples', `_denominator` is an `int`, so it has no `device` attribute.
+                assert (
+                    pr._denominator.device == metric_device
+                ), f"{type(pr._denominator.device)}:{pr._denominator.device} vs {type(metric_device)}:{metric_device}"
+
+            if average == "weighted":
+                msg = f"{type(pr._weight.device)}:{pr._weight.device} vs {type(metric_device)}:{metric_device}"
+                assert pr._weight.device == metric_device, msg
+
+        metric_devices = [torch.device("cpu")]
+        device = idist.device()
+        if device.type != "xla":
+            metric_devices.append(idist.device())
+        for metric_device in metric_devices:
+            _test(False, metric_device=metric_device)
+            _test("macro", metric_device=metric_device)
+            _test("micro", metric_device=metric_device)
+            _test("weighted", metric_device=metric_device)
 
-    def _test(average, metric_device):
-        pr = Precision(is_multilabel=True, average=average, device=metric_device)
+    def test_multilabel_accumulator_device(self):
+        # Multilabel input of size (10, 4, 20, 23): after `update`, accumulators must live on `metric_device`
 
-    def _test(average, metric_device):
-        pr = Precision(is_multilabel=True, average=average, device=metric_device)
+        def _test(average, metric_device):
+            pr = Precision(is_multilabel=True, average=average, device=metric_device)
 
-        assert pr._updated is False
-        assert pr._device == metric_device
+            assert pr._updated is False
+            assert pr._device == metric_device
 
-        y_pred = torch.randint(0, 2, size=(10, 4, 20, 23))
-        y = torch.randint(0, 2, size=(10, 4, 20, 23)).long()
-        pr.update((y_pred, y))
+            y_pred = torch.randint(0, 2, size=(10, 4, 20, 23))
+            y = torch.randint(0, 2, size=(10, 4, 20, 23)).long()
+            pr.update((y_pred, y))
 
-        assert pr._updated is True
+            assert pr._updated is True
 
-        assert (
-            pr._numerator.device == metric_device
-        ), f"{type(pr._numerator.device)}:{pr._numerator.device} vs {type(metric_device)}:{metric_device}"
+            assert pr._updated is True
 
-        if average != "samples":
-            # For average='samples', `_denominator` is of type `int` so it has not `device` member.
             assert (
-                pr._denominator.device == metric_device
-            ), f"{type(pr._denominator.device)}:{pr._denominator.device} vs {type(metric_device)}:{metric_device}"
-
-        if average == "weighted":
-            assert pr._weight.device == metric_device, f"{type(pr._weight.device)}:{pr._weight.device} vs "
-            f"{type(metric_device)}:{metric_device}"
-
-    metric_devices = [torch.device("cpu")]
-    device = idist.device()
-    if device.type != "xla":
-        metric_devices.append(idist.device())
-    for metric_device in metric_devices:
-        _test(False, metric_device=metric_device)
-        _test("macro", metric_device=metric_device)
-        _test("micro", metric_device=metric_device)
-        _test("weighted", metric_device=metric_device)
-        _test("samples", metric_device=metric_device)
+                pr._numerator.device == metric_device
+            ), f"{type(pr._numerator.device)}:{pr._numerator.device} vs {type(metric_device)}:{metric_device}"
+
+            if average != "samples":
+                # For average='samples', `_denominator` is an `int`, so it has no `device` attribute.
+                assert (
+                    pr._denominator.device == metric_device
+                ), f"{type(pr._denominator.device)}:{pr._denominator.device} vs {type(metric_device)}:{metric_device}"
+
+            if average == "weighted":
+                msg = f"{type(pr._weight.device)}:{pr._weight.device} vs {type(metric_device)}:{metric_device}"
+                assert pr._weight.device == metric_device, msg
+
+        metric_devices = [torch.device("cpu")]
+        device = idist.device()
+        if device.type != "xla":
+            metric_devices.append(idist.device())
+        for metric_device in metric_devices:
+            _test(False, metric_device=metric_device)
+            _test("macro", metric_device=metric_device)
+            _test("micro", metric_device=metric_device)
+            _test("weighted", metric_device=metric_device)
+            _test("samples", metric_device=metric_device)
diff --git a/tests/ignite/metrics/test_psnr.py b/tests/ignite/metrics/test_psnr.py
index ec85111da3e9..1bd06e3e2cb7 100644
--- a/tests/ignite/metrics/test_psnr.py
+++ b/tests/ignite/metrics/test_psnr.py
@@ -112,119 +112,117 @@ def update(engine, i):
     assert np.allclose(result, np_psnr / np_y.shape[0], atol=atol)
 
 
-def test_distrib_input_float(distributed):
-    device = idist.device()
-
-    def get_test_cases():
-
-        y_pred = torch.rand(n_iters * batch_size, 2, 2, device=device)
-        y = y_pred * 0.65
-
-        return y_pred, y
-
-    n_iters = 100
-    batch_size = 10
-
-    rank = idist.get_rank()
-    for i in range(3):
-        # check multiple random inputs as random exact occurencies are rare
-        torch.manual_seed(42 + rank + i)
-        y_pred, y = get_test_cases()
-        _test(y_pred, y, 1, "cpu", n_iters, batch_size, atol=1e-8)
-        if device.type != "xla":
-            _test(y_pred, y, 1, idist.device(), n_iters, batch_size, atol=1e-8)
-
-
-def test_distrib_multilabel_input_YCbCr(distributed):
-    device = idist.device()
-
-    def get_test_cases():
-
-        y_pred = torch.randint(16, 236, (n_iters * batch_size, 1, 12, 12), dtype=torch.uint8, device=device)
-        cbcr_pred = torch.randint(16, 241, (n_iters * batch_size, 2, 12, 12), dtype=torch.uint8, device=device)
-        y = torch.randint(16, 236, (n_iters * batch_size, 1, 12, 12), dtype=torch.uint8, device=device)
-        cbcr = torch.randint(16, 241, (n_iters * batch_size, 2, 12, 12), dtype=torch.uint8, device=device)
-
-        y_pred, y = torch.cat((y_pred, cbcr_pred), dim=1), torch.cat((y, cbcr), dim=1)
-
-        return y_pred, y
-
-    n_iters = 100
-    batch_size = 10
-
-    def out_fn(x):
-        return x[0][:, 0, ...], x[1][:, 0, ...]
-
-    rank = idist.get_rank()
-    for i in range(3):
-        # check multiple random inputs as random exact occurencies are rare
-        torch.manual_seed(42 + rank + i)
-        y_pred, y = get_test_cases()
-        _test(y_pred, y, 220, "cpu", n_iters, batch_size, atol=1e-8, output_transform=out_fn, compute_y_channel=True)
-        if device.type != "xla":
-            dev = idist.device()
-            _test(y_pred, y, 220, dev, n_iters, batch_size, atol=1e-8, output_transform=out_fn, compute_y_channel=True)
-
-
-def test_distrib_multilabel_input_uint8(distributed):
-    device = idist.device()
-
-    def get_test_cases():
-
-        y_pred = torch.randint(0, 256, (n_iters * batch_size, 3, 16, 16), device=device, dtype=torch.uint8)
-        y = (y_pred * 0.65).to(torch.uint8)
-
-        return y_pred, y
-
-    n_iters = 100
-    batch_size = 10
-
-    rank = idist.get_rank()
-    for i in range(3):
-        # check multiple random inputs as random exact occurencies are rare
-        torch.manual_seed(42 + rank + i)
-        y_pred, y = get_test_cases()
-        _test(y_pred, y, 100, "cpu", n_iters, batch_size, atol=1e-8)
-        if device.type != "xla":
-            _test(y_pred, y, 100, idist.device(), n_iters, batch_size, atol=1e-8)
-
-
-def test_distrib_multilabel_input_NHW(distributed):
-    device = idist.device()
-
-    def get_test_cases():
-
-        y_pred = torch.rand(n_iters * batch_size, 28, 28, device=device)
-        y = y_pred * 0.8
-
-        return y_pred, y
-
-    n_iters = 100
-    batch_size = 10
-
-    rank = idist.get_rank()
-    for i in range(3):
-        # check multiple random inputs as random exact occurencies are rare
-        torch.manual_seed(42 + rank + i)
-        y_pred, y = get_test_cases()
-        _test(y_pred, y, 10, "cpu", n_iters, batch_size, atol=1e-8)
-        if device.type != "xla":
-            _test(y_pred, y, 10, idist.device(), n_iters, batch_size, atol=1e-8)
-
-
-def test_distrib_accumulator_device(distributed):
-    device = idist.device()
-    metric_devices = [torch.device("cpu")]
-    if torch.device(device).type != "xla":
-        metric_devices.append(idist.device())
-
-    for metric_device in metric_devices:
-        psnr = PSNR(data_range=1.0, device=metric_device)
-        dev = psnr._device
-        assert dev == metric_device, f"{dev} vs {metric_device}"
-
-        y_pred = torch.rand(2, 3, 28, 28, dtype=torch.float, device=device)
-        y = y_pred * 0.65
-        psnr.update((y_pred, y))
-        dev = psnr._sum_of_batchwise_psnr.device
-        assert dev == metric_device, f"{dev} vs {metric_device}"
+@pytest.mark.usefixtures("distributed")
+class TestDistributed:  # every test below runs with the "distributed" fixture applied
+    def test_input_float(self):
+        device = idist.device()
+
+        def get_test_cases():
+            y_pred = torch.rand(n_iters * batch_size, 2, 2, device=device)
+            y = y_pred * 0.65
+
+            return y_pred, y
+
+        n_iters = 100
+        batch_size = 10
+
+        rank = idist.get_rank()
+        for i in range(3):
+            # check multiple random inputs (seeded per rank and iteration) as random exact occurrences are rare
+            torch.manual_seed(42 + rank + i)
+            y_pred, y = get_test_cases()
+            _test(y_pred, y, 1, "cpu", n_iters, batch_size, atol=1e-8)
+            if device.type != "xla":  # off XLA, repeat with idist.device() in place of "cpu"
+                _test(y_pred, y, 1, idist.device(), n_iters, batch_size, atol=1e-8)
+
+    def test_multilabel_input_YCbCr(self):
+        device = idist.device()
+
+        def get_test_cases():
+            y_pred = torch.randint(16, 236, (n_iters * batch_size, 1, 12, 12), dtype=torch.uint8, device=device)
+            cbcr_pred = torch.randint(16, 241, (n_iters * batch_size, 2, 12, 12), dtype=torch.uint8, device=device)
+            y = torch.randint(16, 236, (n_iters * batch_size, 1, 12, 12), dtype=torch.uint8, device=device)
+            cbcr = torch.randint(16, 241, (n_iters * batch_size, 2, 12, 12), dtype=torch.uint8, device=device)
+
+            y_pred, y = torch.cat((y_pred, cbcr_pred), dim=1), torch.cat((y, cbcr), dim=1)
+
+            return y_pred, y
+
+        n_iters = 100
+        batch_size = 10
+
+        def out_fn(x):
+            return x[0][:, 0, ...], x[1][:, 0, ...]  # index 0 along dim 1: the 1-channel tensor concatenated first
+
+        rank = idist.get_rank()
+        for i in range(3):
+            # check multiple random inputs (seeded per rank and iteration) as random exact occurrences are rare
+            torch.manual_seed(42 + rank + i)
+            y_pred, y = get_test_cases()
+            _test(
+                y_pred, y, 220, "cpu", n_iters, batch_size, atol=1e-8, output_transform=out_fn, compute_y_channel=True
+            )
+            if device.type != "xla":  # off XLA, repeat with idist.device() in place of "cpu"
+                dev = idist.device()
+                _test(
+                    y_pred, y, 220, dev, n_iters, batch_size, atol=1e-8, output_transform=out_fn, compute_y_channel=True
+                )
+
+    def test_multilabel_input_uint8(self):
+        device = idist.device()
+
+        def get_test_cases():
+            y_pred = torch.randint(0, 256, (n_iters * batch_size, 3, 16, 16), device=device, dtype=torch.uint8)
+            y = (y_pred * 0.65).to(torch.uint8)
+
+            return y_pred, y
+
+        n_iters = 100
+        batch_size = 10
+
+        rank = idist.get_rank()
+        for i in range(3):
+            # check multiple random inputs (seeded per rank and iteration) as random exact occurrences are rare
+            torch.manual_seed(42 + rank + i)
+            y_pred, y = get_test_cases()
+            _test(y_pred, y, 100, "cpu", n_iters, batch_size, atol=1e-8)
+            if device.type != "xla":  # off XLA, repeat with idist.device() in place of "cpu"
+                _test(y_pred, y, 100, idist.device(), n_iters, batch_size, atol=1e-8)
+
+    def test_multilabel_input_NHW(self):
+        device = idist.device()
+
+        def get_test_cases():
+            y_pred = torch.rand(n_iters * batch_size, 28, 28, device=device)
+            y = y_pred * 0.8
+
+            return y_pred, y
+
+        n_iters = 100
+        batch_size = 10
+
+        rank = idist.get_rank()
+        for i in range(3):
+            # check multiple random inputs (seeded per rank and iteration) as random exact occurrences are rare
+            torch.manual_seed(42 + rank + i)
+            y_pred, y = get_test_cases()
+            _test(y_pred, y, 10, "cpu", n_iters, batch_size, atol=1e-8)
+            if device.type != "xla":  # off XLA, repeat with idist.device() in place of "cpu"
+                _test(y_pred, y, 10, idist.device(), n_iters, batch_size, atol=1e-8)
+
+    def test_accumulator_device(self):
+        device = idist.device()
+        metric_devices = [torch.device("cpu")]
+        if torch.device(device).type != "xla":  # idist.device() is only tried as a metric device off XLA
+            metric_devices.append(idist.device())
+
+        for metric_device in metric_devices:
+            psnr = PSNR(data_range=1.0, device=metric_device)
+            dev = psnr._device
+            assert dev == metric_device, f"{dev} vs {metric_device}"  # metric keeps the device it was built with
+
+            y_pred = torch.rand(2, 3, 28, 28, dtype=torch.float, device=device)
+            y = y_pred * 0.65
+            psnr.update((y_pred, y))
+            dev = psnr._sum_of_batchwise_psnr.device
+            assert dev == metric_device, f"{dev} vs {metric_device}"  # accumulator is on the metric device after update
diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py
index 8aae0df95ec7..de6717d00760 100644
--- a/tests/ignite/metrics/test_recall.py
+++ b/tests/ignite/metrics/test_recall.py
@@ -28,7 +28,6 @@ def test_no_update():
 
 
 def test_average_parameter():
-
     re = Recall(average="samples")
     with pytest.raises(
         ValueError, match=r"Argument average='samples' is incompatible with binary and multiclass input data."
@@ -107,7 +106,6 @@ def ignite_average_to_scikit_average(average, data_type: str):
 
 @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted"])
 def test_binary_input(average):
-
     re = Recall(average=average)
     assert re._updated is False
 
@@ -134,7 +132,6 @@ def _test(y_pred, y, batch_size):
         assert recall_score(np_y, np_y_pred, average=sk_average_parameter, labels=[0, 1]) == pytest.approx(re_compute)
 
     def get_test_cases():
-
         test_cases = [
             # Binary accuracy on input of shape (N, 1) or (N, )
             (torch.randint(0, 2, size=(10,)), torch.randint(0, 2, size=(10,)), 1),
@@ -227,7 +224,6 @@ def test_multiclass_wrong_inputs():
 
 @pytest.mark.parametrize("average", [None, False, "macro", "micro", "weighted"])
 def test_multiclass_input(average):
-
     re = Recall(average=average)
     assert re._updated is False
 
@@ -258,7 +254,6 @@ def _test(y_pred, y, batch_size):
             assert sk_compute == pytest.approx(re_compute)
 
     def get_test_cases():
-
         test_cases = [
             # Multiclass input data of shape (N, ) and (N, C)
             (torch.rand(10, 6), torch.randint(0, 6, size=(10,)), 1),
@@ -328,7 +323,6 @@ def to_numpy_multilabel(y):
 
 @pytest.mark.parametrize("average", [None, False, "macro", "micro", "samples"])
 def test_multilabel_input(average):
-
     re = Recall(average=average, is_multilabel=True)
     assert re._updated is False
 
@@ -356,7 +350,6 @@ def _test(y_pred, y, batch_size):
             assert recall_score(np_y, np_y_pred, average=sk_average_parameter) == pytest.approx(re_compute)
 
     def get_test_cases():
-
         test_cases = [
             # Multilabel input data of shape (N, C)
             (torch.randint(0, 2, size=(10, 5)), torch.randint(0, 2, size=(10, 5)), 1),
@@ -427,7 +420,6 @@ def test_incorrect_y_classes(average):
 
 
 def _test_distrib_integration_multiclass(device):
-
     from ignite.engine import Engine
 
     def _test(average, n_epochs, metric_device):
@@ -489,7 +481,6 @@ def update(engine, i):
 
 
 def _test_distrib_integration_multilabel(device):
-
     from ignite.engine import Engine
 
     torch.manual_seed(12)
@@ -642,7 +633,6 @@ def _test(average, metric_device):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
@@ -653,7 +643,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
@@ -665,7 +654,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -679,7 +667,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
@@ -691,7 +678,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration_multiclass(device)
     _test_distrib_integration_multilabel(device)
diff --git a/tests/ignite/metrics/test_root_mean_squared_error.py b/tests/ignite/metrics/test_root_mean_squared_error.py
index ed6cfa5bc5bc..ebdd84aa08d4 100644
--- a/tests/ignite/metrics/test_root_mean_squared_error.py
+++ b/tests/ignite/metrics/test_root_mean_squared_error.py
@@ -30,7 +30,6 @@ def test_data(request):
 
 @pytest.mark.parametrize("n_times", range(3))
 def test_compute(n_times, test_data):
-
     rmse = RootMeanSquaredError()
 
     y_pred, y, batch_size = test_data
@@ -54,7 +53,6 @@ def test_compute(n_times, test_data):
 
 
 def _test_distrib_integration(device, tol=1e-6):
-
     from ignite.engine import Engine
 
     rank = idist.get_rank()
@@ -98,7 +96,6 @@ def update(engine, i):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -106,7 +103,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -115,7 +111,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -126,7 +121,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
 
@@ -135,7 +129,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
 
diff --git a/tests/ignite/metrics/test_running_average.py b/tests/ignite/metrics/test_running_average.py
index 9d034c1c781b..dc434bda636d 100644
--- a/tests/ignite/metrics/test_running_average.py
+++ b/tests/ignite/metrics/test_running_average.py
@@ -1,5 +1,6 @@
-import os
+import warnings
 from functools import partial
+from itertools import accumulate
 
 import numpy as np
 import pytest
@@ -8,6 +9,7 @@
 import ignite.distributed as idist
 from ignite.engine import Engine, Events
 from ignite.metrics import Accuracy, RunningAverage
+from ignite.metrics.metric import RunningBatchWise, RunningEpochWise, SingleEpochRunningBatchWise
 
 
 def test_wrong_input_args():
@@ -26,171 +28,156 @@ def test_wrong_input_args():
     with pytest.raises(ValueError, match=r"Argument device should be None if src is a Metric"):
         RunningAverage(Accuracy(), device="cpu")
 
+    with pytest.warns(UserWarning, match=r"`epoch_bound` is deprecated and will be removed in the future."):
+        m = RunningAverage(Accuracy(), epoch_bound=True)
 
-def test_integration():
 
-    n_iters = 100
+@pytest.mark.filterwarnings("ignore")
+@pytest.mark.parametrize("epoch_bound, usage", [(False, RunningBatchWise()), (True, SingleEpochRunningBatchWise())])
+def test_epoch_bound(epoch_bound, usage):  # deprecated `epoch_bound` must attach like the paired `usage`
+    with warnings.catch_warnings():
+        metric = RunningAverage(output_transform=lambda _: _, epoch_bound=epoch_bound)
+    e1, e2 = Engine(lambda _, __: None), Engine(lambda _, __: None)
+    metric.attach(e1, "")  # no usage given: the deprecated flag alone decides which events are used
+    metric.epoch_bound = None  # clear the flag so the second attach relies on the explicit `usage` only
+    metric.attach(e2, "", usage)
+    registered = [{ev: [h[0] for h in hs] for ev, hs in e._event_handlers.items()} for e in (e1, e2)]
+    assert registered[0] == registered[1]  # h[0] assumed to be the callable; other fields may hold the engine
+
+
+@pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise()])
+def test_integration_batchwise(usage):
+    torch.manual_seed(10)
+    alpha = 0.98
+    n_iters = 10
     batch_size = 10
     n_classes = 10
-    y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size)))
-    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
-    loss_values = iter(range(n_iters))
+    max_epochs = 3
+    data = list(range(n_iters))
+    loss = torch.arange(n_iters, dtype=torch.float)
+    y_true = torch.randint(0, n_classes, size=(n_iters, batch_size))
+    y_pred = torch.rand(n_iters, batch_size, n_classes)
+
+    accuracy_running_averages = torch.tensor(
+        list(
+            accumulate(
+                map(
+                    lambda y_yp: torch.sum(y_yp[1].argmax(dim=-1) == y_yp[0]).item() / y_yp[0].size(0),
+                    zip(
+                        y_true if isinstance(usage, SingleEpochRunningBatchWise) else y_true.repeat(max_epochs, 1),
+                        y_pred if isinstance(usage, SingleEpochRunningBatchWise) else y_pred.repeat(max_epochs, 1, 1),
+                    ),
+                ),
+                lambda ra, acc: ra * alpha + (1 - alpha) * acc,
+            )
+        )
+    )
+    if isinstance(usage, SingleEpochRunningBatchWise):
+        accuracy_running_averages = accuracy_running_averages.repeat(max_epochs)
+
+    loss_running_averages = torch.tensor(
+        list(
+            accumulate(
+                loss if isinstance(usage, SingleEpochRunningBatchWise) else loss.repeat(max_epochs),
+                lambda ra, loss_item: ra * alpha + (1 - alpha) * loss_item,
+            )
+        )
+    )
+    if isinstance(usage, SingleEpochRunningBatchWise):
+        loss_running_averages = loss_running_averages.repeat(max_epochs)
 
-    def update_fn(engine, batch):
-        loss_value = next(loss_values)
-        y_true_batch = next(y_true_batch_values)
-        y_pred_batch = next(y_pred_batch_values)
-        return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
+    def update_fn(_, i):
+        loss_value = loss[i]
+        y_true_batch = y_true[i]
+        y_pred_batch = y_pred[i]
+        return loss_value, y_pred_batch, y_true_batch
 
     trainer = Engine(update_fn)
-    alpha = 0.98
 
     acc_metric = RunningAverage(Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha)
-    acc_metric.attach(trainer, "running_avg_accuracy")
+    acc_metric.attach(trainer, "running_avg_accuracy", usage)
 
     avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha)
-    avg_output.attach(trainer, "running_avg_output")
-
-    running_avg_acc = [
-        None,
-    ]
-
-    @trainer.on(Events.ITERATION_COMPLETED)
-    def manual_running_avg_acc(engine):
-        _, y_pred, y = engine.state.output
-        indices = torch.max(y_pred, 1)[1]
-        correct = torch.eq(indices, y).view(-1)
-        num_correct = torch.sum(correct).item()
-        num_examples = correct.shape[0]
-        batch_acc = num_correct * 1.0 / num_examples
-        if running_avg_acc[0] is None:
-            running_avg_acc[0] = batch_acc
-        else:
-            running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc
-        engine.state.running_avg_acc = running_avg_acc[0]
-
-    @trainer.on(Events.EPOCH_STARTED)
-    def running_avg_output_init(engine):
-        engine.state.running_avg_output = None
+    avg_output.attach(trainer, "running_avg_loss", usage)
 
-    @trainer.on(Events.ITERATION_COMPLETED)
-    def running_avg_output_update(engine):
-        if engine.state.running_avg_output is None:
-            engine.state.running_avg_output = engine.state.output[0]
-        else:
-            engine.state.running_avg_output = (
-                engine.state.running_avg_output * alpha + (1.0 - alpha) * engine.state.output[0]
-            )
+    metric_acc_running_averages = []
+    metric_loss_running_averages = []
 
     @trainer.on(Events.ITERATION_COMPLETED)
-    def assert_equal_running_avg_acc_values(engine):
-        assert (
-            engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"]
-        ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"
-
-    @trainer.on(Events.ITERATION_COMPLETED)
-    def assert_equal_running_avg_output_values(engine):
-        assert (
-            engine.state.running_avg_output == engine.state.metrics["running_avg_output"]
-        ), f"{engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}"
-
-    np.random.seed(10)
-    running_avg_acc = [
-        None,
-    ]
-    n_iters = 10
-    batch_size = 10
-    n_classes = 10
-    data = list(range(n_iters))
-    loss_values = iter(range(n_iters))
-    y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size)))
-    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
-    trainer.run(data, max_epochs=1)
-
-    running_avg_acc = [
-        None,
-    ]
-    n_iters = 10
-    batch_size = 10
-    n_classes = 10
-    data = list(range(n_iters))
-    loss_values = iter(range(n_iters))
-    y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_iters, batch_size)))
-    y_pred_batch_values = iter(np.random.rand(n_iters, batch_size, n_classes))
-    trainer.run(data, max_epochs=1)
-
+    def _(engine):
+        metric_acc_running_averages.append(engine.state.metrics["running_avg_accuracy"])
+        metric_loss_running_averages.append(engine.state.metrics["running_avg_loss"])
 
-def test_epoch_unbound():
+    trainer.run(data, max_epochs=3)
 
+    assert (torch.tensor(metric_acc_running_averages) == accuracy_running_averages).all()
+    assert (torch.tensor(metric_loss_running_averages) == loss_running_averages).all()
+
+    metric_state = acc_metric.state_dict()
+    saved__value = acc_metric._value
+    saved_src__num_correct = acc_metric.src._num_correct
+    saved_src__num_examples = acc_metric.src._num_examples
+    acc_metric.reset()
+    acc_metric.load_state_dict(metric_state)
+    assert acc_metric._value == saved__value
+    assert acc_metric.src._num_examples == saved_src__num_examples
+    assert (acc_metric.src._num_correct == saved_src__num_correct).all()
+
+    metric_state = avg_output.state_dict()
+    saved__value = avg_output._value
+    assert avg_output.src is None
+    avg_output.reset()
+    avg_output.load_state_dict(metric_state)
+    assert avg_output._value == saved__value
+    assert avg_output.src is None
+
+
+def test_integration_epochwise():
+    torch.manual_seed(10)
+    alpha = 0.98
     n_iters = 10
-    n_epochs = 3
     batch_size = 10
     n_classes = 10
+    max_epochs = 3
     data = list(range(n_iters))
-    loss_values = iter(range(n_epochs * n_iters))
-    y_true_batch_values = iter(np.random.randint(0, n_classes, size=(n_epochs * n_iters, batch_size)))
-    y_pred_batch_values = iter(np.random.rand(n_epochs * n_iters, batch_size, n_classes))
+    y_true = torch.randint(0, n_classes, size=(n_iters, batch_size))
+    y_pred = torch.rand(max_epochs, n_iters, batch_size, n_classes)
+
+    accuracy_running_averages = torch.tensor(
+        list(
+            accumulate(
+                map(
+                    lambda y_pred_epoch: torch.sum(y_pred_epoch.argmax(dim=-1) == y_true).item() / y_true.numel(),
+                    y_pred,
+                ),
+                lambda ra, acc: ra * alpha + (1 - alpha) * acc,
+            )
+        )
+    )
 
-    def update_fn(engine, batch):
-        loss_value = next(loss_values)
-        y_true_batch = next(y_true_batch_values)
-        y_pred_batch = next(y_pred_batch_values)
-        return loss_value, torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
+    def update_fn(engine, i):
+        y_true_batch = y_true[i]
+        y_pred_batch = y_pred[engine.state.epoch - 1, i]
+        return y_pred_batch, y_true_batch
 
     trainer = Engine(update_fn)
-    alpha = 0.98
-
-    acc_metric = RunningAverage(Accuracy(output_transform=lambda x: [x[1], x[2]]), alpha=alpha, epoch_bound=False)
-    acc_metric.attach(trainer, "running_avg_accuracy")
 
-    avg_output = RunningAverage(output_transform=lambda x: x[0], alpha=alpha, epoch_bound=False)
-    avg_output.attach(trainer, "running_avg_output")
+    acc_metric = RunningAverage(Accuracy(), alpha=alpha)
+    acc_metric.attach(trainer, "running_avg_accuracy", RunningEpochWise())
 
-    running_avg_acc = [None]
+    metric_acc_running_averages = []
 
-    @trainer.on(Events.STARTED)
-    def running_avg_output_init(engine):
-        engine.state.running_avg_output = None
-
-    @trainer.on(Events.ITERATION_COMPLETED, running_avg_acc)
-    def manual_running_avg_acc(engine, running_avg_acc):
-        _, y_pred, y = engine.state.output
-        indices = torch.max(y_pred, 1)[1]
-        correct = torch.eq(indices, y).view(-1)
-        num_correct = torch.sum(correct).item()
-        num_examples = correct.shape[0]
-        batch_acc = num_correct * 1.0 / num_examples
-        if running_avg_acc[0] is None:
-            running_avg_acc[0] = batch_acc
-        else:
-            running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc
-        engine.state.running_avg_acc = running_avg_acc[0]
-
-    @trainer.on(Events.ITERATION_COMPLETED)
-    def running_avg_output_update(engine):
-        if engine.state.running_avg_output is None:
-            engine.state.running_avg_output = engine.state.output[0]
-        else:
-            engine.state.running_avg_output = (
-                engine.state.running_avg_output * alpha + (1.0 - alpha) * engine.state.output[0]
-            )
-
-    @trainer.on(Events.ITERATION_COMPLETED)
-    def assert_equal_running_avg_acc_values(engine):
-        assert (
-            engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"]
-        ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"
-
-    @trainer.on(Events.ITERATION_COMPLETED)
-    def assert_equal_running_avg_output_values(engine):
-        assert (
-            engine.state.running_avg_output == engine.state.metrics["running_avg_output"]
-        ), f"{engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}"
+    @trainer.on(Events.EPOCH_COMPLETED)
+    def _(engine):
+        metric_acc_running_averages.append(engine.state.metrics["running_avg_accuracy"])
 
     trainer.run(data, max_epochs=3)
 
+    assert (torch.tensor(metric_acc_running_averages) == accuracy_running_averages).all()
+
 
-def test_multiple_attach():
+@pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise(), RunningEpochWise()])
+def test_multiple_attach(usage):
     n_iters = 100
     errD_values = iter(np.random.rand(n_iters))
     errG_values = iter(np.random.rand(n_iters))
@@ -214,11 +201,10 @@ def update_fn(engine, batch):
     monitoring_metrics = ["errD", "errG", "D_x", "D_G_z1", "D_G_z2"]
     for metric in monitoring_metrics:
         foo = partial(lambda x, metric: x[metric], metric=metric)
-        RunningAverage(alpha=alpha, output_transform=foo).attach(trainer, metric)
+        RunningAverage(alpha=alpha, output_transform=foo).attach(trainer, metric, usage)
 
-    @trainer.on(Events.ITERATION_COMPLETED)
+    @trainer.on(usage.COMPLETED)
     def check_values(engine):
-
         values = []
         for metric in monitoring_metrics:
             values.append(engine.state.metrics[metric])
@@ -230,8 +216,23 @@ def check_values(engine):
     trainer.run(data)
 
 
-def test_output_is_tensor():
+@pytest.mark.filterwarnings("ignore")
+@pytest.mark.parametrize("epoch_bound", [True, False, None])
+@pytest.mark.parametrize("src", [Accuracy(), None])
+@pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise(), RunningEpochWise()])
+def test_detach(epoch_bound, src, usage):
+    """Every event that holds handlers after ``attach`` must hold none after ``detach``."""
+    with warnings.catch_warnings():
+        avg = RunningAverage(src, output_transform=(lambda _: _) if src is None else None, epoch_bound=epoch_bound)
+    engine = Engine(lambda _, __: None)
+    avg.attach(engine, "m", usage)
+    assert all(len(handlers) != 0 for handlers in engine._event_handlers.values())
+    avg.detach(engine, usage)
+    # the event keys may remain in the mapping; only their handler lists are required to be empty
+    assert all(len(handlers) == 0 for handlers in engine._event_handlers.values())
+
 
+def test_output_is_tensor():
     m = RunningAverage(output_transform=lambda x: x)
     m.update(torch.rand(10, requires_grad=True).mean())
     v = m.compute()
@@ -249,222 +250,147 @@ def test_output_is_tensor():
     assert not v.requires_grad
 
 
-def _test_distrib_on_output(device):
+@pytest.mark.usefixtures("distributed")
+class TestDistributed:
+    @pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise()])
+    def test_src_is_output(self, usage):
+        device = idist.device()
+        rank = idist.get_rank()
+        n_iters = 10
+        n_epochs = 3
 
-    rank = idist.get_rank()
-    n_iters = 10
-    n_epochs = 3
-    batch_size = 10
-
-    # Data per rank
-    data = list(range(n_iters))
-    k = n_epochs * batch_size * n_iters
-    all_loss_values = torch.arange(0, k * idist.get_world_size(), dtype=torch.float64).to(device)
-    loss_values = iter(all_loss_values[k * rank : k * (rank + 1)])
-
-    def update_fn(engine, batch):
-        loss_value = next(loss_values)
-        return loss_value.item()
-
-    trainer = Engine(update_fn)
-    alpha = 0.98
-
-    metric_device = idist.device() if torch.device(device).type != "xla" else "cpu"
-    avg_output = RunningAverage(output_transform=lambda x: x, alpha=alpha, epoch_bound=False, device=metric_device)
-    avg_output.attach(trainer, "running_avg_output")
-
-    @trainer.on(Events.STARTED)
-    def running_avg_output_init(engine):
-        engine.state.running_avg_output = None
-
-    @trainer.on(Events.ITERATION_COMPLETED)
-    def running_avg_output_update(engine):
-        i = engine.state.iteration - 1
-        o = sum([all_loss_values[i + j * k] for j in range(idist.get_world_size())]).item()
-        o /= idist.get_world_size()
-        if engine.state.running_avg_output is None:
-            engine.state.running_avg_output = o
-        else:
-            engine.state.running_avg_output = engine.state.running_avg_output * alpha + (1.0 - alpha) * o
-
-    @trainer.on(Events.ITERATION_COMPLETED)
-    def assert_equal_running_avg_output_values(engine):
-        it = engine.state.iteration
-        assert engine.state.running_avg_output == pytest.approx(
-            engine.state.metrics["running_avg_output"]
-        ), f"{it}: {engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}"
-
-    trainer.run(data, max_epochs=3)
-
-
-def _test_distrib_on_metric(device):
-
-    rank = idist.get_rank()
-    n_iters = 10
-    n_epochs = 3
-    batch_size = 10
-    n_classes = 10
-
-    def _test(metric_device):
+        # Data per rank
         data = list(range(n_iters))
-        np.random.seed(12)
-        all_y_true_batch_values = np.random.randint(
-            0, n_classes, size=(idist.get_world_size(), n_epochs * n_iters, batch_size)
-        )
-        all_y_pred_batch_values = np.random.rand(idist.get_world_size(), n_epochs * n_iters, batch_size, n_classes)
-
-        y_true_batch_values = iter(all_y_true_batch_values[rank, ...])
-        y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...])
+        rank_loss_count = n_epochs * n_iters
+        all_loss_values = torch.arange(0, rank_loss_count * idist.get_world_size(), dtype=torch.float64).to(device)
+        loss_values = iter(all_loss_values[rank_loss_count * rank : rank_loss_count * (rank + 1)])
 
         def update_fn(engine, batch):
-            y_true_batch = next(y_true_batch_values)
-            y_pred_batch = next(y_pred_batch_values)
-            return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
+            loss_value = next(loss_values)
+            return loss_value.item()
 
         trainer = Engine(update_fn)
         alpha = 0.98
 
-        acc_metric = RunningAverage(
-            Accuracy(output_transform=lambda x: [x[0], x[1]], device=metric_device), alpha=alpha, epoch_bound=False
-        )
-        acc_metric.attach(trainer, "running_avg_accuracy")
+        metric_device = device if device.type != "xla" else "cpu"
+        avg_output = RunningAverage(output_transform=lambda x: x, alpha=alpha, device=metric_device)
+        avg_output.attach(trainer, "running_avg_output", usage)
 
-        running_avg_acc = [
-            None,
-        ]
-        true_acc_metric = Accuracy(device=metric_device)
+        @trainer.on(usage.STARTED)
+        def reset_running_avg_output(engine):
+            engine.state.running_avg_output = None
 
-        @trainer.on(Events.ITERATION_COMPLETED)
-        def manual_running_avg_acc(engine):
+        @trainer.on(usage.ITERATION_COMPLETED)
+        def running_avg_output_update(engine):
             i = engine.state.iteration - 1
-
-            true_acc_metric.reset()
-            for j in range(idist.get_world_size()):
-                output = (
-                    torch.from_numpy(all_y_pred_batch_values[j, i, :, :]),
-                    torch.from_numpy(all_y_true_batch_values[j, i, :]),
-                )
-                true_acc_metric.update(output)
-
-            batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples
-
-            if running_avg_acc[0] is None:
-                running_avg_acc[0] = batch_acc
+            o = sum([all_loss_values[i + r * rank_loss_count] for r in range(idist.get_world_size())]).item()
+            o /= idist.get_world_size()
+            if engine.state.running_avg_output is None:
+                engine.state.running_avg_output = o
             else:
-                running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc
-            engine.state.running_avg_acc = running_avg_acc[0]
+                engine.state.running_avg_output = engine.state.running_avg_output * alpha + (1.0 - alpha) * o
 
-        @trainer.on(Events.ITERATION_COMPLETED)
-        def assert_equal_running_avg_acc_values(engine):
+        @trainer.on(usage.COMPLETED)
+        def assert_equal_running_avg_output_values(engine):
+            it = engine.state.iteration
             assert (
-                engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"]
-            ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"
+                engine.state.running_avg_output == engine.state.metrics["running_avg_output"]
+            ), f"{it}: {engine.state.running_avg_output} vs {engine.state.metrics['running_avg_output']}"
 
         trainer.run(data, max_epochs=3)
 
-    _test("cpu")
-    if device.type != "xla":
-        _test(idist.device())
-
-
-def _test_distrib_accumulator_device(device):
-
-    metric_devices = [torch.device("cpu")]
-    if device.type != "xla":
-        metric_devices.append(idist.device())
-    for metric_device in metric_devices:
-
-        # Don't test the src=Metric case because compute() returns a scalar,
-        # so the metric doesn't accumulate on the device specified
-        avg = RunningAverage(output_transform=lambda x: x, device=metric_device)
-        assert avg._device == metric_device
-        # Value is None until the first update then compute call
-
-        for _ in range(3):
-            avg.update(torch.tensor(1.0, device=device))
-            avg.compute()
-
-            assert (
-                avg._value.device == metric_device
-            ), f"{type(avg._value.device)}:{avg._value.device} vs {type(metric_device)}:{metric_device}"
-
-
-@pytest.mark.distributed
-@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
-def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
-    device = idist.device()
-    _test_distrib_on_output(device)
-    _test_distrib_on_metric(device)
-    _test_distrib_accumulator_device(device)
-
-
-@pytest.mark.distributed
-@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
-    device = idist.device()
-    _test_distrib_on_output(device)
-    _test_distrib_on_metric(device)
-    _test_distrib_accumulator_device(device)
-
-
-@pytest.mark.distributed
-@pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
-@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
-def test_distrib_hvd(gloo_hvd_executor):
-
-    device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
-    nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
-
-    gloo_hvd_executor(_test_distrib_on_output, (device,), np=nproc, do_init=True)
-    gloo_hvd_executor(_test_distrib_on_metric, (device,), np=nproc, do_init=True)
-    gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True)
-
-
-@pytest.mark.multinode_distributed
-@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
-    device = idist.device()
-    _test_distrib_on_output(device)
-    _test_distrib_on_metric(device)
-    _test_distrib_accumulator_device(device)
-
-
-@pytest.mark.multinode_distributed
-@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
-@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
-def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
-    device = idist.device()
-    _test_distrib_on_output(device)
-    _test_distrib_on_metric(device)
-    _test_distrib_accumulator_device(device)
-
-
-@pytest.mark.tpu
-@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
-@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
-def test_distrib_single_device_xla():
-    device = idist.device()
-    _test_distrib_on_output(device)
-    _test_distrib_on_metric(device)
-    _test_distrib_accumulator_device(device)
-
-
-def _test_distrib_xla_nprocs(index):
-    device = idist.device()
-    _test_distrib_on_output(device)
-    _test_distrib_on_metric(device)
-    _test_distrib_accumulator_device(device)
-
-
-@pytest.mark.tpu
-@pytest.mark.skipif("NUM_TPU_WORKERS" not in os.environ, reason="Skip if no NUM_TPU_WORKERS in env vars")
-@pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
-def test_distrib_xla_nprocs(xmp_executor):
-    n = int(os.environ["NUM_TPU_WORKERS"])
-    xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n)
+    @pytest.mark.parametrize("usage", [RunningBatchWise(), SingleEpochRunningBatchWise(), RunningEpochWise()])
+    def test_src_is_metric(self, usage):
+        """Compare RunningAverage(Accuracy) with a hand-computed running average across all ranks."""
+        device = idist.device()
+        rank = idist.get_rank()
+        n_iters = 10
+        n_epochs = 3
+        batch_size = 10
+        n_classes = 10
+
+        def _test(metric_device):
+            data = list(range(n_iters))
+            np.random.seed(12)
+            all_y_true_batch_values = np.random.randint(
+                0, n_classes, size=(idist.get_world_size(), n_epochs * n_iters, batch_size)
+            )
+            all_y_pred_batch_values = np.random.rand(idist.get_world_size(), n_epochs * n_iters, batch_size, n_classes)
+
+            y_true_batch_values = iter(all_y_true_batch_values[rank, ...])
+            y_pred_batch_values = iter(all_y_pred_batch_values[rank, ...])
+
+            def update_fn(engine, batch):
+                y_true_batch = next(y_true_batch_values)
+                y_pred_batch = next(y_pred_batch_values)
+                return torch.from_numpy(y_pred_batch), torch.from_numpy(y_true_batch)
+
+            trainer = Engine(update_fn)
+            alpha = 0.98
+
+            acc_metric = RunningAverage(Accuracy(device=metric_device), alpha=alpha)
+            acc_metric.attach(trainer, "running_avg_accuracy", usage)
+
+            # One-element list so the nested handler below can update the running value in place.
+            running_avg_acc = [None]
+            true_acc_metric = Accuracy(device=metric_device)
+
+            @trainer.on(Events.ITERATION_COMPLETED)
+            def manual_running_avg_acc(engine):
+                iteration = engine.state.iteration
+
+                if not isinstance(usage, RunningEpochWise) or ((iteration - 1) % n_iters) == 0:
+                    true_acc_metric.reset()
+                if ((iteration - 1) % n_iters) == 0 and isinstance(usage, SingleEpochRunningBatchWise):
+                    running_avg_acc[0] = None
+                for j in range(idist.get_world_size()):
+                    output = (
+                        torch.from_numpy(all_y_pred_batch_values[j, iteration - 1, :, :]),
+                        torch.from_numpy(all_y_true_batch_values[j, iteration - 1, :]),
+                    )
+                    true_acc_metric.update(output)
+
+                if not isinstance(usage, RunningEpochWise) or (iteration % n_iters) == 0:
+                    batch_acc = true_acc_metric._num_correct.item() * 1.0 / true_acc_metric._num_examples
+
+                    if running_avg_acc[0] is None:
+                        running_avg_acc[0] = batch_acc
+                    else:
+                        running_avg_acc[0] = running_avg_acc[0] * alpha + (1.0 - alpha) * batch_acc
+                    engine.state.running_avg_acc = running_avg_acc[0]
+
+            @trainer.on(Events.ITERATION_COMPLETED)
+            def assert_equal_running_avg_acc_values(engine):
+                # Epoch-wise usage is only compared on the first iteration following a completed epoch.
+                if not isinstance(usage, RunningEpochWise) or (
+                    (engine.state.iteration > 1) and ((engine.state.iteration % n_iters) == 1)
+                ):
+                    assert (
+                        engine.state.running_avg_acc == engine.state.metrics["running_avg_accuracy"]
+                    ), f"{engine.state.running_avg_acc} vs {engine.state.metrics['running_avg_accuracy']}"
+
+            trainer.run(data, max_epochs=n_epochs)
+
+        _test("cpu")
+        if device.type != "xla":
+            _test(idist.device())
+
+    def test_accumulator_device(self):
+        """Check that RunningAverage keeps its accumulated value on the requested metric device."""
+        process_device = idist.device()
+        # The process device is only used as a metric device when it is not XLA.
+        extra_devices = [] if process_device.type == "xla" else [idist.device()]
+        for metric_device in [torch.device("cpu"), *extra_devices]:
+            # The src=Metric case is not covered: compute() returns a scalar there,
+            # so nothing is accumulated on the requested device.
+            avg = RunningAverage(output_transform=lambda x: x, device=metric_device)
+            assert avg._device == metric_device
+
+            # Value is None until the first update then compute call
+            for _ in range(3):
+                avg.update(torch.tensor(1.0, device=process_device))
+                avg.compute()
+
+                assert avg._value.device == metric_device, (
+                    f"{type(avg._value.device)}:{avg._value.device} vs {type(metric_device)}:{metric_device}"
+                )
diff --git a/tests/ignite/metrics/test_ssim.py b/tests/ignite/metrics/test_ssim.py
index 0c27f119a3be..33f6f3aa3d55 100644
--- a/tests/ignite/metrics/test_ssim.py
+++ b/tests/ignite/metrics/test_ssim.py
@@ -1,3 +1,5 @@
+from typing import Sequence, Union
+
 import numpy as np
 import pytest
 import torch
@@ -74,16 +76,45 @@ def test_ssim(available_device, shape, kernel_size, gaussian, use_sample_covaria
     y_pred = torch.rand(shape, device=available_device)
     y = y_pred * 0.8
 
+    compare_ssim_ignite_skiimg(
+        y_pred,
+        y,
+        available_device,
+        kernel_size=kernel_size,
+        gaussian=gaussian,
+        use_sample_covariance=use_sample_covariance,
+    )
+
+
+def compare_ssim_ignite_skiimg(
+    y_pred: torch.Tensor,
+    y: torch.Tensor,
+    device: torch.device,
+    precision: float = 2e-5,  # default to float32 expected precision
+    *,
+    skimg_y_pred: Union[np.ndarray, None] = None,
+    skimg_y: Union[np.ndarray, None] = None,
+    data_range: float = 1.0,
+    kernel_size: Union[int, Sequence[int]] = 11,
+    gaussian: bool = True,
+    use_sample_covariance: bool = False,
+):
     sigma = 1.5
-    data_range = 1.0
-    ssim = SSIM(data_range=data_range, sigma=sigma, device=available_device)
+
+    ssim = SSIM(data_range=data_range, sigma=sigma, device=device)
     ssim.update((y_pred, y))
     ignite_ssim = ssim.compute()
 
-    skimg_pred = y_pred.cpu().numpy()
-    skimg_y = skimg_pred * 0.8
+    if y_pred.dtype == torch.bfloat16:
+        y_pred = y_pred.to(dtype=torch.float16)
+
+    if skimg_y_pred is None:
+        skimg_y_pred = y_pred.cpu().numpy()
+    if skimg_y is None:
+        skimg_y = skimg_y_pred * 0.8
+
     skimg_ssim = ski_ssim(
-        skimg_pred,
+        skimg_y_pred,
         skimg_y,
         win_size=kernel_size,
         sigma=sigma,
@@ -94,7 +125,44 @@ def test_ssim(available_device, shape, kernel_size, gaussian, use_sample_covaria
     )
 
     assert isinstance(ignite_ssim, float)
-    assert np.allclose(ignite_ssim, skimg_ssim, atol=7e-5)
+    assert np.allclose(ignite_ssim, skimg_ssim, atol=precision)
+
+
+@pytest.mark.parametrize(
+    "metric_device, y_pred_device",
+    [
+        [torch.device("cpu"), torch.device("cpu")],
+        [torch.device("cpu"), torch.device("cuda")],
+        [torch.device("cuda"), torch.device("cpu")],
+        [torch.device("cuda"), torch.device("cuda")],
+    ],
+)
+def test_ssim_device(available_device, metric_device, y_pred_device):
+    """Check the device of ``ssim._kernel`` after ``update`` for each metric/input device combination."""
+    if available_device == "cpu":
+        pytest.skip("This test requires a cuda device.")
+
+    shape = (12, 5, 256, 256)
+
+    ssim = SSIM(data_range=1.0, sigma=1.5, device=metric_device)
+
+    y_pred = torch.rand(shape, device=y_pred_device)
+    y = y_pred * 0.8
+
+    # A UserWarning is expected only when the metric is on cuda while the inputs are on cpu.
+    if metric_device == torch.device("cuda") and y_pred_device == torch.device("cpu"):
+        with pytest.warns(UserWarning):
+            ssim.update((y_pred, y))
+    else:
+        ssim.update((y_pred, y))
+
+    if metric_device == torch.device("cuda") or y_pred_device == torch.device("cuda"):
+        # A tensor will always have the device index set
+        expected_device = torch.device("cuda:0")
+    else:
+        expected_device = torch.device("cpu")
+
+    assert ssim._kernel.device == expected_device
 
 
 def test_ssim_variable_batchsize(available_device):
@@ -123,87 +191,152 @@ def test_ssim_variable_batchsize(available_device):
     assert np.allclose(out, expected)
 
 
-@pytest.mark.parametrize("metric_device", ["cpu", "process_device"])
-def test_distrib_integration(distributed, metric_device):
-    from ignite.engine import Engine
+def test_ssim_variable_channel(available_device):
+    y_preds = [
+        torch.rand(12, 5, 28, 28, device=available_device),
+        torch.rand(12, 4, 28, 28, device=available_device),
+        torch.rand(12, 7, 28, 28, device=available_device),
+        torch.rand(12, 3, 28, 28, device=available_device),
+        torch.rand(12, 11, 28, 28, device=available_device),
+        torch.rand(12, 6, 28, 28, device=available_device),
+    ]
+    y_true = [v * 0.8 for v in y_preds]
+
+    for y_pred, y in zip(y_preds, y_true):
+        compare_ssim_ignite_skiimg(y_pred, y, available_device)
+
 
-    rank = idist.get_rank()
-    torch.manual_seed(12 + rank)
-    n_iters = 100
-    batch_size = 10
-    device = idist.device()
-    if metric_device == "process_device":
-        metric_device = device if device.type != "xla" else "cpu"
+@pytest.mark.parametrize(
+    "dtype, precision", [(torch.bfloat16, 2e-3), (torch.float16, 4e-4), (torch.float32, 2e-5), (torch.float64, 2e-5)]
+)
+def test_cuda_ssim_dtypes(available_device, dtype, precision):
+    # Checks https://github.com/pytorch/ignite/pull/3034
+    if available_device == "cpu" and dtype in [torch.float16, torch.bfloat16]:
+        pytest.skip(reason=f"Unsupported dtype {dtype} on CPU device")
 
-    y_pred = torch.rand(n_iters * batch_size, 3, 28, 28, dtype=torch.float, device=device)
-    y = y_pred * 0.65
+    shape = (12, 3, 28, 28)
 
-    def update(engine, i):
-        return (
-            y_pred[i * batch_size : (i + 1) * batch_size, ...],
-            y[i * batch_size : (i + 1) * batch_size, ...],
-        )
+    y_pred = torch.rand(shape, device=available_device, dtype=dtype)
+    y = y_pred * 0.8
 
-    engine = Engine(update)
-    SSIM(data_range=1.0, device=metric_device).attach(engine, "ssim")
+    compare_ssim_ignite_skiimg(y_pred, y, available_device, precision)
 
-    data = list(range(n_iters))
-    engine.run(data=data, max_epochs=1)
 
-    y_pred = idist.all_gather(y_pred)
-    y = idist.all_gather(y)
+@pytest.mark.parametrize(
+    "shape, kernel_size, gaussian, use_sample_covariance",
+    [[(8, 3, 224, 224), 7, False, True], [(12, 3, 28, 28), 11, True, False]],
+)
+def test_ssim_uint8(available_device, shape, kernel_size, gaussian, use_sample_covariance):
+    y_pred = torch.randint(0, 255, shape, device=available_device, dtype=torch.uint8)
+    y = (y_pred * 0.8).to(dtype=torch.uint8)
 
-    assert "ssim" in engine.state.metrics
-    res = engine.state.metrics["ssim"]
+    sigma = 1.5
+    data_range = 255
+    ssim = SSIM(data_range=data_range, sigma=sigma, device=available_device)
+    ssim.update((y_pred, y))
+    ignite_ssim = ssim.compute()
 
-    np_pred = y_pred.cpu().numpy()
-    np_true = np_pred * 0.65
-    true_res = ski_ssim(
-        np_pred,
-        np_true,
-        win_size=11,
-        sigma=1.5,
+    skimg_pred = y_pred.cpu().numpy()
+    skimg_y = (skimg_pred * 0.8).astype(np.uint8)
+    skimg_ssim = ski_ssim(
+        skimg_pred,
+        skimg_y,
+        win_size=kernel_size,
+        sigma=sigma,
         channel_axis=1,
-        gaussian_weights=True,
-        data_range=1.0,
-        use_sample_covariance=False,
+        gaussian_weights=gaussian,
+        data_range=data_range,
+        use_sample_covariance=use_sample_covariance,
     )
 
-    tol = 1e-3 if device.type == "xla" else 1e-4  # Isn't better to ask `distributed` about backend info?
+    assert isinstance(ignite_ssim, float)
+    assert np.allclose(ignite_ssim, skimg_ssim, atol=1e-5)
+
+
+@pytest.mark.usefixtures("distributed")
+class TestDistributed:
+    @pytest.mark.parametrize("metric_device", ["cpu", "process_device"])
+    def test_integration(self, metric_device):
+        from ignite.engine import Engine
+
+        rank = idist.get_rank()
+        torch.manual_seed(12 + rank)
+        n_iters = 100
+        batch_size = 10
+        device = idist.device()
+        if metric_device == "process_device":
+            metric_device = device if device.type != "xla" else "cpu"
+
+        y_pred = torch.rand(n_iters * batch_size, 3, 28, 28, dtype=torch.float, device=device)
+        y = y_pred * 0.65
+
+        def update(engine, i):
+            return (
+                y_pred[i * batch_size : (i + 1) * batch_size, ...],
+                y[i * batch_size : (i + 1) * batch_size, ...],
+            )
+
+        engine = Engine(update)
+        SSIM(data_range=1.0, device=metric_device).attach(engine, "ssim")
+
+        data = list(range(n_iters))
+        engine.run(data=data, max_epochs=1)
+
+        y_pred = idist.all_gather(y_pred)
+        y = idist.all_gather(y)
+
+        assert "ssim" in engine.state.metrics
+        res = engine.state.metrics["ssim"]
+
+        np_pred = y_pred.cpu().numpy()
+        np_true = np_pred * 0.65
+        true_res = ski_ssim(
+            np_pred,
+            np_true,
+            win_size=11,
+            sigma=1.5,
+            channel_axis=1,
+            gaussian_weights=True,
+            data_range=1.0,
+            use_sample_covariance=False,
+        )
 
-    assert pytest.approx(res, abs=tol) == true_res
+        tol = 1e-3 if device.type == "xla" else 1e-4  # Isn't it better to ask `distributed` about backend info?
 
-    engine = Engine(update)
-    SSIM(data_range=1.0, gaussian=False, kernel_size=7, device=metric_device).attach(engine, "ssim")
+        assert pytest.approx(res, abs=tol) == true_res
 
-    data = list(range(n_iters))
-    engine.run(data=data, max_epochs=1)
+        engine = Engine(update)
+        SSIM(data_range=1.0, gaussian=False, kernel_size=7, device=metric_device).attach(engine, "ssim")
 
-    assert "ssim" in engine.state.metrics
-    res = engine.state.metrics["ssim"]
+        data = list(range(n_iters))
+        engine.run(data=data, max_epochs=1)
 
-    np_pred = y_pred.cpu().numpy()
-    np_true = np_pred * 0.65
-    true_res = ski_ssim(np_pred, np_true, win_size=7, channel_axis=1, gaussian_weights=False, data_range=1.0)
+        assert "ssim" in engine.state.metrics
+        res = engine.state.metrics["ssim"]
 
-    assert pytest.approx(res, abs=tol) == true_res
+        np_pred = y_pred.cpu().numpy()
+        np_true = np_pred * 0.65
+        true_res = ski_ssim(np_pred, np_true, win_size=7, channel_axis=1, gaussian_weights=False, data_range=1.0)
 
+        assert pytest.approx(res, abs=tol) == true_res
 
-@pytest.mark.parametrize("metric_device", [torch.device("cpu"), "process_device"])
-def test_distrib_accumulator_device(distributed, metric_device):
+    @pytest.mark.parametrize("metric_device", [torch.device("cpu"), "process_device"])
+    def test_accumulator_device(self, metric_device):
+        device = idist.device()
+        if metric_device == "process_device":
+            metric_device = torch.device(device if device.type != "xla" else "cpu")
 
-    device = idist.device()
-    if metric_device == "process_device":
-        metric_device = torch.device(device if device.type != "xla" else "cpu")
+        ssim = SSIM(data_range=1.0, device=metric_device)
 
-    ssim = SSIM(data_range=1.0, device=metric_device)
+        assert ssim._kernel is None
+        assert isinstance(ssim._kernel_2d, torch.Tensor)
 
-    for dev in [ssim._device, ssim._kernel.device]:
-        assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}"
+        for dev in [ssim._device, ssim._kernel_2d.device]:
+            assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}"
 
-    y_pred = torch.rand(2, 3, 28, 28, dtype=torch.float, device=device)
-    y = y_pred * 0.65
-    ssim.update((y_pred, y))
+        y_pred = torch.rand(2, 3, 28, 28, dtype=torch.float, device=device)
+        y = y_pred * 0.65
+        ssim.update((y_pred, y))
 
-    dev = ssim._sum_of_ssim.device
-    assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}"
+        dev = ssim._sum_of_ssim.device
+        assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}"
diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py
index ed34994c6b73..bea4eba418b9 100644
--- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py
+++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py
@@ -105,12 +105,10 @@ def update(engine, i):
 
 
 def _test_distrib_accumulator_device(device):
-
     metric_devices = [torch.device("cpu")]
     if device.type != "xla":
         metric_devices.append(idist.device())
     for metric_device in metric_devices:
-
         acc = TopKCategoricalAccuracy(2, device=metric_device)
         assert acc._device == metric_device
         assert (
@@ -130,7 +128,6 @@ def _test_distrib_accumulator_device(device):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
 def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -139,7 +136,6 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl):
 @pytest.mark.distributed
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -149,7 +145,6 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo):
 @pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support")
 @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
 def test_distrib_hvd(gloo_hvd_executor):
-
     device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
     nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()
 
@@ -161,7 +156,6 @@ def test_distrib_hvd(gloo_hvd_executor):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -171,7 +165,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo):
 @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
 @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
 def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
@@ -181,14 +174,12 @@ def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl):
 @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
 @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package")
 def test_distrib_single_device_xla():
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
 
 
 def _test_distrib_xla_nprocs(index):
-
     device = idist.device()
     _test_distrib_integration(device)
     _test_distrib_accumulator_device(device)
diff --git a/tests/ignite/test_utils.py b/tests/ignite/test_utils.py
index 93037553d054..c4c65a29d696 100644
--- a/tests/ignite/test_utils.py
+++ b/tests/ignite/test_utils.py
@@ -100,13 +100,11 @@ def forward(self, x):
 
 
 def test_dist_setup_logger():
-
     logger = setup_logger("trainer", level=logging.CRITICAL, distributed_rank=1)
     assert logger.level != logging.CRITICAL
 
 
 def test_setup_logger(capsys, dirname):
-
     trainer = Engine(lambda e, b: None)
     evaluator = Engine(lambda e, b: None)
 
@@ -118,7 +116,6 @@ def test_setup_logger(capsys, dirname):
     fp = dirname / "log"
 
     def _test(stream):
-
         trainer.logger = setup_logger("trainer", stream=stream, filepath=fp, reset=True)
         evaluator.logger = setup_logger("evaluator", stream=stream, filepath=fp, reset=True)
 
@@ -158,7 +155,6 @@ def _setup_a_logger_and_dump(name, message):
 
 
 def test_override_setup_logger(capsys):
-
     _setup_a_logger_and_dump(__name__, "test_override_setup_logger")
 
     source = capsys.readouterr().err.split("\n")
@@ -179,7 +175,6 @@ def test_override_setup_logger(capsys):
 
 
 def test_deprecated():
-
     # Test on function without docs, @deprecated without reasons
     @deprecated("0.4.2", "0.6.0")
     def func_no_docs():
diff --git a/tests/run_code_style.bat b/tests/run_code_style.bat
index 8f54943f1c8e..f8ebab0d0a38 100644
--- a/tests/run_code_style.bat
+++ b/tests/run_code_style.bat
@@ -20,7 +20,7 @@ mypy --config-file mypy.ini
 goto end
 
 :install
-pip install --upgrade flake8 "black==21.12b0" "usort==1.0.5" "ufmt==2.0.1" "mypy"
+pip install --upgrade flake8 "black==23.9.1" "usort==1.0.7" "ufmt==2.2.0" "mypy"
 goto end
 
 :end
diff --git a/tests/run_code_style.sh b/tests/run_code_style.sh
index cd28d9c751be..7f6f06546a05 100755
--- a/tests/run_code_style.sh
+++ b/tests/run_code_style.sh
@@ -10,5 +10,5 @@ elif [ $1 = "fmt" ]; then
 elif [ $1 = "mypy" ]; then
     mypy --config-file mypy.ini
 elif [ $1 = "install" ]; then
-    pip install --upgrade flake8 "black==21.12b0" "usort==1.0.5" "ufmt==2.0.1" "mypy"
+    pip install --upgrade flake8 "black==23.9.1" "usort==1.0.7" "ufmt==2.2.0" "mypy"
 fi
diff --git a/tests/run_cpu_tests.sh b/tests/run_cpu_tests.sh
index 2e4b57c9628c..2297be94219d 100644
--- a/tests/run_cpu_tests.sh
+++ b/tests/run_cpu_tests.sh
@@ -18,5 +18,5 @@ if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
 fi
 
 export WORLD_SIZE=2
-CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv
+CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION"
 unset WORLD_SIZE
diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh
index 6a51995cba34..3146443a531d 100644
--- a/tests/run_gpu_tests.sh
+++ b/tests/run_gpu_tests.sh
@@ -6,27 +6,30 @@ else
     ngpus=$1
 fi
 
-pattern=""
-if [ -n "$2" ]; then
-    pattern="-k $2"
+MATCH_TESTS_EXPRESSION=${2:-""}
+
+if [ -z "$MATCH_TESTS_EXPRESSION" ]; then
+    cuda_pattern="cuda"
+else
+    cuda_pattern="cuda and ($MATCH_TESTS_EXPRESSION)"  # parenthesized: in pytest -k "and" binds tighter than "or", so "a or b" must stay grouped
 fi
 
 set -xeu
 
-pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k 'cuda'
+pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k "$cuda_pattern"
 
 # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
 if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then
     exit 0
 fi
 
-pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed ${pattern}
+pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k "$MATCH_TESTS_EXPRESSION"
 
 
 if [ ${ngpus} -gt 1 ]; then
 
     export WORLD_SIZE=${ngpus}
-    pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv
+    pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION"
     unset WORLD_SIZE
 
 fi