diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index aadccaafb334..54db85d7af13 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -56,7 +56,7 @@ jobs: env: REPOSITORY: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }} - runs-on: linux.4xlarge + runs-on: amz2023.linux.4xlarge steps: - name: Clean workspace run: | @@ -116,7 +116,7 @@ jobs: env: REPOSITORY: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }} - runs-on: linux.12xlarge + runs-on: amz2023.linux.12xlarge steps: - name: Clean workspace run: | @@ -176,7 +176,7 @@ jobs: env: REPOSITORY: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }} - runs-on: linux.4xlarge + runs-on: amz2023.linux.4xlarge steps: - name: Clean workspace run: | @@ -236,7 +236,7 @@ jobs: env: REPOSITORY: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }} - runs-on: linux.12xlarge + runs-on: amz2023.linux.12xlarge steps: - name: Clean workspace run: | diff --git a/.github/workflows/gpu-hvd-tests.yml b/.github/workflows/gpu-hvd-tests.yml index 6661f46b501b..3f0dd5ad2c68 100644 --- a/.github/workflows/gpu-hvd-tests.yml +++ b/.github/workflows/gpu-hvd-tests.yml @@ -22,13 +22,13 @@ jobs: gpu-hvd-tests: strategy: matrix: - pytorch-channel: [pytorch, ] + pytorch-channel: [pytorch] fail-fast: false env: DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1" REPOSITORY: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }} - runs-on: linux.8xlarge.nvidia.gpu + runs-on: amz2023.linux.8xlarge.nvidia.gpu timeout-minutes: 60 steps: @@ -128,8 +128,8 @@ jobs: # Can't build Horovod with recent pytorch due to pytorch required C++17 standard # and horovod is still using C++14 # HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_WITH_PYTORCH=1 pip install horovod[pytorch] - # Using a similar hack as described here: - # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345 + # Using a similar hack as described here: + # https://github.com/horovod/horovod/issues/3941#issuecomment-1732505345 git clone --recursive https://github.com/horovod/horovod.git /horovod cd /horovod sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt @@ -152,7 +152,7 @@ jobs: set -xe bash tests/run_gpu_tests.sh 2 hvd - CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd + CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ignite -m distributed -k hvd EOF ) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 92345b3baed3..094f20ed2257 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -28,8 +28,8 @@ jobs: DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1" REPOSITORY: ${{ github.repository }} PR_NUMBER: ${{ github.event.pull_request.number }} - runs-on: linux.8xlarge.nvidia.gpu - timeout-minutes: 45 + runs-on: amz2023.linux.8xlarge.nvidia.gpu + timeout-minutes: 85 steps: - name: Clean workspace @@ -121,18 +121,13 @@ jobs: - name: Run GPU Unit Tests continue-on-error: false - run: | - - script=$(cat << EOF - - set -xe - - bash tests/run_gpu_tests.sh 2 - - EOF - ) - - docker exec -t pthd /bin/bash -c "${script}" + uses: nick-fields/retry@v2.9.0 + with: + max_attempts: 5 + timeout_minutes: 25 + shell: bash + command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2' + 
new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2' - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/hvd-tests.yml b/.github/workflows/hvd-tests.yml index f483d21f38ee..35e107f888b7 100644 --- a/.github/workflows/hvd-tests.yml +++ b/.github/workflows/hvd-tests.yml @@ -75,9 +75,13 @@ jobs: target_dir: /tmp - name: Run Tests - shell: bash -l {0} - run: | - bash tests/run_cpu_tests.sh + uses: nick-fields/retry@v3 + with: + max_attempts: 5 + timeout_minutes: 15 + shell: bash + command: bash tests/run_cpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml index e47f8faaa463..f268669158e9 100644 --- a/.github/workflows/pytorch-version-tests.yml +++ b/.github/workflows/pytorch-version-tests.yml @@ -10,15 +10,15 @@ on: jobs: build: runs-on: ubuntu-latest - timeout-minutes: 45 + timeout-minutes: 85 strategy: max-parallel: 5 fail-fast: false matrix: python-version: [3.8, 3.9, "3.10"] pytorch-version: - [2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.5.1] - exclude: + [2.3.1, 2.2.2, 2.1.2, 2.0.1, 1.13.1, 1.12.1, 1.10.0, 1.8.1, 1.5.1] + exclude: - pytorch-version: 1.5.1 python-version: 3.9 - pytorch-version: 1.5.1 @@ -31,9 +31,6 @@ jobs: - pytorch-version: 1.8.1 python-version: "3.10" - - pytorch-version: 1.9.1 - python-version: "3.10" - - pytorch-version: 1.10.0 python-version: "3.10" @@ -78,7 +75,7 @@ jobs: pip install -r requirements-dev.txt python setup.py install - # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern + # pytorch>=1.9.0,<1.11.0 is using "from setuptools import distutils; distutils.version.LooseVersion" anti-pattern # which raises the error: AttributeError: module 'distutils' has no attribute 'version' for setuptools>59 bad_pth_version=$(python -c "import torch; print('.'.join(torch.__version__.split('.')[:2]) in ['1.9', '1.10'])") if [ "${bad_pth_version}" == "True" ]; then @@ -92,20 +89,24 @@ jobs: target_dir: /tmp - name: Run Tests - shell: bash -l {0} - run: | - bash tests/run_cpu_tests.sh "not test_time_profilers" + uses: nick-fields/retry@v3 + with: + max_attempts: 5 + timeout_minutes: 15 + shell: bash + command: bash -l tests/run_cpu_tests.sh "not test_time_profilers" + new_command_on_retry: USE_LAST_FAILED=1 bash -l tests/run_cpu_tests.sh "not test_time_profilers" - # create-issue: - # runs-on: ubuntu-latest - # # https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#needs-context - # needs: build - # if: always() && needs.build.result == 'failure' - # steps: - # - uses: actions/checkout@v4 - # - uses: JasonEtco/create-an-issue@v2 - # name: Create issue if pytorch version tests failed - # with: - # filename: .github/failed_schedule_issue_template.md - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + create-issue: + runs-on: ubuntu-latest + # https://docs.github.com/en/actions/reference/context-and-expression-syntax-for-github-actions#needs-context + needs: build + if: always() && needs.build.result == 'failure' + steps: + - uses: actions/checkout@v4 + - uses: JasonEtco/create-an-issue@v2 + name: Create issue if pytorch version tests failed + with: + filename: .github/failed_schedule_issue_template.md + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff 
--git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml index 08eaaf30d8f7..cc330de0c279 100644 --- a/.github/workflows/tpu-tests.yml +++ b/.github/workflows/tpu-tests.yml @@ -36,10 +36,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.9 + - name: Set up Python 3.10 uses: actions/setup-python@v4 with: - python-version: "3.9" + python-version: "3.10" architecture: "x64" - name: Get year & week number @@ -50,7 +50,7 @@ jobs: - name: Get pip cache dir id: pip-cache run: | - pip3 install -U pip + pip3 install -U "pip<24" echo "pip_cache=$(pip cache dir)" >> $GITHUB_OUTPUT shell: bash -l {0} @@ -70,10 +70,9 @@ jobs: pip install mkl==2021.4.0 ## Install torch & xla and torchvision - pip install --pre https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch-nightly-cp39-cp39-linux_x86_64.whl - pip install --pre https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-nightly-cp39-cp39-linux_x86_64.whl - pip install --pre https://storage.googleapis.com/tpu-pytorch/wheels/colab/torchvision-nightly-cp39-cp39-linux_x86_64.whl - + pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu + pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly-cp310-cp310-linux_x86_64.whl + # Check installation python -c "import torch" @@ -89,13 +88,19 @@ jobs: target_dir: /tmp - name: Run Tests - run: | - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${Python_ROOT_DIR}/lib - export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" - export XRT_WORKERS="localservice:0;grpc://localhost:40934" - - python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)" - bash tests/run_tpu_tests.sh + uses: nick-fields/retry@v3 + with: + max_attempts: 5 + timeout_minutes: 25 + shell: bash + command: | + python -c "import torch_xla; print('torch xla version:', torch_xla.__version__)" + bash tests/run_tpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_tpu_tests.sh + env: + LD_LIBRARY_PATH: ${{ env.LD_LIBRARY_PATH }}:${{ env.Python_ROOT_DIR }}/lib + XRT_DEVICE_MAP: "CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" + XRT_WORKERS: "localservice:0;grpc://localhost:40934" - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index a4b697255699..0b94e0d0e9e1 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -31,7 +31,7 @@ concurrency: jobs: cpu-tests: runs-on: ${{ matrix.os }} - timeout-minutes: 45 + timeout-minutes: 85 defaults: run: shell: bash @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: ["3.8", "3.9", "3.10", "3.11","3.12"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] pytorch-channel: [pytorch, pytorch-nightly] include: # includes a single build on windows @@ -102,7 +102,7 @@ jobs: - name: Run Mypy # https://github.com/pytorch/ignite/pull/2780 - # + # if: ${{ matrix.os == 'ubuntu-latest' && matrix.pytorch-channel == 'pytorch-nightly'}} run: | bash ./tests/run_code_style.sh mypy @@ -120,8 +120,13 @@ jobs: cp -R /tmp/MNIST . 
- name: Run Tests - run: | - SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh + uses: nick-fields/retry@v3 + with: + max_attempts: 5 + timeout_minutes: 15 + shell: bash + command: SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh + new_command_on_retry: USE_LAST_FAILED=1 SKIP_DISTRIB_TESTS=${{ matrix.skip-distrib-tests }} bash tests/run_cpu_tests.sh - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/docker/docker.cfg b/docker/docker.cfg index 5c48d9f83e3f..6b9769333471 100644 --- a/docker/docker.cfg +++ b/docker/docker.cfg @@ -1,4 +1,4 @@ [DEFAULT] -build_docker_image_pytorch_version = 2.2.2-cuda12.1-cudnn8 +build_docker_image_pytorch_version = 2.4.0-cuda12.4-cudnn9 build_docker_image_hvd_version = v0.28.1 build_docker_image_msdp_version = v0.14.0 diff --git a/docker/hvd/Dockerfile.hvd-apex-nlp b/docker/hvd/Dockerfile.hvd-apex-nlp index 6379490c4966..a8f51988baa7 100644 --- a/docker/hvd/Dockerfile.hvd-apex-nlp +++ b/docker/hvd/Dockerfile.hvd-apex-nlp @@ -4,5 +4,4 @@ FROM pytorchignite/hvd-apex:latest # Ignite NLP dependencies RUN pip install --upgrade --no-cache-dir transformers \ spacy \ - nltk \ - torchtext + nltk diff --git a/docker/hvd/Dockerfile.hvd-nlp b/docker/hvd/Dockerfile.hvd-nlp index db4ca4c3ebc1..84da0230b9e9 100644 --- a/docker/hvd/Dockerfile.hvd-nlp +++ b/docker/hvd/Dockerfile.hvd-nlp @@ -4,5 +4,4 @@ FROM pytorchignite/hvd-base:latest # Ignite NLP dependencies RUN pip install --upgrade --no-cache-dir transformers \ spacy \ - nltk \ - torchtext + nltk diff --git a/docker/main/Dockerfile.apex-nlp b/docker/main/Dockerfile.apex-nlp index ad7507df777c..b9be5acd6d9f 100644 --- a/docker/main/Dockerfile.apex-nlp +++ b/docker/main/Dockerfile.apex-nlp @@ -4,5 +4,4 @@ FROM pytorchignite/apex:latest # Ignite NLP dependencies RUN pip install --upgrade --no-cache-dir transformers \ spacy \ - nltk \ - torchtext + nltk diff --git a/docker/main/Dockerfile.nlp b/docker/main/Dockerfile.nlp index e5ef45cfef0f..7826caa03ef8 100644 --- a/docker/main/Dockerfile.nlp +++ b/docker/main/Dockerfile.nlp @@ -4,5 +4,4 @@ FROM pytorchignite/base:latest # Ignite NLP dependencies RUN pip install --upgrade --no-cache-dir transformers \ spacy \ - nltk \ - torchtext + nltk \ No newline at end of file diff --git a/docker/test_image.py b/docker/test_image.py index 0bfdc6c4697b..ebb706c303fb 100644 --- a/docker/test_image.py +++ b/docker/test_image.py @@ -54,7 +54,6 @@ def check_package(package_name, expected_version=None): check_package("cv2") if "nlp" in image_type: - check_package("torchtext") check_package("transformers") if "apex" in image_type: diff --git a/docs/Makefile b/docs/Makefile index 3d1f9ada6a8b..413cdff94ad5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -22,6 +22,13 @@ docset: html rebuild: rm -rf source/generated && make clean && make html +clean: + @echo "Cleaning up..." 
+ python -c "import shutil; shutil.rmtree('$(BUILDDIR)', ignore_errors=True)" + python -c "import shutil; shutil.rmtree('$(SOURCEDIR)/generated', ignore_errors=True)" + python -c "import os; [os.remove(f) for f in os.listdir('.') if f.endswith('.pyc')]" + python -c "import shutil; import os; [shutil.rmtree(f) for f in os.listdir('.') if f == '__pycache__' and os.path.isdir(f)]" + .PHONY: help Makefile docset # Catch-all target: route all unknown targets to Sphinx using the new diff --git a/docs/make.bat b/docs/make.bat index 3bf02ee70066..e552da3d300b 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,36 +1,36 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build -set SPHINXPROJ=ignite - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build +set SPHINXPROJ=ignite + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/source/conf.py b/docs/source/conf.py index 2256d425becf..e26a50785f2c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -301,7 +301,15 @@ def run(self): names = [name[0] for name in getmembers(module)] # Filter out members w/o doc strings - names = [name for name in names if getattr(module, name).__doc__ is not None] + filtered_names = [] + for name in names: + try: + if not name.startswith("_") and getattr(module, name).__doc__ is not None: + filtered_names.append(name) + except AttributeError: + continue + + names = filtered_names if auto == "autolist": # Get list of all classes and functions inside module @@ -350,6 +358,7 @@ def run(self): "https://github.com/fossasia/visdom#visdom-arguments-python-only", "https://github.com/pytorch/ignite/tree/master/examples/cifar10#check-resume-training", "https://github.com/pytorch/ignite/tree/master/examples/mnist#training-save--resume", + "https://machinelearningmastery.com/gentle-introduction-backpropagation-time/", ] diff --git a/docs/source/contrib/handlers.rst b/docs/source/contrib/handlers.rst index 214f23a843ba..3f1af41856bd 100644 --- a/docs/source/contrib/handlers.rst +++ b/docs/source/contrib/handlers.rst @@ -28,5 +28,5 @@ Time profilers [deprecated] Loggers [deprecated] -------------------- -.. 
deprecated:: 0.5.1 +.. deprecated:: 0.5.0 Loggers moved to :ref:`Loggers`. diff --git a/docs/source/contrib/metrics.rst b/docs/source/contrib/metrics.rst index fdaa0432f5cb..3c5603e6ec5f 100644 --- a/docs/source/contrib/metrics.rst +++ b/docs/source/contrib/metrics.rst @@ -4,12 +4,12 @@ ignite.contrib.metrics Contrib module metrics [deprecated] ----------------------------------- -.. deprecated:: 0.5.1 +.. deprecated:: 0.5.0 All metrics moved to :ref:`Complete list of metrics`. Regression metrics [deprecated] -------------------------------- -.. deprecated:: 0.5.1 +.. deprecated:: 0.5.0 All metrics moved to :ref:`Complete list of metrics`. diff --git a/docs/source/metrics.rst b/docs/source/metrics.rst index 0696cc3070ae..0e4979f82a1c 100644 --- a/docs/source/metrics.rst +++ b/docs/source/metrics.rst @@ -335,8 +335,10 @@ Complete list of metrics MeanPairwiseDistance MeanSquaredError metric.Metric + metric_group.MetricGroup metrics_lambda.MetricsLambda MultiLabelConfusionMatrix + MutualInformation precision.Precision PSNR recall.Recall @@ -352,6 +354,9 @@ Complete list of metrics FID CosineSimilarity Entropy + KLDivergence + JSDivergence + MaximumMeanDiscrepancy AveragePrecision CohenKappa GpuInfo diff --git a/ignite/__init__.py b/ignite/__init__.py index d804675afa5b..3c926af9124d 100644 --- a/ignite/__init__.py +++ b/ignite/__init__.py @@ -6,4 +6,4 @@ import ignite.metrics import ignite.utils -__version__ = "0.6.0" +__version__ = "0.5.1" diff --git a/ignite/distributed/comp_models/base.py b/ignite/distributed/comp_models/base.py index 6e86193381c7..6d2d7d819fa1 100644 --- a/ignite/distributed/comp_models/base.py +++ b/ignite/distributed/comp_models/base.py @@ -5,7 +5,7 @@ import torch from packaging.version import Version -_torch_version_le_112 = Version(torch.__version__) > Version("1.12.0") +_torch_version_gt_112 = Version(torch.__version__) > Version("1.12.0") class ComputationModel(metaclass=ABCMeta): @@ -329,7 +329,7 @@ def get_node_rank(self) -> int: def device(self) -> torch.device: if torch.cuda.is_available(): return torch.device("cuda") - if _torch_version_le_112 and torch.backends.mps.is_available(): + if _torch_version_gt_112 and torch.backends.mps.is_available(): return torch.device("mps") return torch.device("cpu") diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py index 865be7e7800d..cbaac4e16cb7 100644 --- a/ignite/engine/__init__.py +++ b/ignite/engine/__init__.py @@ -96,7 +96,7 @@ def supervised_training_step( Added `model_transform` to transform model's output .. versionchanged:: 0.4.13 Added `model_fn` to customize model's application on the sample - .. versionchanged:: 0.5.1 + .. versionchanged:: 0.5.0 Added support for ``mps`` device """ @@ -551,7 +551,7 @@ def output_transform_fn(x, y, y_pred, loss): Added ``model_transform`` to transform model's output .. versionchanged:: 0.4.13 Added `model_fn` to customize model's application on the sample - .. versionchanged:: 0.5.1 + .. versionchanged:: 0.5.0 Added support for ``mps`` device """ @@ -799,7 +799,7 @@ def create_supervised_evaluator( Added ``model_transform`` to transform model's output .. versionchanged:: 0.4.13 Added `model_fn` to customize model's application on the sample - .. versionchanged:: 0.5.1 + .. 
versionchanged:: 0.5.0 Added support for ``mps`` device """ device_type = device.type if isinstance(device, torch.device) else device diff --git a/ignite/engine/engine.py b/ignite/engine/engine.py index 865218af3599..24e7f885ec8d 100644 --- a/ignite/engine/engine.py +++ b/ignite/engine/engine.py @@ -1,6 +1,5 @@ import functools import logging -import math import time import warnings import weakref @@ -157,7 +156,7 @@ def __init__(self, process_function: Callable[["Engine", Any], Any]): _check_signature(process_function, "process_function", self, None) # generator provided by self._internal_run_as_gen - self._internal_run_generator: Optional[Generator] = None + self._internal_run_generator: Optional[Generator[Any, None, State]] = None def register_events( self, *event_names: Union[List[str], List[EventEnum]], event_to_attr: Optional[dict] = None @@ -731,14 +730,13 @@ def load_state_dict(self, state_dict: Mapping) -> None: @staticmethod def _is_done(state: State) -> bool: - is_done_iters = state.max_iters is not None and state.iteration >= state.max_iters is_done_count = ( state.epoch_length is not None and state.max_epochs is not None and state.iteration >= state.epoch_length * state.max_epochs ) is_done_epochs = state.max_epochs is not None and state.epoch >= state.max_epochs - return is_done_iters or is_done_count or is_done_epochs + return is_done_count or is_done_epochs def set_data(self, data: Union[Iterable, DataLoader]) -> None: """Method to set data. After calling the method the next batch passed to `processing_function` is @@ -780,14 +778,13 @@ def run( self, data: Optional[Iterable] = None, max_epochs: Optional[int] = None, - max_iters: Optional[int] = None, epoch_length: Optional[int] = None, ) -> State: """Runs the ``process_function`` over the passed data. Engine has a state and the following logic is applied in this function: - - At the first call, new state is defined by `max_epochs`, `max_iters`, `epoch_length`, if provided. + - At the first call, new state is defined by `max_epochs`, `epoch_length`, if provided. A timer for total and per-epoch time is initialized when Events.STARTED is handled. - If state is already defined such that there are iterations to run until `max_epochs` and no input arguments provided, state is kept and used in the function. @@ -805,9 +802,6 @@ def run( `len(data)`. If `data` is an iterator and `epoch_length` is not set, then it will be automatically determined as the iteration on which data iterator raises `StopIteration`. This argument should not change if run is resuming from a state. - max_iters: Number of iterations to run for. - `max_iters` and `max_epochs` are mutually exclusive; only one of the two arguments should be provided. - Returns: State: output state. @@ -858,6 +852,8 @@ def switch_batch(engine): if self.state.max_epochs is None or (self._is_done(self.state) and self._internal_run_generator is None): # Create new state + if max_epochs is None: + max_epochs = 1 if epoch_length is None: if data is None: raise ValueError("epoch_length should be provided if data is None") @@ -866,22 +862,9 @@ def switch_batch(engine): if epoch_length is not None and epoch_length < 1: raise ValueError("Input data has zero size. Please provide non-empty data") - if max_iters is None: - if max_epochs is None: - max_epochs = 1 - else: - if max_epochs is not None: - raise ValueError( - "Arguments max_iters and max_epochs are mutually exclusive." - "Please provide only max_epochs or max_iters." 
- ) - if epoch_length is not None: - max_epochs = math.ceil(max_iters / epoch_length) - self.state.iteration = 0 self.state.epoch = 0 self.state.max_epochs = max_epochs - self.state.max_iters = max_iters self.state.epoch_length = epoch_length # Reset generator if previously used self._internal_run_generator = None @@ -951,7 +934,7 @@ def _internal_run(self) -> State: self._internal_run_generator = None return out.value - def _internal_run_as_gen(self) -> Generator: + def _internal_run_as_gen(self) -> Generator[Any, None, State]: self.should_terminate = self.should_terminate_single_epoch = self.should_interrupt = False self._init_timers(self.state) try: @@ -1062,18 +1045,12 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]: if self.state.epoch_length is None: # Define epoch length and stop the epoch self.state.epoch_length = iter_counter - if self.state.max_iters is not None: - self.state.max_epochs = math.ceil(self.state.max_iters / self.state.epoch_length) break # Should exit while loop if we can not iterate if should_exit: - if not self._is_done(self.state): - total_iters = ( - self.state.epoch_length * self.state.max_epochs - if self.state.max_epochs is not None - else self.state.max_iters - ) + if not self._is_done(self.state) and self.state.max_epochs is not None: + total_iters = self.state.epoch_length * self.state.max_epochs warnings.warn( "Data iterator can not provide data anymore but required total number of " @@ -1104,10 +1081,6 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]: if self.state.epoch_length is not None and iter_counter == self.state.epoch_length: break - if self.state.max_iters is not None and self.state.iteration == self.state.max_iters: - self.should_terminate = True - raise _EngineTerminateException() - except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) self.should_terminate_single_epoch = False @@ -1229,18 +1202,12 @@ def _run_once_on_dataset_legacy(self) -> float: if self.state.epoch_length is None: # Define epoch length and stop the epoch self.state.epoch_length = iter_counter - if self.state.max_iters is not None: - self.state.max_epochs = math.ceil(self.state.max_iters / self.state.epoch_length) break # Should exit while loop if we can not iterate if should_exit: - if not self._is_done(self.state): - total_iters = ( - self.state.epoch_length * self.state.max_epochs - if self.state.max_epochs is not None - else self.state.max_iters - ) + if not self._is_done(self.state) and self.state.max_epochs is not None: + total_iters = self.state.epoch_length * self.state.max_epochs warnings.warn( "Data iterator can not provide data anymore but required total number of " @@ -1271,10 +1238,6 @@ def _run_once_on_dataset_legacy(self) -> float: if self.state.epoch_length is not None and iter_counter == self.state.epoch_length: break - if self.state.max_iters is not None and self.state.iteration == self.state.max_iters: - self.should_terminate = True - raise _EngineTerminateException() - except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) self.should_terminate_single_epoch = False diff --git a/ignite/engine/events.py b/ignite/engine/events.py index 9dd99348492b..aebffdfe058a 100644 --- a/ignite/engine/events.py +++ b/ignite/engine/events.py @@ -443,7 +443,6 @@ class State: state.dataloader # data passed to engine state.epoch_length # optional length of an epoch state.max_epochs # number of 
epochs to run - state.max_iters # number of iterations to run state.batch # batch passed to `process_function` state.output # output of `process_function` after a single iteration state.metrics # dictionary with defined metrics if any @@ -470,7 +469,6 @@ def __init__(self, **kwargs: Any) -> None: self.epoch = 0 self.epoch_length: Optional[int] = None self.max_epochs: Optional[int] = None - self.max_iters: Optional[int] = None self.output: Optional[int] = None self.batch: Optional[int] = None self.metrics: Dict[str, Any] = {} diff --git a/ignite/handlers/fbresearch_logger.py b/ignite/handlers/fbresearch_logger.py index a291138e48d5..4243a636b6fb 100644 --- a/ignite/handlers/fbresearch_logger.py +++ b/ignite/handlers/fbresearch_logger.py @@ -1,18 +1,18 @@ """FBResearch logger and its helper handlers.""" import datetime -from typing import Any, Optional - -# from typing import Any, Dict, Optional, Union +from typing import Any, Callable, List, Optional import torch +from ignite import utils from ignite.engine import Engine, Events from ignite.handlers import Timer - MB = 1024.0 * 1024.0 +__all__ = ["FBResearchLogger"] + class FBResearchLogger: """Logs training and validation metrics for research purposes. @@ -30,10 +30,64 @@ class FBResearchLogger: .. code-block:: python import logging - from ignite.handlers.fbresearch_logger import * - logger = FBResearchLogger(logger=logging.Logger(__name__), show_output=True) - logger.attach(trainer, name="Train", every=10, optimizer=my_optimizer) + import torch + import torch.nn as nn + import torch.optim as optim + + from ignite.engine import create_supervised_trainer, Events + from ignite.handlers.fbresearch_logger import FBResearchLogger + from ignite.utils import setup_logger + + model = nn.Linear(10, 5) + opt = optim.SGD(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + data = [(torch.rand(4, 10), torch.randint(0, 5, size=(4, ))) for _ in range(100)] + + trainer = create_supervised_trainer( + model, opt, criterion, output_transform=lambda x, y, y_pred, loss: {"total_loss": loss.item()} + ) + + logger = setup_logger("trainer", level=logging.INFO) + logger = FBResearchLogger(logger=logger, show_output=True) + logger.attach(trainer, name="Train", every=20, optimizer=opt) + + trainer.run(data, max_epochs=4) + + Output: + + .. code-block:: text + + 2024-04-22 12:05:47,843 trainer INFO: Train: start epoch [1/4] + ... Epoch [1/4] [20/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.5999 Iter time: 0.0008 s Data prep .. + ... Epoch [1/4] [40/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9297 Iter time: 0.0008 s Data prep .. + ... Epoch [1/4] [60/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9985 Iter time: 0.0008 s Data prep .. + ... Epoch [1/4] [80/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9785 Iter time: 0.0008 s Data prep .. + ... Epoch [1/4] [100/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.6211 Iter time: 0.0008 s Data prep . + ... Train: Epoch [1/4] Total time: 0:00:00 (0.0008 s / it) + ... Train: start epoch [2/4] + ... Epoch [2/4] [19/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.5981 Iter time: 0.0009 s Data prep .. + ... Epoch [2/4] [39/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9013 Iter time: 0.0008 s Data prep .. + ... Epoch [2/4] [59/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9811 Iter time: 0.0008 s Data prep .. + ... Epoch [2/4] [79/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9434 Iter time: 0.0008 s Data prep .. + ... Epoch [2/4] [99/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.6116 Iter time: 0.0008 s Data prep .. + ... 
Train: Epoch [2/4] Total time: 0:00:00 (0.0009 s / it) + ... Train: start epoch [3/4] + ... Epoch [3/4] [18/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.5972 Iter time: 0.0008 s Data prep .. + ... Epoch [3/4] [38/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.8753 Iter time: 0.0008 s Data prep .. + ... Epoch [3/4] [58/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9657 Iter time: 0.0009 s Data prep .. + ... Epoch [3/4] [78/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9112 Iter time: 0.0008 s Data prep .. + ... Epoch [3/4] [98/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.6035 Iter time: 0.0008 s Data prep .. + ... Train: Epoch [3/4] Total time: 0:00:00 (0.0009 s / it) + ... Train: start epoch [4/4] + ... Epoch [4/4] [17/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.5969 Iter time: 0.0008 s Data prep .. + ... Epoch [4/4] [37/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.8516 Iter time: 0.0008 s Data prep .. + ... Epoch [4/4] [57/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.9521 Iter time: 0.0008 s Data prep .. + ... Epoch [4/4] [77/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.8816 Iter time: 0.0008 s Data prep .. + ... Epoch [4/4] [97/100]: ETA: 0:00:00 lr: 0.00100 total_loss: 1.5966 Iter time: 0.0009 s Data prep .. + ... Train: Epoch [4/4] Total time: 0:00:00 (0.0009 s / it) + ... Train: run completed Total time: 0:00:00 """ def __init__(self, logger: Any, delimiter: str = " ", show_output: bool = False): @@ -44,7 +98,13 @@ def __init__(self, logger: Any, delimiter: str = " ", show_output: bool = False self.show_output: bool = show_output def attach( - self, engine: Engine, name: str, every: int = 1, optimizer: Optional[torch.optim.Optimizer] = None + self, + engine: Engine, + name: str, + every: int = 1, + output_transform: Optional[Callable] = None, + state_attributes: Optional[List[str]] = None, + optimizer: Optional[torch.optim.Optimizer] = None, ) -> None: """Attaches all the logging handlers to the given engine. @@ -52,8 +112,13 @@ def attach( engine: The engine to attach the logging handlers to. name: The name of the engine (e.g., "Train", "Validate") to include in log messages. every: Frequency of iterations to log information. Logs are generated every 'every' iterations. + output_transform: A function to select the value to log. + state_attributes: A list of attributes to log. optimizer: The optimizer used during training to log current learning rates. 
""" + self.name = name + self.output_transform = output_transform + self.state_attributes = state_attributes engine.add_event_handler(Events.EPOCH_STARTED, self.log_epoch_started, engine, name) engine.add_event_handler(Events.ITERATION_COMPLETED(every=every), self.log_every, engine, optimizer=optimizer) engine.add_event_handler(Events.EPOCH_COMPLETED, self.log_epoch_completed, engine, name) @@ -97,10 +162,9 @@ def log_every(self, engine: Engine, optimizer: Optional[torch.optim.Optimizer] = outputs = [] if self.show_output and engine.state.output is not None: output = engine.state.output - if isinstance(output, dict): - outputs += [f"{k}: {v:.4f}" for k, v in output.items()] - else: - outputs += [f"{v:.4f}" if isinstance(v, float) else f"{v}" for v in output] # type: ignore + if self.output_transform is not None: + output = self.output_transform(output) + outputs = utils._to_str_list(output) lrs = "" if optimizer is not None: @@ -110,6 +174,11 @@ def log_every(self, engine: Engine, optimizer: Optional[torch.optim.Optimizer] = for i, g in enumerate(optimizer.param_groups): lrs += f"lr [g{i}]: {g['lr']:.5f}" + state_attrs = [] + if self.state_attributes is not None: + state_attrs = utils._to_str_list( + {name: getattr(engine.state, name, None) for name in self.state_attributes} + ) msg = self.delimiter.join( [ f"Epoch [{engine.state.epoch}/{engine.state.max_epochs}]", @@ -118,6 +187,7 @@ def log_every(self, engine: Engine, optimizer: Optional[torch.optim.Optimizer] = f"{lrs}", ] + outputs + + [" ".join(state_attrs)] + [ f"Iter time: {iter_avg_time:.4f} s", f"Data prep time: {self.data_timer.value():.4f} s", diff --git a/ignite/handlers/lr_finder.py b/ignite/handlers/lr_finder.py index 2b3e58c05aeb..3643709a1b61 100644 --- a/ignite/handlers/lr_finder.py +++ b/ignite/handlers/lr_finder.py @@ -105,7 +105,6 @@ def _run( max_iter = trainer.state.epoch_length * trainer.state.max_epochs # type: ignore[operator] if max_iter < num_iter: max_iter = num_iter - trainer.state.max_iters = num_iter trainer.state.max_epochs = ceil(num_iter / trainer.state.epoch_length) # type: ignore[operator] if not trainer.has_event_handler(self._reached_num_iterations): @@ -542,7 +541,7 @@ def __init__( # override base_lrs self.base_lrs = start_lrs - def get_lr(self) -> List[float]: # type: ignore[override] + def get_lr(self) -> List[float]: curr_iter = self.last_epoch + 1 r = curr_iter / self.num_iter return [base_lr * (end_lr / base_lr) ** r for end_lr, base_lr in zip(self.end_lrs, self.base_lrs)] diff --git a/ignite/handlers/param_scheduler.py b/ignite/handlers/param_scheduler.py index d0d0cba4fd8f..dee9a4116b81 100644 --- a/ignite/handlers/param_scheduler.py +++ b/ignite/handlers/param_scheduler.py @@ -7,7 +7,7 @@ from collections import OrderedDict from copy import copy from pathlib import Path -from typing import Any, cast, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union +from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union import torch from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau @@ -992,7 +992,7 @@ def get_param(self) -> Union[float, List[float]]: """Method to get current optimizer's parameter value""" # Emulate context manager for pytorch>=1.4 self.lr_scheduler._get_lr_called_within_step = True # type: ignore[union-attr] - lr_list = cast(List[float], self.lr_scheduler.get_lr()) + lr_list = self.lr_scheduler.get_lr() self.lr_scheduler._get_lr_called_within_step = False # type: ignore[union-attr] if len(lr_list) == 1: return 
lr_list[0] @@ -1670,7 +1670,7 @@ def __init__( _scheduler_kwargs["verbose"] = False self.scheduler = ReduceLROnPlateau(optimizer, **_scheduler_kwargs) - self.scheduler._reduce_lr = self._reduce_lr # type: ignore[attr-defined] + self.scheduler._reduce_lr = self._reduce_lr # type: ignore[method-assign] self._state_attrs += ["metric_name", "scheduler"] diff --git a/ignite/handlers/wandb_logger.py b/ignite/handlers/wandb_logger.py index 3f8e44840c71..621ff9d3ebd6 100644 --- a/ignite/handlers/wandb_logger.py +++ b/ignite/handlers/wandb_logger.py @@ -134,6 +134,7 @@ def __init__(self, *args: Any, **kwargs: Any): "You man install wandb with the command:\n pip install wandb\n" ) if kwargs.get("init", True): + kwargs.pop("init", None) wandb.init(*args, **kwargs) def __getattr__(self, attr: Any) -> Any: diff --git a/ignite/metrics/__init__.py b/ignite/metrics/__init__.py index 1b23257d4aa0..142a13e5934e 100644 --- a/ignite/metrics/__init__.py +++ b/ignite/metrics/__init__.py @@ -14,13 +14,18 @@ from ignite.metrics.gan.fid import FID from ignite.metrics.gan.inception_score import InceptionScore from ignite.metrics.gpu_info import GpuInfo +from ignite.metrics.js_divergence import JSDivergence +from ignite.metrics.kl_divergence import KLDivergence from ignite.metrics.loss import Loss +from ignite.metrics.maximum_mean_discrepancy import MaximumMeanDiscrepancy from ignite.metrics.mean_absolute_error import MeanAbsoluteError from ignite.metrics.mean_pairwise_distance import MeanPairwiseDistance from ignite.metrics.mean_squared_error import MeanSquaredError from ignite.metrics.metric import BatchFiltered, BatchWise, EpochWise, Metric, MetricUsage +from ignite.metrics.metric_group import MetricGroup from ignite.metrics.metrics_lambda import MetricsLambda from ignite.metrics.multilabel_confusion_matrix import MultiLabelConfusionMatrix +from ignite.metrics.mutual_information import MutualInformation from ignite.metrics.nlp.bleu import Bleu from ignite.metrics.nlp.rouge import Rouge, RougeL, RougeN from ignite.metrics.precision import Precision @@ -37,6 +42,7 @@ "Metric", "Accuracy", "Loss", + "MetricGroup", "MetricsLambda", "MeanAbsoluteError", "MeanPairwiseDistance", @@ -56,7 +62,11 @@ "InceptionScore", "mIoU", "JaccardIndex", + "JSDivergence", + "KLDivergence", + "MaximumMeanDiscrepancy", "MultiLabelConfusionMatrix", + "MutualInformation", "Precision", "PSNR", "Recall", diff --git a/ignite/metrics/accumulation.py b/ignite/metrics/accumulation.py index 426b35a21abc..4457917c5beb 100644 --- a/ignite/metrics/accumulation.py +++ b/ignite/metrics/accumulation.py @@ -34,7 +34,12 @@ class VariableAccumulation(Metric): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
""" required_output_keys = None @@ -45,13 +50,16 @@ def __init__( op: Callable, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): if not callable(op): raise TypeError(f"Argument op should be a callable, but given {type(op)}") self._op = op - super(VariableAccumulation, self).__init__(output_transform=output_transform, device=device) + super(VariableAccumulation, self).__init__( + output_transform=output_transform, device=device, skip_unrolling=skip_unrolling + ) @reinit__is_reduced def reset(self) -> None: @@ -110,6 +118,9 @@ class Average(VariableAccumulation): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: @@ -164,17 +175,25 @@ class Average(VariableAccumulation): .. testoutput:: tensor([1.5000, 1.5000, 1.5000], dtype=torch.float64) + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ def __init__( - self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu") + self, + output_transform: Callable = lambda x: x, + device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): def _mean_op(a: Union[float, torch.Tensor], x: Union[float, torch.Tensor]) -> Union[float, torch.Tensor]: if isinstance(x, torch.Tensor) and x.ndim > 1: x = x.sum(dim=0) return a + x - super(Average, self).__init__(op=_mean_op, output_transform=output_transform, device=device) + super(Average, self).__init__( + op=_mean_op, output_transform=output_transform, device=device, skip_unrolling=skip_unrolling + ) @sync_all_reduce("accumulator", "num_examples") def compute(self) -> Union[float, torch.Tensor]: @@ -200,6 +219,9 @@ class GeometricAverage(VariableAccumulation): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Note: @@ -267,10 +289,16 @@ class GeometricAverage(VariableAccumulation): .. testoutput:: tensor([2.2134, 2.2134, 2.2134], dtype=torch.float64) + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
""" def __init__( - self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu") + self, + output_transform: Callable = lambda x: x, + device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): def _geom_op(a: torch.Tensor, x: Union[float, torch.Tensor]) -> torch.Tensor: if not isinstance(x, torch.Tensor): @@ -280,7 +308,9 @@ def _geom_op(a: torch.Tensor, x: Union[float, torch.Tensor]) -> torch.Tensor: x = x.sum(dim=0) return a + x - super(GeometricAverage, self).__init__(op=_geom_op, output_transform=output_transform, device=device) + super(GeometricAverage, self).__init__( + op=_geom_op, output_transform=output_transform, device=device, skip_unrolling=skip_unrolling + ) @sync_all_reduce("accumulator", "num_examples") def compute(self) -> Union[float, torch.Tensor]: diff --git a/ignite/metrics/accuracy.py b/ignite/metrics/accuracy.py index 0bfe62b85b7b..c205008e24c4 100644 --- a/ignite/metrics/accuracy.py +++ b/ignite/metrics/accuracy.py @@ -14,11 +14,14 @@ def __init__( output_transform: Callable = lambda x: x, is_multilabel: bool = False, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): self._is_multilabel = is_multilabel self._type: Optional[str] = None self._num_classes: Optional[int] = None - super(_BaseClassification, self).__init__(output_transform=output_transform, device=device) + super(_BaseClassification, self).__init__( + output_transform=output_transform, device=device, skip_unrolling=skip_unrolling + ) def reset(self) -> None: self._type = None @@ -114,6 +117,9 @@ class Accuracy(_BaseClassification): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: @@ -206,6 +212,9 @@ def thresholded_output_transform(output): .. testoutput:: 4 0.6666... + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ _state_dict_all_req_keys = ("_num_correct", "_num_examples") @@ -215,8 +224,11 @@ def __init__( output_transform: Callable = lambda x: x, is_multilabel: bool = False, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): - super(Accuracy, self).__init__(output_transform=output_transform, is_multilabel=is_multilabel, device=device) + super(Accuracy, self).__init__( + output_transform=output_transform, is_multilabel=is_multilabel, device=device, skip_unrolling=skip_unrolling + ) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/metrics/average_precision.py b/ignite/metrics/average_precision.py index e2dab8b09abc..03d53de4e29b 100644 --- a/ignite/metrics/average_precision.py +++ b/ignite/metrics/average_precision.py @@ -28,6 +28,9 @@ class AveragePrecision(EpochMetric): #sklearn.metrics.average_precision_score>`_ is run on the first batch of data to ensure there are no issues. User will be warned in case there are any issues computing the function. device: optional device specification for internal storage. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. 
Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Note: AveragePrecision expects y to be comprised of 0's and 1's. y_pred must either be probability estimates or @@ -60,6 +63,8 @@ def activated_output_transform(output): 0.9166... + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ def __init__( @@ -67,6 +72,7 @@ def __init__( output_transform: Callable = lambda x: x, check_compute_fn: bool = False, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): try: from sklearn.metrics import average_precision_score # noqa: F401 @@ -78,4 +84,5 @@ def __init__( output_transform=output_transform, check_compute_fn=check_compute_fn, device=device, + skip_unrolling=skip_unrolling, ) diff --git a/ignite/metrics/cohen_kappa.py b/ignite/metrics/cohen_kappa.py index 92d9b07aa4a6..15cb0222c250 100644 --- a/ignite/metrics/cohen_kappa.py +++ b/ignite/metrics/cohen_kappa.py @@ -23,6 +23,9 @@ class CohenKappa(EpochMetric): is run on the first batch of data to ensure there are no issues. User will be warned in case there are any issues computing the function. device: optional device specification for internal storage. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -46,6 +49,8 @@ class CohenKappa(EpochMetric): 0.4285... + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ def __init__( @@ -54,6 +59,7 @@ def __init__( weights: Optional[str] = None, check_compute_fn: bool = False, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): try: from sklearn.metrics import cohen_kappa_score # noqa: F401 @@ -72,6 +78,7 @@ def __init__( output_transform=output_transform, check_compute_fn=check_compute_fn, device=device, + skip_unrolling=skip_unrolling, ) def get_cohen_kappa_fn(self) -> Callable[[torch.Tensor, torch.Tensor], float]: diff --git a/ignite/metrics/confusion_matrix.py b/ignite/metrics/confusion_matrix.py index 75a9f9848a29..95fd06897ecf 100644 --- a/ignite/metrics/confusion_matrix.py +++ b/ignite/metrics/confusion_matrix.py @@ -34,6 +34,9 @@ class ConfusionMatrix(Metric): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Note: The confusion matrix is formatted such that columns are predictions and rows are targets. @@ -98,6 +101,9 @@ def binary_one_hot_output_transform(output): tensor([[2, 1], [1, 1]]) + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
""" _state_dict_all_req_keys = ("confusion_matrix", "_num_examples") @@ -108,6 +114,7 @@ def __init__( average: Optional[str] = None, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = True, ): if average is not None and average not in ("samples", "recall", "precision"): raise ValueError("Argument average can None or one of 'samples', 'recall', 'precision'") @@ -118,7 +125,9 @@ def __init__( self.num_classes = num_classes self._num_examples = 0 self.average = average - super(ConfusionMatrix, self).__init__(output_transform=output_transform, device=device) + super(ConfusionMatrix, self).__init__( + output_transform=output_transform, device=device, skip_unrolling=skip_unrolling + ) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/metrics/cosine_similarity.py b/ignite/metrics/cosine_similarity.py index a9760530ea7b..9b9e44d90655 100644 --- a/ignite/metrics/cosine_similarity.py +++ b/ignite/metrics/cosine_similarity.py @@ -29,6 +29,9 @@ class CosineSimilarity(Metric): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -65,6 +68,9 @@ class CosineSimilarity(Metric): .. testoutput:: 0.5080491304397583 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ def __init__( @@ -72,8 +78,9 @@ def __init__( eps: float = 1e-8, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): - super().__init__(output_transform, device) + super().__init__(output_transform, device, skip_unrolling=skip_unrolling) self.eps = eps diff --git a/ignite/metrics/entropy.py b/ignite/metrics/entropy.py index b3d0cff21b6c..9aa75be54f20 100644 --- a/ignite/metrics/entropy.py +++ b/ignite/metrics/entropy.py @@ -30,6 +30,9 @@ class Entropy(Metric): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -58,6 +61,9 @@ class Entropy(Metric): .. testoutput:: 0.8902875582377116 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
""" _state_dict_all_req_keys = ("_sum_of_entropies", "_num_examples") @@ -80,9 +86,13 @@ def update(self, output: Sequence[torch.Tensor]) -> None: prob = F.softmax(y_pred, dim=1) log_prob = F.log_softmax(y_pred, dim=1) + + self._update(prob, log_prob) + + def _update(self, prob: torch.Tensor, log_prob: torch.Tensor) -> None: entropy_sum = -torch.sum(prob * log_prob) self._sum_of_entropies += entropy_sum.to(self._device) - self._num_examples += y_pred.shape[0] + self._num_examples += prob.shape[0] @sync_all_reduce("_sum_of_entropies", "_num_examples") def compute(self) -> float: diff --git a/ignite/metrics/epoch_metric.py b/ignite/metrics/epoch_metric.py index 116a841e49ff..5918b6428120 100644 --- a/ignite/metrics/epoch_metric.py +++ b/ignite/metrics/epoch_metric.py @@ -65,6 +65,9 @@ def mse_fn(y_preds, y_targets): Warnings: EpochMetricWarning: User is warned that there are issues with ``compute_fn`` on a batch of data processed. To disable the warning, set ``check_compute_fn=False``. + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ _state_dict_all_req_keys = ("_predictions", "_targets") @@ -75,6 +78,7 @@ def __init__( output_transform: Callable = lambda x: x, check_compute_fn: bool = True, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ) -> None: if not callable(compute_fn): raise TypeError("Argument compute_fn should be callable.") @@ -82,7 +86,9 @@ def __init__( self.compute_fn = compute_fn self._check_compute_fn = check_compute_fn - super(EpochMetric, self).__init__(output_transform=output_transform, device=device) + super(EpochMetric, self).__init__( + output_transform=output_transform, device=device, skip_unrolling=skip_unrolling + ) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/metrics/frequency.py b/ignite/metrics/frequency.py index 8c63edd1ec97..921471bdeba2 100644 --- a/ignite/metrics/frequency.py +++ b/ignite/metrics/frequency.py @@ -11,6 +11,18 @@ class Frequency(Metric): """Provides metrics for the number of examples processed per second. + Args: + output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. This can be useful if, for example, you have a multi-output model and + you want to compute the metric with respect to one of the outputs. + device: specifies which device updates are accumulated on. Setting the metric's + device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By + default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. + Examples: For more information on how metric works with :class:`~ignite.engine.engine.Engine`, visit :ref:`attach-engine`. @@ -36,12 +48,18 @@ class Frequency(Metric): ProgressBar(persist=True).attach(trainer, metric_names=['wps']) # Progress bar will look like # Epoch [2/10]: [50/100] 50%|█████ , wps=400 [00:17<00:35] + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
""" def __init__( - self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu") + self, + output_transform: Callable = lambda x: x, + device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ) -> None: - super(Frequency, self).__init__(output_transform=output_transform, device=device) + super(Frequency, self).__init__(output_transform=output_transform, device=device, skip_unrolling=skip_unrolling) @reinit__is_reduced def reset(self) -> None: @@ -49,7 +67,7 @@ def reset(self) -> None: self._acc = 0 self._n = 0 self._elapsed = 0.0 - super(Frequency, self).reset() + super(Frequency, self).reset() # type: ignore @reinit__is_reduced def update(self, output: int) -> None: diff --git a/ignite/metrics/gan/fid.py b/ignite/metrics/gan/fid.py index 188bad5035a2..b74efe3e0e9a 100644 --- a/ignite/metrics/gan/fid.py +++ b/ignite/metrics/gan/fid.py @@ -226,7 +226,7 @@ def reset(self) -> None: self._test_total = torch.zeros(self._num_features, dtype=torch.float64, device=self._device) self._num_examples: int = 0 - super(FID, self).reset() + super(FID, self).reset() # type: ignore @reinit__is_reduced def update(self, output: Sequence[torch.Tensor]) -> None: diff --git a/ignite/metrics/gan/inception_score.py b/ignite/metrics/gan/inception_score.py index 60b1d4785f71..b2a179fa65d1 100644 --- a/ignite/metrics/gan/inception_score.py +++ b/ignite/metrics/gan/inception_score.py @@ -106,7 +106,7 @@ def reset(self) -> None: self._prob_total = torch.zeros(self._num_features, dtype=torch.float64, device=self._device) self._total_kl_d = torch.zeros(self._num_features, dtype=torch.float64, device=self._device) - super(InceptionScore, self).reset() + super(InceptionScore, self).reset() # type: ignore @reinit__is_reduced def update(self, output: torch.Tensor) -> None: diff --git a/ignite/metrics/js_divergence.py b/ignite/metrics/js_divergence.py new file mode 100644 index 000000000000..204995dd0ae8 --- /dev/null +++ b/ignite/metrics/js_divergence.py @@ -0,0 +1,106 @@ +import torch +import torch.nn.functional as F +from packaging.version import Version + +from ignite.exceptions import NotComputableError +from ignite.metrics.kl_divergence import KLDivergence +from ignite.metrics.metric import sync_all_reduce + +__all__ = ["JSDivergence"] + +TORCH_VERSION_GE_160 = Version(torch.__version__) >= Version("1.6.0") + + +class JSDivergence(KLDivergence): + r"""Calculates the mean of `Jensen-Shannon (JS) divergence + `_. + + .. math:: + \begin{align*} + D_\text{JS}(\mathbf{p}_i \| \mathbf{q}_i) &= \frac{1}{2} D_\text{KL}(\mathbf{p}_i \| \mathbf{m}_i) + + \frac{1}{2} D_\text{KL}(\mathbf{q}_i \| \mathbf{m}_i), \\ + \mathbf{m}_i &= \frac{1}{2}(\mathbf{p}_i + \mathbf{q}_i), \\ + D_\text{KL}(\mathbf{p}_i \| \mathbf{q}_i) &= \sum_{c=1}^C p_{i,c} \log \frac{p_{i,c}}{q_{i,c}}. + \end{align*} + + where :math:`\mathbf{p}_i` and :math:`\mathbf{q}_i` are the ground truth and prediction probability tensors, + and :math:`D_\text{KL}` is the KL-divergence. + + - ``update`` must receive output of the form ``(y_pred, y)``. + - ``y_pred`` and ``y`` are expected to be the unnormalized logits for each class. :math:`(B, C)` (classification) + or :math:`(B, C, ...)` (e.g., image segmentation) shapes are allowed. + + Args: + output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. 
This can be useful if, for example, you have a multi-output model and + you want to compute the metric with respect to one of the outputs. + By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. + device: specifies which device updates are accumulated on. Setting the + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is + non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. + + Examples: + To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. + The output of the engine's ``process_function`` needs to be in the format of + ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y, ...}``. If not, ``output_tranform`` can be added + to the metric to transform the output into the form expected by the metric. + + For more information on how metric works with :class:`~ignite.engine.engine.Engine`, visit :ref:`attach-engine`. + + .. include:: defaults.rst + :start-after: :orphan: + + .. testcode:: + + metric = JSDivergence() + metric.attach(default_evaluator, 'js-div') + y_true = torch.tensor([ + [ 0.0000, -2.3026, -2.3026], + [ 1.3863, 1.6094, 1.6094], + [ 0.0000, 0.6931, 1.0986] + ]) + y_pred = torch.tensor([ + [ 0.0000, 0.6931, 1.0986], + [ 1.3863, 1.6094, 1.6094], + [ 0.0000, -2.3026, -2.3026] + ]) + state = default_evaluator.run([[y_pred, y_true]]) + print(state.metrics['js-div']) + + .. testoutput:: + + 0.16266516844431558 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. + """ + + def _update(self, y_pred: torch.Tensor, y: torch.Tensor) -> None: + y_pred_prob = F.softmax(y_pred, dim=1) + y_prob = F.softmax(y, dim=1) + m_prob = (y_pred_prob + y_prob) / 2 + m_log = m_prob.log() + + if TORCH_VERSION_GE_160: + # log_target option can be used from 1.6.0 + y_pred_log = F.log_softmax(y_pred, dim=1) + y_log = F.log_softmax(y, dim=1) + self._sum_of_kl += ( + F.kl_div(m_log, y_pred_log, log_target=True, reduction="sum") + + F.kl_div(m_log, y_log, log_target=True, reduction="sum") + ).to(self._device) + else: + # y_pred and y are expected to be probabilities + self._sum_of_kl += ( + F.kl_div(m_log, y_pred_prob, reduction="sum") + F.kl_div(m_log, y_prob, reduction="sum") + ).to(self._device) + + @sync_all_reduce("_sum_of_kl", "_num_examples") + def compute(self) -> float: + if self._num_examples == 0: + raise NotComputableError("JSDivergence must have at least one example before it can be computed.") + return self._sum_of_kl.item() / (self._num_examples * 2) diff --git a/ignite/metrics/kl_divergence.py b/ignite/metrics/kl_divergence.py new file mode 100644 index 000000000000..4f285ea797a9 --- /dev/null +++ b/ignite/metrics/kl_divergence.py @@ -0,0 +1,119 @@ +from typing import Sequence + +import torch +import torch.nn.functional as F +from packaging.version import Version + +from ignite.exceptions import NotComputableError +from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce + +__all__ = ["KLDivergence"] + +TORCH_VERSION_GE_160 = Version(torch.__version__) >= Version("1.6.0") + + +class KLDivergence(Metric): + r"""Calculates the mean of `Kullback-Leibler (KL) divergence + `_. + + .. 
math:: D_\text{KL}(\mathbf{p}_i \| \mathbf{q}_i) = \sum_{c=1}^C p_{i,c} \log \frac{p_{i,c}}{q_{i,c}} + + where :math:`\mathbf{p}_i` and :math:`\mathbf{q}_i` are the ground truth and prediction probability tensors. + + - ``update`` must receive output of the form ``(y_pred, y)``. + - ``y_pred`` and ``y`` are expected to be the unnormalized logits for each class. :math:`(B, C)` (classification) + or :math:`(B, C, ...)` (e.g., image segmentation) shapes are allowed. + + Args: + output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. This can be useful if, for example, you have a multi-output model and + you want to compute the metric with respect to one of the outputs. + By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. + device: specifies which device updates are accumulated on. Setting the + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is + non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. + + Examples: + To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. + The output of the engine's ``process_function`` needs to be in the format of + ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y, ...}``. If not, ``output_tranform`` can be added + to the metric to transform the output into the form expected by the metric. + + For more information on how metric works with :class:`~ignite.engine.engine.Engine`, visit :ref:`attach-engine`. + + .. include:: defaults.rst + :start-after: :orphan: + + .. testcode:: + + metric = KLDivergence() + metric.attach(default_evaluator, 'kl-div') + y_true = torch.tensor([ + [ 0.0000, -2.3026, -2.3026], + [ 1.3863, 1.6094, 1.6094], + [ 0.0000, 0.6931, 1.0986] + ]) + y_pred = torch.tensor([ + [ 0.0000, 0.6931, 1.0986], + [ 1.3863, 1.6094, 1.6094], + [ 0.0000, -2.3026, -2.3026] + ]) + state = default_evaluator.run([[y_pred, y_true]]) + print(state.metrics['kl-div']) + + .. testoutput:: + + 0.7220296859741211 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. + """ + + _state_dict_all_req_keys = ("_sum_of_kl", "_num_examples") + + @reinit__is_reduced + def reset(self) -> None: + self._sum_of_kl = torch.tensor(0.0, device=self._device) + self._num_examples = 0 + + @reinit__is_reduced + def update(self, output: Sequence[torch.Tensor]) -> None: + y_pred, y = output[0].detach(), output[1].detach() + if y_pred.shape != y.shape: + raise ValueError(f"y_pred and y must be in the same shape, got {y_pred.shape} != {y.shape}.") + + if y_pred.ndim >= 3: + num_classes = y_pred.shape[1] + # (B, C, ...) -> (B, ..., C) -> (B*..., C) + # regarding as B*... 
predictions + y_pred = y_pred.movedim(1, -1).reshape(-1, num_classes) + y = y.movedim(1, -1).reshape(-1, num_classes) + elif y_pred.ndim == 1: + raise ValueError(f"y_pred must be in the shape of (B, C) or (B, C, ...), got {y_pred.shape}.") + + self._num_examples += y_pred.shape[0] + self._update(y_pred, y) + + def _update(self, y_pred: torch.Tensor, y: torch.Tensor) -> None: + y_pred = F.log_softmax(y_pred, dim=1) + + if TORCH_VERSION_GE_160: + # log_target option can be used from 1.6.0 + y = F.log_softmax(y, dim=1) + kl_sum = F.kl_div(y_pred, y, log_target=True, reduction="sum") + else: + # y is expected to be a probability tensor + y = F.softmax(y, dim=1) + kl_sum = F.kl_div(y_pred, y, reduction="sum") + + self._sum_of_kl += kl_sum.to(self._device) + + @sync_all_reduce("_sum_of_kl", "_num_examples") + def compute(self) -> float: + if self._num_examples == 0: + raise NotComputableError("KLDivergence must have at least one example before it can be computed.") + return self._sum_of_kl.item() / self._num_examples diff --git a/ignite/metrics/loss.py b/ignite/metrics/loss.py index 7182e7033d54..2be0a7d2387f 100644 --- a/ignite/metrics/loss.py +++ b/ignite/metrics/loss.py @@ -29,6 +29,9 @@ class Loss(Metric): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether input should be unrolled or not before it is passed to to loss_fn. + Should be true for multi-output model, for example, if ``y_pred`` contains multi-ouput as + ``(y_pred_a, y_pred_b)`` Attributes: required_output_keys: dictionary defines required keys to be found in ``engine.state.output`` if the @@ -62,6 +65,8 @@ class Loss(Metric): -0.3499999... + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ required_output_keys = ("y_pred", "y", "criterion_kwargs") @@ -73,8 +78,9 @@ def __init__( output_transform: Callable = lambda x: x, batch_size: Callable = len, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): - super(Loss, self).__init__(output_transform, device=device) + super(Loss, self).__init__(output_transform, device=device, skip_unrolling=skip_unrolling) self._loss_fn = loss_fn self._batch_size = batch_size diff --git a/ignite/metrics/maximum_mean_discrepancy.py b/ignite/metrics/maximum_mean_discrepancy.py new file mode 100644 index 000000000000..8a5d04a2c858 --- /dev/null +++ b/ignite/metrics/maximum_mean_discrepancy.py @@ -0,0 +1,148 @@ +from typing import Callable, Sequence + +import torch + +from ignite.exceptions import NotComputableError +from ignite.metrics.metric import Metric, reinit__is_reduced, sync_all_reduce + +__all__ = ["MaximumMeanDiscrepancy"] + + +class MaximumMeanDiscrepancy(Metric): + r"""Calculates the mean of `maximum mean discrepancy (MMD) + `_. + + .. math:: + \begin{align*} + \text{MMD}^2 (P,Q) &= \underset{\| f \| \leq 1}{\text{sup}} | \mathbb{E}_{X\sim P}[f(X)] + - \mathbb{E}_{Y\sim Q}[f(Y)] |^2 \\ + &\approx \frac{1}{B(B-1)} \sum_{i=1}^B \sum_{\substack{j=1 \\ j\neq i}}^B k(\mathbf{x}_i,\mathbf{x}_j) + -\frac{2}{B^2}\sum_{i=1}^B \sum_{j=1}^B k(\mathbf{x}_i,\mathbf{y}_j) + + \frac{1}{B(B-1)} \sum_{i=1}^B \sum_{\substack{j=1 \\ j\neq i}}^B k(\mathbf{y}_i,\mathbf{y}_j) + \end{align*} + + where :math:`B` is the batch size, and :math:`\mathbf{x}_i` and :math:`\mathbf{y}_j` are + feature vectors sampled from :math:`P` and :math:`Q`, respectively. 
+ :math:`k(\mathbf{x},\mathbf{y})=\exp(-\| \mathbf{x}-\mathbf{y} \|^2/ 2\sigma^2)` is the Gaussian RBF kernel. + + This metric computes the MMD for each batch and takes the average. + + More details can be found in `Gretton et al. 2012`__. + + __ https://www.jmlr.org/papers/volume13/gretton12a/gretton12a.pdf + + - ``update`` must receive output of the form ``(x, y)``. + - ``x`` and ``y`` are expected to be in the same shape :math:`(B, \ldots)`. + + Args: + var: the bandwidth :math:`\sigma^2` of the kernel. Default: 1.0 + output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. This can be useful if, for example, you have a multi-output model and + you want to compute the metric with respect to one of the outputs. + By default, this metric requires the output as ``(x, y)``. + device: specifies which device updates are accumulated on. Setting the + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is + non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. + + Examples: + To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. + The output of the engine's ``process_function`` needs to be in the format of + ``(x, y)``. If not, ``output_tranform`` can be added + to the metric to transform the output into the form expected by the metric. + + For more information on how metric works with :class:`~ignite.engine.engine.Engine`, visit :ref:`attach-engine`. + + .. include:: defaults.rst + :start-after: :orphan: + + .. testcode:: + + metric = MaximumMeanDiscrepancy() + metric.attach(default_evaluator, "mmd") + x = torch.tensor([[-0.80324818, -0.95768364, -0.03807209], + [-0.11059691, -0.38230813, -0.4111988], + [-0.8864329, -0.02890403, -0.60119252], + [-0.68732452, -0.12854739, -0.72095073], + [-0.62604613, -0.52368328, -0.24112842]]) + y = torch.tensor([[0.0686768, 0.80502737, 0.53321717], + [0.83849465, 0.59099726, 0.76385441], + [0.68688272, 0.56833803, 0.98100778], + [0.55267761, 0.13084654, 0.45382906], + [0.0754253, 0.70317304, 0.4756805]]) + state = default_evaluator.run([[x, y]]) + print(state.metrics["mmd"]) + + .. testoutput:: + + 1.072697639465332 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
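To make the estimator above concrete, here is a plain-PyTorch sketch of the same per-batch computation (Gaussian RBF kernel, unbiased within-sample averages that drop the diagonal); ``batch_mmd2`` is an illustrative helper written independently of the update logic that follows:

.. code-block:: python

    import torch

    def batch_mmd2(x: torch.Tensor, y: torch.Tensor, var: float = 1.0) -> float:
        # x, y: (B, D) feature batches drawn from P and Q
        b = x.shape[0]
        d_xx = torch.cdist(x, x) ** 2  # squared pairwise distances within x
        d_yy = torch.cdist(y, y) ** 2
        d_xy = torch.cdist(x, y) ** 2
        k_xx = torch.exp(-0.5 * d_xx / var)
        k_yy = torch.exp(-0.5 * d_yy / var)
        k_xy = torch.exp(-0.5 * d_xy / var)
        # unbiased estimate: remove the B diagonal terms k(z, z) = 1
        term_xx = (k_xx.sum() - b) / (b * (b - 1))
        term_yy = (k_yy.sum() - b) / (b * (b - 1))
        term_xy = k_xy.sum() / (b * b)
        return (term_xx + term_yy - 2.0 * term_xy).clamp(min=0.0).item()

    x = torch.rand(5, 3)
    y = torch.rand(5, 3) + 1.0  # shifted distribution, so MMD^2 is clearly positive
    print(batch_mmd2(x, y))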
+ """ + + _state_dict_all_req_keys = ("_xx_sum", "_yy_sum", "_xy_sum", "_num_batches") + + def __init__( + self, + var: float = 1.0, + output_transform: Callable = lambda x: x, + device: torch.device = torch.device("cpu"), + skip_unrolling: bool = False, + ): + self.var = var + super().__init__(output_transform, device, skip_unrolling=skip_unrolling) + + @reinit__is_reduced + def reset(self) -> None: + self._xx_sum = torch.tensor(0.0, device=self._device) + self._yy_sum = torch.tensor(0.0, device=self._device) + self._xy_sum = torch.tensor(0.0, device=self._device) + self._num_batches = 0 + + @reinit__is_reduced + def update(self, output: Sequence[torch.Tensor]) -> None: + x, y = output[0].detach(), output[1].detach() + if x.shape != y.shape: + raise ValueError(f"x and y must be in the same shape, got {x.shape} != {y.shape}.") + + if x.ndim >= 3: + x = x.flatten(start_dim=1) + y = y.flatten(start_dim=1) + elif x.ndim == 1: + raise ValueError(f"x must be in the shape of (B, ...), got {x.shape}.") + + xx, yy, zz = torch.mm(x, x.t()), torch.mm(y, y.t()), torch.mm(x, y.t()) + rx = xx.diag().unsqueeze(0).expand_as(xx) + ry = yy.diag().unsqueeze(0).expand_as(yy) + + dxx = rx.t() + rx - 2.0 * xx + dyy = ry.t() + ry - 2.0 * yy + dxy = rx.t() + ry - 2.0 * zz + + v = self.var + XX = torch.exp(-0.5 * dxx / v) + YY = torch.exp(-0.5 * dyy / v) + XY = torch.exp(-0.5 * dxy / v) + + # unbiased + n = x.shape[0] + XX = (XX.sum() - n) / (n * (n - 1)) + YY = (YY.sum() - n) / (n * (n - 1)) + XY = XY.sum() / (n * n) + + self._xx_sum += XX.to(self._device) + self._yy_sum += YY.to(self._device) + self._xy_sum += XY.to(self._device) + + self._num_batches += 1 + + @sync_all_reduce("_xx_sum", "_yy_sum", "_xy_sum", "_num_batches") + def compute(self) -> float: + if self._num_batches == 0: + raise NotComputableError("MaximumMeanDiscrepacy must have at least one batch before it can be computed.") + mmd2 = (self._xx_sum + self._yy_sum - 2.0 * self._xy_sum).clamp(min=0.0) / self._num_batches + return mmd2.sqrt().item() diff --git a/ignite/metrics/mean_absolute_error.py b/ignite/metrics/mean_absolute_error.py index eb90d3aa3c24..12fee3f12327 100644 --- a/ignite/metrics/mean_absolute_error.py +++ b/ignite/metrics/mean_absolute_error.py @@ -26,6 +26,9 @@ class MeanAbsoluteError(Metric): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -57,6 +60,9 @@ class MeanAbsoluteError(Metric): .. testoutput:: 2.9375 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ _state_dict_all_req_keys = ("_sum_of_absolute_errors", "_num_examples") diff --git a/ignite/metrics/mean_pairwise_distance.py b/ignite/metrics/mean_pairwise_distance.py index 79676564e5fb..dd6910347e9b 100644 --- a/ignite/metrics/mean_pairwise_distance.py +++ b/ignite/metrics/mean_pairwise_distance.py @@ -26,6 +26,9 @@ class MeanPairwiseDistance(Metric): device: specifies which device updates are accumulated on. 
Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-output as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -57,6 +60,9 @@ class MeanPairwiseDistance(Metric): .. testoutput:: 1.5955... + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ _state_dict_all_req_keys = ("_sum_of_distances", "_num_examples") @@ -67,8 +73,9 @@ def __init__( eps: float = 1e-6, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ) -> None: - super(MeanPairwiseDistance, self).__init__(output_transform, device=device) + super(MeanPairwiseDistance, self).__init__(output_transform, device=device, skip_unrolling=skip_unrolling) self._p = p self._eps = eps diff --git a/ignite/metrics/mean_squared_error.py b/ignite/metrics/mean_squared_error.py index 3407b4adcb70..97630f6ce7c9 100644 --- a/ignite/metrics/mean_squared_error.py +++ b/ignite/metrics/mean_squared_error.py @@ -26,6 +26,9 @@ class MeanSquaredError(Metric): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-output as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -57,6 +60,9 @@ class MeanSquaredError(Metric): .. testoutput:: 3.828125 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ _state_dict_all_req_keys = ("_sum_of_squared_errors", "_num_examples") diff --git a/ignite/metrics/metric.py b/ignite/metrics/metric.py index 39e5cb745222..453fb1291e94 100644 --- a/ignite/metrics/metric.py +++ b/ignite/metrics/metric.py @@ -233,6 +233,59 @@ class Metric(Serializable, metaclass=ABCMeta): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-output as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. + + Examples: + The following example shows a custom loss metric that expects input from a multi-output model. + + ..
code-block:: python + + import torch + import torch.nn as nn + import torch.nn.functional as F + + from ignite.engine import create_supervised_evaluator + from ignite.metrics import Loss + + class MyLoss(nn.Module): + def __init__(self, ca: float = 1.0, cb: float = 1.0) -> None: + super().__init__() + self.ca = ca + self.cb = cb + + def forward(self, + y_pred: Tuple[torch.Tensor, torch.Tensor], + y_true: Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor: + a_true, b_true = y_true + a_pred, b_pred = y_pred + return self.ca * F.mse_loss(a_pred, a_true) + self.cb * F.cross_entropy(b_pred, b_true) + + + def prepare_batch(batch, device, non_blocking): + return torch.rand(4, 1), (torch.rand(4, 1), torch.rand(4, 2)) + + + class MyModel(nn.Module): + + def forward(self, x): + return torch.rand(4, 1), torch.rand(4, 2) + + + model = MyModel() + + device = "cpu" + loss = MyLoss(0.5, 1.0) + metrics = { + "Loss": Loss(loss, skip_unrolling=True) + } + train_evaluator = create_supervised_evaluator(model, metrics, device, prepare_batch=prepare_batch) + + + data = range(10) + train_evaluator.run(data) + train_evaluator.state.metrics["Loss"] Attributes: required_output_keys: dictionary defines required keys to be found in ``engine.state.output`` if the @@ -292,6 +345,9 @@ def compute(self): .. versionchanged:: 0.4.2 ``required_output_keys`` became public attribute. + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ # public class attribute @@ -300,7 +356,10 @@ def compute(self): _required_output_keys = required_output_keys def __init__( - self, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu") + self, + output_transform: Callable = lambda x: x, + device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): self._output_transform = output_transform @@ -309,6 +368,7 @@ def __init__( raise ValueError("Cannot create metric on an XLA device. Use device='cpu' instead.") self._device = torch.device(device) + self._skip_unrolling = skip_unrolling self.reset() @abstractmethod @@ -390,7 +450,11 @@ def iteration_completed(self, engine: Engine) -> None: ) output = tuple(output[k] for k in self.required_output_keys) - if isinstance(output, Sequence) and all([_is_list_of_tensors_or_numbers(o) for o in output]): + if ( + (not self._skip_unrolling) + and isinstance(output, Sequence) + and all([_is_list_of_tensors_or_numbers(o) for o in output]) + ): if not (len(output) == 2 and len(output[0]) == len(output[1])): raise ValueError( f"Output should have 2 items of the same length, " @@ -704,6 +768,9 @@ def __floordiv__(self, other: Any) -> "MetricsLambda": def __getattr__(self, attr: str) -> Callable: from ignite.metrics.metrics_lambda import MetricsLambda + if attr.startswith("__") and attr.endswith("__"): + return object.__getattribute__(self, attr) + def fn(x: Metric, *args: Any, **kwargs: Any) -> Any: return getattr(x, attr)(*args, **kwargs) diff --git a/ignite/metrics/metric_group.py b/ignite/metrics/metric_group.py new file mode 100644 index 000000000000..58a52f658ae1 --- /dev/null +++ b/ignite/metrics/metric_group.py @@ -0,0 +1,54 @@ +from typing import Any, Callable, Dict, Sequence + +import torch + +from ignite.metrics import Metric + + +class MetricGroup(Metric): + """ + A class for grouping metrics so that user could manage them easier. + + Args: + metrics: a dictionary of names to metric instances. 
+ output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. `output_transform` of each metric in the group is also + called upon its update. + + Examples: + We construct a group of metrics, attach them to the engine at once and retrieve their result. + + .. code-block:: python + + import torch + + metric_group = MetricGroup({'acc': Accuracy(), 'precision': Precision(), 'loss': Loss(nn.NLLLoss())}) + metric_group.attach(default_evaluator, "eval_metrics") + y_true = torch.tensor([1, 0, 1, 1, 0, 1]) + y_pred = torch.tensor([1, 0, 1, 0, 1, 1]) + state = default_evaluator.run([[y_pred, y_true]]) + + # Metrics individually available in `state.metrics` + state.metrics["acc"], state.metrics["precision"], state.metrics["loss"] + + # And also altogether + state.metrics["eval_metrics"] + """ + + _state_dict_all_req_keys = ("metrics",) + + def __init__(self, metrics: Dict[str, Metric], output_transform: Callable = lambda x: x): + self.metrics = metrics + super(MetricGroup, self).__init__(output_transform=output_transform) + + def reset(self) -> None: + for m in self.metrics.values(): + m.reset() + + def update(self, output: Sequence[torch.Tensor]) -> None: + for m in self.metrics.values(): + m.update(m._output_transform(output)) + + def compute(self) -> Dict[str, Any]: + return {k: m.compute() for k, m in self.metrics.items()} diff --git a/ignite/metrics/multilabel_confusion_matrix.py b/ignite/metrics/multilabel_confusion_matrix.py index 2a7b25d68c67..e4da5ea5c70d 100644 --- a/ignite/metrics/multilabel_confusion_matrix.py +++ b/ignite/metrics/multilabel_confusion_matrix.py @@ -37,6 +37,9 @@ class MultiLabelConfusionMatrix(Metric): device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. normalized: whether to normalize confusion matrix by its sum or not. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Example: @@ -79,6 +82,8 @@ class MultiLabelConfusionMatrix(Metric): .. versionadded:: 0.4.5 + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
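``MetricGroup`` above simply delegates ``reset``, ``update`` and ``compute`` to its members, and re-applies each member's own ``output_transform`` when delegating ``update``. A standalone sketch of that delegation outside an engine; the metric choices and the scaling transform are illustrative:

.. code-block:: python

    import torch

    from ignite.metrics import MeanAbsoluteError, MeanSquaredError
    from ignite.metrics.metric_group import MetricGroup

    # The MSE member carries its own output_transform, which MetricGroup.update() applies for it.
    group = MetricGroup({
        "mae": MeanAbsoluteError(),
        "mse": MeanSquaredError(output_transform=lambda out: (out[0] * 2, out[1] * 2)),
    })

    y_pred = torch.tensor([1.0, 2.0, 3.0])
    y_true = torch.tensor([1.5, 2.0, 2.0])

    group.reset()
    group.update((y_pred, y_true))
    print(group.compute())  # {'mae': 0.5, 'mse': ...}; the mse member saw the scaled tensors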
""" _state_dict_all_req_keys = ("confusion_matrix", "_num_examples") @@ -89,6 +94,7 @@ def __init__( output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), normalized: bool = False, + skip_unrolling: bool = False, ): if num_classes <= 1: raise ValueError("Argument num_classes needs to be > 1") @@ -96,7 +102,9 @@ def __init__( self.num_classes = num_classes self._num_examples = 0 self.normalized = normalized - super(MultiLabelConfusionMatrix, self).__init__(output_transform=output_transform, device=device) + super(MultiLabelConfusionMatrix, self).__init__( + output_transform=output_transform, device=device, skip_unrolling=skip_unrolling + ) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/metrics/mutual_information.py b/ignite/metrics/mutual_information.py new file mode 100644 index 000000000000..dfacd29527d0 --- /dev/null +++ b/ignite/metrics/mutual_information.py @@ -0,0 +1,100 @@ +import torch + +from ignite.exceptions import NotComputableError +from ignite.metrics import Entropy +from ignite.metrics.metric import reinit__is_reduced, sync_all_reduce + +__all__ = ["MutualInformation"] + + +class MutualInformation(Entropy): + r"""Calculates the `mutual information `_ + between input :math:`X` and prediction :math:`Y`. + + .. math:: + \begin{align*} + I(X;Y) &= H(Y) - H(Y|X) = H \left( \frac{1}{N}\sum_{i=1}^N \hat{\mathbf{p}}_i \right) + - \frac{1}{N}\sum_{i=1}^N H(\hat{\mathbf{p}}_i), \\ + H(\mathbf{p}) &= -\sum_{c=1}^C p_c \log p_c. + \end{align*} + + where :math:`\hat{\mathbf{p}}_i` is the prediction probability vector for :math:`i`-th input, + and :math:`H(\mathbf{p})` is the entropy of :math:`\mathbf{p}`. + + Intuitively, this metric measures how well input data are clustered by classes in the feature space [1]. + + [1] https://proceedings.mlr.press/v70/hu17b.html + + - ``update`` must receive output of the form ``(y_pred, y)`` while ``y`` is not used in this metric. + - ``y_pred`` is expected to be the unnormalized logits for each class. :math:`(B, C)` (classification) + or :math:`(B, C, ...)` (e.g., image segmentation) shapes are allowed. + + Args: + output_transform: a callable that is used to transform the + :class:`~ignite.engine.engine.Engine`'s ``process_function``'s output into the + form expected by the metric. This can be useful if, for example, you have a multi-output model and + you want to compute the metric with respect to one of the outputs. + By default, metrics require the output as ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y}``. + device: specifies which device updates are accumulated on. Setting the + metric's device to be the same as your ``update`` arguments ensures the ``update`` method is + non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. + + Examples: + To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. + The output of the engine's ``process_function`` needs to be in the format of + ``(y_pred, y)`` or ``{'y_pred': y_pred, 'y': y, ...}``. If not, ``output_tranform`` can be added + to the metric to transform the output into the form expected by the metric. + + For more information on how metric works with :class:`~ignite.engine.engine.Engine`, visit :ref:`attach-engine`. + + .. 
include:: defaults.rst + :start-after: :orphan: + + .. testcode:: + + metric = MutualInformation() + metric.attach(default_evaluator, 'mutual_information') + y_true = torch.tensor([0, 1, 2]) # not considered in the MutualInformation metric. + y_pred = torch.tensor([ + [ 0.0000, 0.6931, 1.0986], + [ 1.3863, 1.6094, 1.6094], + [ 0.0000, -2.3026, -2.3026] + ]) + state = default_evaluator.run([[y_pred, y_true]]) + print(state.metrics['mutual_information']) + + .. testoutput:: + + 0.18599730730056763 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. + """ + + _state_dict_all_req_keys = ("_sum_of_probabilities",) + + @reinit__is_reduced + def reset(self) -> None: + super().reset() + self._sum_of_probabilities = torch.tensor(0.0, device=self._device) + + def _update(self, prob: torch.Tensor, log_prob: torch.Tensor) -> None: + super()._update(prob, log_prob) + # We can't use += below as _sum_of_probabilities can be a scalar and prob.sum(dim=0) is a vector + self._sum_of_probabilities = self._sum_of_probabilities + prob.sum(dim=0).to(self._device) + + @sync_all_reduce("_sum_of_probabilities", "_sum_of_entropies", "_num_examples") + def compute(self) -> float: + n = self._num_examples + if n == 0: + raise NotComputableError("MutualInformation must have at least one example before it can be computed.") + + marginal_prob = self._sum_of_probabilities / n + marginal_ent = -(marginal_prob * torch.log(marginal_prob)).sum() + conditional_ent = self._sum_of_entropies / n + mi = marginal_ent - conditional_ent + mi = torch.clamp(mi, min=0.0) # mutual information cannot be negative + return float(mi.item()) diff --git a/ignite/metrics/precision.py b/ignite/metrics/precision.py index 31fbd42b19b4..b25154eae358 100644 --- a/ignite/metrics/precision.py +++ b/ignite/metrics/precision.py @@ -21,6 +21,7 @@ def __init__( average: Optional[Union[bool, str]] = False, is_multilabel: bool = False, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): if not (average is None or isinstance(average, bool) or average in ["macro", "micro", "weighted", "samples"]): raise ValueError( @@ -35,7 +36,7 @@ def __init__( self.eps = 1e-20 self._updated = False super(_BasePrecisionRecall, self).__init__( - output_transform=output_transform, is_multilabel=is_multilabel, device=device + output_transform=output_transform, is_multilabel=is_multilabel, device=device, skip_unrolling=skip_unrolling ) def _check_type(self, output: Sequence[torch.Tensor]) -> None: @@ -241,6 +242,9 @@ class Precision(_BasePrecisionRecall): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: @@ -371,6 +375,9 @@ def thresholded_output_transform(output): .. versionchanged:: 0.4.10 Some new options were added to `average` parameter. + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
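As a sanity check on the decomposition used by ``MutualInformation`` above (entropy of the batch-averaged prediction minus the average per-prediction entropy), a standalone computation with an illustrative helper name; it should reproduce the docstring value of roughly 0.186 for the same logits:

.. code-block:: python

    import torch
    import torch.nn.functional as F

    def mutual_information(logits: torch.Tensor) -> float:
        # logits: (N, C) unnormalized scores
        prob = F.softmax(logits, dim=1)
        log_prob = F.log_softmax(logits, dim=1)
        marginal = prob.mean(dim=0)                              # average prediction
        marginal_ent = -(marginal * marginal.log()).sum()        # H(Y)
        conditional_ent = -(prob * log_prob).sum(dim=1).mean()   # H(Y|X)
        return torch.clamp(marginal_ent - conditional_ent, min=0.0).item()

    logits = torch.tensor([
        [0.0000, 0.6931, 1.0986],
        [1.3863, 1.6094, 1.6094],
        [0.0000, -2.3026, -2.3026],
    ])
    print(mutual_information(logits))  # ~0.186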
""" @reinit__is_reduced diff --git a/ignite/metrics/precision_recall_curve.py b/ignite/metrics/precision_recall_curve.py index 29b3710b58c0..5b9ece27545c 100644 --- a/ignite/metrics/precision_recall_curve.py +++ b/ignite/metrics/precision_recall_curve.py @@ -33,6 +33,9 @@ class PrecisionRecallCurve(EpochMetric): `_ is run on the first batch of data to ensure there are no issues. User will be warned in case there are any issues computing the function. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Note: PrecisionRecallCurve expects y to be comprised of 0's and 1's. y_pred must either be probability estimates @@ -69,6 +72,8 @@ def sigmoid_output_transform(output): Recall [1.0, 1.0, 1.0, 0.5, 0.0] Thresholds [0.0474, 0.5987, 0.7109, 0.9997] + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ def __init__( @@ -76,12 +81,14 @@ def __init__( output_transform: Callable = lambda x: x, check_compute_fn: bool = False, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ) -> None: super(PrecisionRecallCurve, self).__init__( precision_recall_curve_compute_fn, # type: ignore[arg-type] output_transform=output_transform, check_compute_fn=check_compute_fn, device=device, + skip_unrolling=skip_unrolling, ) def compute(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # type: ignore[override] diff --git a/ignite/metrics/psnr.py b/ignite/metrics/psnr.py index 4251a24f8f13..be9dcb2b0b36 100644 --- a/ignite/metrics/psnr.py +++ b/ignite/metrics/psnr.py @@ -30,6 +30,9 @@ class PSNR(Metric): device: specifies which device updates are accumulated on. Setting the metric’s device to be the same as your update arguments ensures the update method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -79,6 +82,9 @@ def get_y_channel(output): 16.7027966... .. versionadded:: 0.4.3 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ _state_dict_all_req_keys = ("_sum_of_batchwise_psnr", "_num_examples") @@ -88,8 +94,9 @@ def __init__( data_range: Union[int, float], output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): - super().__init__(output_transform=output_transform, device=device) + super().__init__(output_transform=output_transform, device=device, skip_unrolling=skip_unrolling) self.data_range = data_range def _check_shape_dtype(self, output: Sequence[torch.Tensor]) -> None: diff --git a/ignite/metrics/recall.py b/ignite/metrics/recall.py index b570951e291f..46331decc058 100644 --- a/ignite/metrics/recall.py +++ b/ignite/metrics/recall.py @@ -94,6 +94,9 @@ class Recall(_BasePrecisionRecall): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. 
+ skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: @@ -212,6 +215,9 @@ def thresholded_output_transform(output): .. versionchanged:: 0.4.10 Some new options were added to `average` parameter. + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ @reinit__is_reduced diff --git a/ignite/metrics/roc_auc.py b/ignite/metrics/roc_auc.py index a4ff51a09a98..4cf2f1cdd1c1 100644 --- a/ignite/metrics/roc_auc.py +++ b/ignite/metrics/roc_auc.py @@ -39,6 +39,9 @@ class ROC_AUC(EpochMetric): sklearn.metrics.roc_auc_score>`_ is run on the first batch of data to ensure there are no issues. User will be warned in case there are any issues computing the function. device: optional device specification for internal storage. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Note: @@ -71,6 +74,9 @@ def sigmoid_output_transform(output): .. testoutput:: 0.6666... + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ def __init__( @@ -78,6 +84,7 @@ def __init__( output_transform: Callable = lambda x: x, check_compute_fn: bool = False, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): try: from sklearn.metrics import roc_auc_score # noqa: F401 @@ -85,7 +92,11 @@ def __init__( raise ModuleNotFoundError("This contrib module requires scikit-learn to be installed.") super(ROC_AUC, self).__init__( - roc_auc_compute_fn, output_transform=output_transform, check_compute_fn=check_compute_fn, device=device + roc_auc_compute_fn, + output_transform=output_transform, + check_compute_fn=check_compute_fn, + device=device, + skip_unrolling=skip_unrolling, ) @@ -105,6 +116,9 @@ class RocCurve(EpochMetric): sklearn.metrics.roc_curve>`_ is run on the first batch of data to ensure there are no issues. User will be warned in case there are any issues computing the function. device: optional device specification for internal storage. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Note: RocCurve expects y to be comprised of 0's and 1's. y_pred must either be probability estimates or confidence @@ -143,6 +157,9 @@ def sigmoid_output_transform(output): .. versionchanged:: 0.4.11 added `device` argument + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
""" def __init__( @@ -150,6 +167,7 @@ def __init__( output_transform: Callable = lambda x: x, check_compute_fn: bool = False, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ) -> None: try: from sklearn.metrics import roc_curve # noqa: F401 @@ -161,6 +179,7 @@ def __init__( output_transform=output_transform, check_compute_fn=check_compute_fn, device=device, + skip_unrolling=skip_unrolling, ) def compute(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: # type: ignore[override] diff --git a/ignite/metrics/root_mean_squared_error.py b/ignite/metrics/root_mean_squared_error.py index ab2218d2372c..22605123bde5 100644 --- a/ignite/metrics/root_mean_squared_error.py +++ b/ignite/metrics/root_mean_squared_error.py @@ -26,6 +26,9 @@ class RootMeanSquaredError(MeanSquaredError): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -57,6 +60,9 @@ class RootMeanSquaredError(MeanSquaredError): .. testoutput:: 1.956559480312316 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ def compute(self) -> Union[torch.Tensor, float]: diff --git a/ignite/metrics/running_average.py b/ignite/metrics/running_average.py index 9b3b4efb4f3f..a622558b5abd 100644 --- a/ignite/metrics/running_average.py +++ b/ignite/metrics/running_average.py @@ -27,6 +27,9 @@ class RunningAverage(Metric): None when ``src`` is an instance of :class:`~ignite.metrics.metric.Metric`, as the running average will use the ``src``'s device. Otherwise, defaults to CPU. Only applicable when the computed value from the metric is a tensor. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: @@ -84,6 +87,9 @@ def log_running_avg_metrics(): 0.039208... 0.038423... 0.057655... + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ required_output_keys = None @@ -96,6 +102,7 @@ def __init__( output_transform: Optional[Callable] = None, epoch_bound: Optional[bool] = None, device: Optional[Union[str, torch.device]] = None, + skip_unrolling: bool = False, ): if not (isinstance(src, Metric) or src is None): raise TypeError("Argument src should be a Metric or None.") @@ -131,7 +138,9 @@ def output_transform(x: Any) -> Any: ) self.epoch_bound = epoch_bound self.alpha = alpha - super(RunningAverage, self).__init__(output_transform=output_transform, device=device) + super(RunningAverage, self).__init__( + output_transform=output_transform, device=device, skip_unrolling=skip_unrolling + ) @reinit__is_reduced def reset(self) -> None: diff --git a/ignite/metrics/ssim.py b/ignite/metrics/ssim.py index 6824c0b3f374..a662fb6f5413 100644 --- a/ignite/metrics/ssim.py +++ b/ignite/metrics/ssim.py @@ -33,6 +33,9 @@ class SSIM(Metric): device: specifies which device updates are accumulated on. 
Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -62,6 +65,9 @@ class SSIM(Metric): 0.9218971... .. versionadded:: 0.4.2 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. """ _state_dict_all_req_keys = ("_sum_of_ssim", "_num_examples", "_kernel") @@ -76,6 +82,7 @@ def __init__( gaussian: bool = True, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ): if isinstance(kernel_size, int): self.kernel_size: Sequence[int] = [kernel_size, kernel_size] @@ -97,7 +104,7 @@ def __init__( if any(y <= 0 for y in self.sigma): raise ValueError(f"Expected sigma to have positive number. Got {sigma}.") - super(SSIM, self).__init__(output_transform=output_transform, device=device) + super(SSIM, self).__init__(output_transform=output_transform, device=device, skip_unrolling=skip_unrolling) self.gaussian = gaussian self.data_range = data_range self.c1 = (k1 * data_range) ** 2 diff --git a/ignite/metrics/top_k_categorical_accuracy.py b/ignite/metrics/top_k_categorical_accuracy.py index 87da4c868731..611013c7905d 100644 --- a/ignite/metrics/top_k_categorical_accuracy.py +++ b/ignite/metrics/top_k_categorical_accuracy.py @@ -24,6 +24,9 @@ class TopKCategoricalAccuracy(Metric): device: specifies which device updates are accumulated on. Setting the metric's device to be the same as your ``update`` arguments ensures the ``update`` method is non-blocking. By default, CPU. + skip_unrolling: specifies whether output should be unrolled before being fed to update method. Should be + true for multi-output model, for example, if ``y_pred`` contains multi-ouput as ``(y_pred_a, y_pred_b)`` + Alternatively, ``output_transform`` can be used to handle this. Examples: To use with ``Engine`` and ``process_function``, simply attach the metric instance to the engine. @@ -71,6 +74,9 @@ def one_hot_to_binary_output_transform(output): .. testoutput:: 0.75 + + .. versionchanged:: 0.5.1 + ``skip_unrolling`` argument is added. 
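Since ``skip_unrolling`` recurs in every metric touched by this patch, a compact stand-in for the guard added to ``Metric.iteration_completed`` may help show what the flag actually changes; this is a simplified illustration (``feed_metric`` and ``PrintMetric`` are not ignite APIs), not the real engine plumbing:

.. code-block:: python

    from collections.abc import Sequence

    import torch

    def feed_metric(metric, output, skip_unrolling: bool) -> None:
        # A pair of equal-length lists is normally unrolled into one update() per item;
        # with skip_unrolling=True the output is handed to update() untouched.
        is_pair_of_lists = (
            isinstance(output, Sequence)
            and len(output) == 2
            and isinstance(output[0], list)
            and isinstance(output[1], list)
            and len(output[0]) == len(output[1])
        )
        if not skip_unrolling and is_pair_of_lists:
            for y_pred, y in zip(output[0], output[1]):
                metric.update((y_pred, y))
        else:
            metric.update(output)

    class PrintMetric:
        def update(self, output):
            print("update() received a pair of:", type(output[0]).__name__)

    out = ([torch.rand(2, 3), torch.rand(2, 3)], [torch.rand(2, 3), torch.rand(2, 3)])
    feed_metric(PrintMetric(), out, skip_unrolling=False)  # two update() calls, one per (y_pred, y) pair
    feed_metric(PrintMetric(), out, skip_unrolling=True)   # one update() call with the raw tuple of lists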
""" _state_dict_all_req_keys = ("_num_correct", "_num_examples") @@ -80,8 +86,9 @@ def __init__( k: int = 5, output_transform: Callable = lambda x: x, device: Union[str, torch.device] = torch.device("cpu"), + skip_unrolling: bool = False, ) -> None: - super(TopKCategoricalAccuracy, self).__init__(output_transform, device=device) + super(TopKCategoricalAccuracy, self).__init__(output_transform, device=device, skip_unrolling=skip_unrolling) self._k = k @reinit__is_reduced diff --git a/ignite/utils.py b/ignite/utils.py index 078e16663f68..1345e2bb0d86 100644 --- a/ignite/utils.py +++ b/ignite/utils.py @@ -2,6 +2,7 @@ import functools import hashlib import logging +import numbers import random import shutil import warnings @@ -14,6 +15,7 @@ "convert_tensor", "apply_to_tensor", "apply_to_type", + "_to_str_list", "to_onehot", "setup_logger", "manual_seed", @@ -90,6 +92,82 @@ def _tree_map( return func(x, key=key) +def _to_str_list(data: Any) -> List[str]: + """ + Recursively flattens and formats complex data structures, including keys for + dictionaries, into a list of human-readable strings. + + This function processes nested dictionaries, lists, tuples, numbers, and + PyTorch tensors, formatting numbers to four decimal places and handling + tensors with special formatting rules. It's particularly useful for logging, + debugging, or any scenario where a human-readable representation of complex, + nested data structures is required. + + The function handles the following types: + + - Numbers: Formatted to four decimal places. + - PyTorch tensors: + - Scalars are formatted to four decimal places. + - 1D tensors with more than 10 elements show the first 10 elements + followed by an ellipsis. + - 1D tensors with 10 or fewer elements are fully listed. + - Multi-dimensional tensors display their shape. + - Dictionaries: Each key-value pair is included in the output with the key + as a prefix. + - Lists and tuples: Flattened and included in the output. Empty lists/tuples are represented + by an empty string. + - None values: Represented by an empty string. + + Args: + data: The input data to be flattened and formatted. It can be a nested + combination of dictionaries, lists, tuples, numbers, and PyTorch + tensors. + + Returns: + A list of formatted strings, each representing a part of the input data + structure. 
+ """ + formatted_items: List[str] = [] + + def format_item(item: Any, prefix: str = "") -> Optional[str]: + if isinstance(item, numbers.Number): + return f"{prefix}{item:.4f}" + elif torch.is_tensor(item): + if item.dim() == 0: + return f"{prefix}{item.item():.4f}" # Format scalar tensor without brackets + elif item.dim() == 1 and item.size(0) > 10: + return f"{prefix}[" + ", ".join(f"{x.item():.4f}" for x in item[:10]) + ", ...]" + elif item.dim() == 1: + return f"{prefix}[" + ", ".join(f"{x.item():.4f}" for x in item) + "]" + else: + return f"{prefix}Shape{list(item.shape)}" + elif isinstance(item, dict): + for key, value in item.items(): + formatted_value = format_item(value, f"{key}: ") + if formatted_value is not None: + formatted_items.append(formatted_value) + elif isinstance(item, (list, tuple)): + if not item: + if prefix: + formatted_items.append(f"{prefix}") + else: + values = [format_item(x) for x in item] + values_str = [v for v in values if v is not None] + if values_str: + formatted_items.append(f"{prefix}" + ", ".join(values_str)) + elif item is None: + if prefix: + formatted_items.append(f"{prefix}") + return None + + # Directly handle single numeric values + if isinstance(data, numbers.Number): + return [f"{data:.4f}"] + + format_item(data) + return formatted_items + + class _CollectionItem: types_as_collection_item: Tuple = (int, float, torch.Tensor) @@ -163,6 +241,7 @@ def setup_logger( filepath: Optional[str] = None, distributed_rank: Optional[int] = None, reset: bool = False, + encoding: Optional[str] = "utf-8", ) -> logging.Logger: """Setups logger: name, level, format etc. @@ -175,6 +254,7 @@ def setup_logger( distributed_rank: Optional, rank in distributed configuration to avoid logger setup for workers. If None, distributed_rank is initialized to the rank of process. reset: if True, reset an existing logger rather than keep format, handlers, and level. + encoding: open the file with the encoding. By default, 'utf-8'. Returns: logging.Logger @@ -228,6 +308,9 @@ def setup_logger( .. versionchanged:: 0.4.5 Added ``reset`` parameter. + + .. versionchanged:: 0.5.1 + Argument ``encoding`` added to correctly handle special characters in the file, default "utf-8". 
""" # check if the logger already exists existing = name is None or name in logging.root.manager.loggerDict @@ -265,7 +348,7 @@ def setup_logger( logger.addHandler(ch) if filepath is not None: - fh = logging.FileHandler(filepath) + fh = logging.FileHandler(filepath, encoding=encoding) fh.setLevel(level) fh.setFormatter(formatter) logger.addHandler(fh) diff --git a/requirements-dev.txt b/requirements-dev.txt index b387dd03a652..d475e556cdff 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,9 @@ numpy pytest pytest-cov pytest-xdist +pytest-timeout dill +filelock setuptools # Test contrib dependencies scipy @@ -25,7 +27,7 @@ scikit-image py-rouge # temporary fix for python=3.12 and v3.8.1 # nltk -git+https://github.com/nltk/nltk +git+https://github.com/nltk/nltk@aba99c8 # Examples dependencies pandas gymnasium diff --git a/tests/common_test_functionality.sh b/tests/common_test_functionality.sh new file mode 100644 index 000000000000..6e60947f927b --- /dev/null +++ b/tests/common_test_functionality.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +# Will catch exit code 5 when tests are deselected from previous passing run +# (relevent for --last-failed-no-failures none) +last_failed_no_failures_code=5 + +# functions shared across test files +run_tests() { + # Set defaults + local core_args="-vvv tests/ignite" + local cache_dir=".unknown-cache" + local skip_distrib_tests=1 + local match_tests_expression="" + local trap_deselected_exit_code=1 + local use_last_failed=0 + local use_coverage=0 + local world_size=0 + # Always clean up pytest.ini + trap 'rm -f pytest.ini' RETURN + # Parse arguments + while [[ $# -gt 0 ]] + do + key="$1" + case $key in + --core_args) + core_args="$2" + shift + shift + ;; + --cache_dir) + cache_dir="$2" + shift + shift + ;; + --skip_distrib_tests) + skip_distrib_tests="$2" + shift + shift + ;; + --match_tests_expression) + match_tests_expression="$2" + shift + shift + ;; + --trap_deselected_exit_code) + trap_deselected_exit_code="$2" + shift + shift + ;; + --use_last_failed) + use_last_failed="$2" + shift + shift + ;; + --use_coverage) + use_coverage="$2" + shift + shift + ;; + --world_size) + world_size="$2" + shift + shift + ;; + *) + echo "Error: Unknown argument $key" + exit 1 + shift + ;; + esac + done + + if ! command -v pytest &> /dev/null + then + echo "pytest could not be found" + echo "The path is: ${PATH}" + exit 1 + fi + + + if [ "${skip_distrib_tests}" -eq "1" ]; then + # can be overwritten by core_args + skip_distrib_opt="-m 'not distributed and not tpu and not multinode_distributed'" + else + skip_distrib_opt="" + fi + + + echo [pytest] > pytest.ini ; echo "cache_dir=${cache_dir}" >> pytest.ini + + # Assemble options for the pytest command + pytest_args="${skip_distrib_opt} ${core_args} --treat-unrun-as-failed -k '${match_tests_expression}'" + if [ "${use_last_failed:-0}" -eq "1" ] && [ -d "${cache_dir}" ]; then + pytest_args="--last-failed --last-failed-no-failures none ${pytest_args}" + fi + if [ "${use_coverage}" -eq "1" ]; then + pytest_args="--cov ignite --cov-append --cov-report term-missing --cov-report xml ${pytest_args}" + fi + if [ ! 
"${world_size}" -eq "0" ]; then + export WORLD_SIZE="${world_size}" + pytest_args="--dist=each --tx ${WORLD_SIZE}*popen//python=python ${pytest_args}" + fi + + # Run the command + if [ "$trap_deselected_exit_code" -eq "1" ]; then + CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" || { exit_code=$?; if [ "$exit_code" -eq ${last_failed_no_failures_code} ]; then echo "All tests deselected"; else exit $exit_code; fi; } + else + CUDA_VISIBLE_DEVICES="" eval "pytest ${pytest_args}" + fi +} diff --git a/tests/ignite/__init__.py b/tests/ignite/__init__.py index d553c222e58b..8f84e2e74b99 100644 --- a/tests/ignite/__init__.py +++ b/tests/ignite/__init__.py @@ -3,3 +3,14 @@ def cpu_and_maybe_cuda(): return ("cpu",) + (("cuda",) if torch.cuda.is_available() else ()) + + +def is_mps_available_and_functional(): + if not torch.backends.mps.is_available(): + return False + try: + # Try to allocate a small tensor on the MPS device + torch.tensor([1.0], device="mps") + return True + except RuntimeError: + return False diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py index caf92e6e7ad2..d5546a75bae5 100644 --- a/tests/ignite/conftest.py +++ b/tests/ignite/conftest.py @@ -1,8 +1,10 @@ import functools import os import shutil +import signal import sys import tempfile +import threading import time from pathlib import Path @@ -13,6 +15,59 @@ import ignite.distributed as idist +def pytest_addoption(parser): + """ + Add custom command line options for the ignite test suite here. + See: + This function is a pytest hook (due to its name) and is *"automatically" + executed at the start of a test run + https://docs.pytest.org/en/latest/reference/reference.html#initialization-hooks + + * "automatically" is true provided this conftest.py file is the + root directory. See: + https://docs.pytest.org/en/latest/reference/customize.html#initialization-determining-rootdir-and-configfile + """ + parser.addoption( + "--treat-unrun-as-failed", + action="store_true", + help=""" + If a session is interrupted, treat the unrun tests as failed so that a + rerun with --last-failed runs any tests that have not passed or been + skipped. Note that if all tests in a module have been skipped, the + module will be skipped for all subsequent runs. + """, + ) + + +def pytest_configure(config): + """ + This function is a pytest hook (due to its name) and is run after command + line parsing is complete in order to configure the test session. + """ + config.addinivalue_line("markers", "distributed: run distributed") + config.addinivalue_line("markers", "multinode_distributed: distributed") + config.addinivalue_line("markers", "tpu: run on tpu") + if config.option.treat_unrun_as_failed: + unrun_tracker = UnrunTracker() + config.pluginmanager.register(unrun_tracker, "unrun_tracker_plugin") + + +@pytest.fixture(scope="session", autouse=True) +def term_handler(): + """ + This allows the pytest session to be terminated upon retries on CI. It may + be worth using this fixture solely in that context. 
For a discussion on + whether sigterm should be ignored and why pytest usually ignores it see: + https://github.com/pytest-dev/pytest/issues/5243 + """ + if threading.current_thread() is threading.main_thread() and hasattr(signal, "SIGTERM"): + orig = signal.signal(signal.SIGTERM, signal.getsignal(signal.SIGINT)) + yield + signal.signal(signal.SIGTERM, orig) + else: + yield # Just pass through if SIGTERM isn't supported or we are not in the main thread + + @pytest.fixture( params=[ "cpu", @@ -195,7 +250,7 @@ def distributed_context_single_node_gloo(local_rank, world_size): "world_size": world_size, "rank": local_rank, "init_method": init_method, - "timeout": timedelta(seconds=60), + "timeout": timedelta(seconds=30), } yield _create_dist_context(dist_info, local_rank) _destroy_dist_context() @@ -423,7 +478,7 @@ def distributed(request, local_rank, world_size): dist_info["backend"] = "gloo" from datetime import timedelta - dist_info["timeout"] = timedelta(seconds=60) + dist_info["timeout"] = timedelta(seconds=30) yield _create_dist_context(dist_info, local_rank) _destroy_dist_context() if temp_file: @@ -441,8 +496,52 @@ def distributed(request, local_rank, world_size): raise RuntimeError(f"Invalid parameter value for `distributed` fixture, given {request.param}") +class UnrunTracker: + """ + Keeps track of unrun tests to improve the user experience when using the + "--last-failed" pytest option and a test session is interrupted. This is + particularly useful on CI when rerunning "failing" tests where the failure + was due to a deadlock and many tests weren't actually run so they didn't + actually fail. This is a pytest plugin that implements some standard hooks + to modify the test session. Its functionality can be added to a test session + by registering it with the pytest plugin manager. + """ + + def __init__(self): + self.unrun_tests = [] + + def pytest_collection_finish(self, session): + # At the end of the collection, add all items to the unrun_tests list + self.unrun_tests.extend(session.items) + + def pytest_runtest_teardown(self, item): + if item in self.unrun_tests: + self.unrun_tests.remove(item) + + def record_unrun_as_failed(self, session, exitstatus): + # Get current lastfailed entries (if any) + lastfailed = session.config.cache.get("cache/lastfailed", {}) + + # Add unrun tests to lastfailed + for test in self.unrun_tests: + lastfailed[test.nodeid] = True + + # Update the cache with the new lastfailed + session.config.cache.set("cache/lastfailed", lastfailed) + + @pytest.hookimpl def pytest_pyfunc_call(pyfuncitem: pytest.Function) -> None: + if any(fx in pyfuncitem.fixturenames for fx in ["distributed", "multinode_distributed"]): + # Run distributed tests on a single worker to avoid RACE conditions + # This requires that the --dist=loadgroup option be passed to pytest. + pyfuncitem.add_marker(pytest.mark.xdist_group("distributed")) + # Add timeouts to prevent hanging + if "tpu" in pyfuncitem.fixturenames: + pyfuncitem.add_marker(pytest.mark.timeout(60)) + else: + pyfuncitem.add_marker(pytest.mark.timeout(45)) + if pyfuncitem.stash.get(is_horovod_stash_key, False): def testfunc_wrapper(test_func, **kwargs): @@ -492,3 +591,16 @@ def xla_worker(index, fn): assert ex_.code == 0, "Didn't successfully exit in XLA test" pyfuncitem.obj = functools.partial(testfunc_wrapper, pyfuncitem.obj) + + +def pytest_sessionfinish(session, exitstatus): + """ + Any functionality that should be run at the end of the session should be + added here. 
+ This is a pytest hook (due to its name) and is called after the whole test + run finished, right before returning the exit status to the system. + """ + # If requested by the user, track all unrun tests and add them to the lastfailed cache + if session.config.option.treat_unrun_as_failed: + unrun_tracker = session.config.pluginmanager.get_plugin("unrun_tracker_plugin") + unrun_tracker.record_unrun_as_failed(session, exitstatus) diff --git a/tests/ignite/distributed/comp_models/test_base.py b/tests/ignite/distributed/comp_models/test_base.py index c8041c6dc337..4c151d8d0b08 100644 --- a/tests/ignite/distributed/comp_models/test_base.py +++ b/tests/ignite/distributed/comp_models/test_base.py @@ -1,7 +1,7 @@ import pytest import torch -from ignite.distributed.comp_models.base import _SerialModel, _torch_version_le_112, ComputationModel +from ignite.distributed.comp_models.base import _SerialModel, _torch_version_gt_112, ComputationModel def test_serial_model(): @@ -16,7 +16,7 @@ def test_serial_model(): assert model.get_node_rank() == 0 if torch.cuda.is_available(): assert model.device().type == "cuda" - elif _torch_version_le_112 and torch.backends.mps.is_available(): + elif _torch_version_gt_112 and torch.backends.mps.is_available(): assert model.device().type == "mps" else: assert model.device().type == "cpu" diff --git a/tests/ignite/distributed/test_auto.py b/tests/ignite/distributed/test_auto.py index 761e328944c4..2ecc3404c907 100644 --- a/tests/ignite/distributed/test_auto.py +++ b/tests/ignite/distributed/test_auto.py @@ -12,6 +12,8 @@ import ignite.distributed as idist from ignite.distributed.auto import auto_dataloader, auto_model, auto_optim, DistributedProxySampler +from ignite.distributed.comp_models.base import _torch_version_gt_112 +from tests.ignite import is_mps_available_and_functional class DummyDS(Dataset): @@ -179,6 +181,10 @@ def _test_auto_model_optimizer(ws, device): assert optimizer.backward_passes_per_step == backward_passes_per_step +@pytest.mark.skipif( + (not _torch_version_gt_112) or (torch.backends.mps.is_available() and not is_mps_available_and_functional()), + reason="Skip if MPS not functional", +) def test_auto_methods_no_dist(): _test_auto_dataloader(1, 1, batch_size=1) _test_auto_dataloader(1, 1, batch_size=10, num_workers=2) diff --git a/tests/ignite/distributed/test_launcher.py b/tests/ignite/distributed/test_launcher.py index b12e2acf1c26..eac7ffe2e06c 100644 --- a/tests/ignite/distributed/test_launcher.py +++ b/tests/ignite/distributed/test_launcher.py @@ -8,7 +8,9 @@ from packaging.version import Version import ignite.distributed as idist +from ignite.distributed.comp_models.base import _torch_version_gt_112 from ignite.distributed.utils import has_hvd_support, has_native_dist_support, has_xla_support +from tests.ignite import is_mps_available_and_functional def test_parallel_wrong_inputs(): @@ -54,6 +56,10 @@ def execute(cmd, env=None): return str(process.stdout.read()) + str(process.stderr.read()) +@pytest.mark.skipif( + (not _torch_version_gt_112) or (torch.backends.mps.is_available() and not is_mps_available_and_functional()), + reason="Skip if MPS not functional", +) def test_check_idist_parallel_no_dist(exec_filepath): cmd = [sys.executable, "-u", exec_filepath] out = execute(cmd) diff --git a/tests/ignite/distributed/utils/test_serial.py b/tests/ignite/distributed/utils/test_serial.py index fdbf26e83608..df2d6742b54a 100644 --- a/tests/ignite/distributed/utils/test_serial.py +++ b/tests/ignite/distributed/utils/test_serial.py @@ -1,7 +1,7 @@ 
import torch import ignite.distributed as idist -from ignite.distributed.comp_models.base import _torch_version_le_112 +from ignite.distributed.comp_models.base import _torch_version_gt_112 from tests.ignite.distributed.utils import ( _sanity_check, _test_distrib__get_max_length, @@ -18,7 +18,7 @@ def test_no_distrib(capsys): assert idist.backend() is None if torch.cuda.is_available(): assert idist.device().type == "cuda" - elif _torch_version_le_112 and torch.backends.mps.is_available(): + elif _torch_version_gt_112 and torch.backends.mps.is_available(): assert idist.device().type == "mps" else: assert idist.device().type == "cpu" @@ -41,7 +41,7 @@ def test_no_distrib(capsys): assert "ignite.distributed.utils INFO: backend: None" in out[-1] if torch.cuda.is_available(): assert "ignite.distributed.utils INFO: device: cuda" in out[-1] - elif _torch_version_le_112 and torch.backends.mps.is_available(): + elif _torch_version_gt_112 and torch.backends.mps.is_available(): assert "ignite.distributed.utils INFO: device: mps" in out[-1] else: assert "ignite.distributed.utils INFO: device: cpu" in out[-1] diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py index 31ca43f4bbf7..54938167601a 100644 --- a/tests/ignite/engine/test_create_supervised.py +++ b/tests/ignite/engine/test_create_supervised.py @@ -12,7 +12,7 @@ from torch.optim import SGD import ignite.distributed as idist -from ignite.distributed.comp_models.base import _torch_version_le_112 +from ignite.distributed.comp_models.base import _torch_version_gt_112 from ignite.engine import ( _check_arg, create_supervised_evaluator, @@ -25,6 +25,8 @@ ) from ignite.metrics import MeanSquaredError +from tests.ignite import is_mps_available_and_functional + class DummyModel(torch.nn.Module): def __init__(self, output_as_list=False): @@ -314,7 +316,8 @@ def _test_create_supervised_evaluator( # This is broken in 1.6.0 but will be probably fixed with 1.7.0 err_msg_1 = "Expected all tensors to be on the same device" err_msg_2 = "Placeholder storage has not been allocated on MPS device" - with pytest.raises(RuntimeError, match=f"({err_msg_1}|{err_msg_2})"): + err_msg_3 = "Tensor for argument weight is on cpu but expected on mps" + with pytest.raises(RuntimeError, match=f"({err_msg_1}|{err_msg_2}|{err_msg_3})"): evaluator.run(data) @@ -485,7 +488,7 @@ def test_create_supervised_trainer_on_cuda(): _test_create_mocked_supervised_trainer(model_device=model_device, trainer_device=trainer_device) -@pytest.mark.skipif(not (_torch_version_le_112 and torch.backends.mps.is_available()), reason="Skip if no MPS") +@pytest.mark.skipif(not (_torch_version_gt_112 and is_mps_available_and_functional()), reason="Skip if no MPS") def test_create_supervised_trainer_on_mps(): model_device = trainer_device = "mps" _test_create_supervised_trainer_wrong_accumulation(model_device=model_device, trainer_device=trainer_device) @@ -666,14 +669,14 @@ def test_create_supervised_evaluator_on_cuda_with_model_on_cpu(): _test_mocked_supervised_evaluator(evaluator_device="cuda") -@pytest.mark.skipif(not (_torch_version_le_112 and torch.backends.mps.is_available()), reason="Skip if no MPS") +@pytest.mark.skipif(not (_torch_version_gt_112 and is_mps_available_and_functional()), reason="Skip if no MPS") def test_create_supervised_evaluator_on_mps(): model_device = evaluator_device = "mps" _test_create_supervised_evaluator(model_device=model_device, evaluator_device=evaluator_device) 
_test_mocked_supervised_evaluator(model_device=model_device, evaluator_device=evaluator_device) -@pytest.mark.skipif(not (_torch_version_le_112 and torch.backends.mps.is_available()), reason="Skip if no MPS") +@pytest.mark.skipif(not (_torch_version_gt_112 and is_mps_available_and_functional()), reason="Skip if no MPS") def test_create_supervised_evaluator_on_mps_with_model_on_cpu(): _test_create_supervised_evaluator(evaluator_device="mps") _test_mocked_supervised_evaluator(evaluator_device="mps") diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index 130212426504..d1cc017bf916 100644 --- a/tests/ignite/engine/test_engine.py +++ b/tests/ignite/engine/test_engine.py @@ -1026,47 +1026,6 @@ def switch_dataloader(): trainer.run(data1, max_epochs=10) - def test_run_with_max_iters(self): - max_iters = 8 - engine = Engine(lambda e, b: 1) - engine.run([0] * 20, max_iters=max_iters) - assert engine.state.iteration == max_iters - assert engine.state.max_iters == max_iters - - def test_run_with_max_iters_greater_than_epoch_length(self): - max_iters = 73 - engine = Engine(lambda e, b: 1) - engine.run([0] * 20, max_iters=max_iters) - assert engine.state.iteration == max_iters - - def test_run_with_invalid_max_iters_and_max_epoch(self): - max_iters = 12 - max_epochs = 2 - engine = Engine(lambda e, b: 1) - with pytest.raises( - ValueError, - match=r"Arguments max_iters and max_epochs are mutually exclusive." - "Please provide only max_epochs or max_iters.", - ): - engine.run([0] * 20, max_iters=max_iters, max_epochs=max_epochs) - - def test_epoch_events_fired_max_iters(self): - max_iters = 32 - engine = Engine(lambda e, b: 1) - - @engine.on(Events.EPOCH_COMPLETED) - def fired_event(engine): - assert engine.state.iteration % engine.state.epoch_length == 0 - - engine.run([0] * 10, max_iters=max_iters) - - def test_is_done_with_max_iters(self): - state = State(iteration=100, epoch=1, max_epochs=3, epoch_length=100, max_iters=250) - assert not Engine._is_done(state) - - state = State(iteration=250, epoch=1, max_epochs=3, epoch_length=100, max_iters=250) - assert Engine._is_done(state) - @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_batch_is_released_before_new_one_is_loaded_on_cuda(self): torch.cuda.empty_cache() diff --git a/tests/ignite/handlers/conftest.py b/tests/ignite/handlers/conftest.py index 9d7bb999463b..79ac0809698e 100644 --- a/tests/ignite/handlers/conftest.py +++ b/tests/ignite/handlers/conftest.py @@ -1,58 +1,41 @@ -import random +import subprocess +import time from pathlib import Path from unittest.mock import Mock import pytest import torch +from visdom import Visdom +from visdom.server.build import download_scripts -vd_hostname = None -vd_port = None -vd_server_process = None - -@pytest.fixture() +@pytest.fixture(scope="session") def visdom_server(): # Start Visdom server once and stop it with visdom_server_stop - global vd_hostname, vd_port, vd_server_process - - if vd_server_process is None: - import subprocess - import time - - from visdom import Visdom - from visdom.server.build import download_scripts - + vd_hostname = "localhost" + if not (Path.home() / ".visdom").exists(): (Path.home() / ".visdom").mkdir(exist_ok=True) download_scripts() + vis = None - vd_hostname = "localhost" - vd_port = random.randint(8089, 8887) - + vd_port = 29777 + vd_server_process = subprocess.Popen( + ["python", "-m", "visdom.server", "--hostname", vd_hostname, "-port", str(vd_port)] + ) + time.sleep(2) + for ii in range(5): try: + 
time.sleep(1) vis = Visdom(server=vd_hostname, port=vd_port, raise_exceptions=True) + break except ConnectionError: - pass - - vd_server_process = subprocess.Popen( - ["python", "-m", "visdom.server", "--hostname", vd_hostname, "-port", str(vd_port)] - ) - time.sleep(5) - - vis = Visdom(server=vd_hostname, port=vd_port) - assert vis.check_connection() - vis.close() + continue + assert vis and vis.check_connection() yield (vd_hostname, vd_port) - - -@pytest.fixture() -def visdom_server_stop(): - yield None - - import time - - vd_server_process.kill() - time.sleep(2) + # Trying to clean up slows things down and sometimes causes hangs. + # vis.close() + # vd_server_process.kill() @pytest.fixture diff --git a/tests/ignite/handlers/test_fbresearch_logger.py b/tests/ignite/handlers/test_fbresearch_logger.py index b85bdcf2794e..728c97870e09 100644 --- a/tests/ignite/handlers/test_fbresearch_logger.py +++ b/tests/ignite/handlers/test_fbresearch_logger.py @@ -3,9 +3,13 @@ from unittest.mock import MagicMock import pytest +import torch +import torch.nn as nn +import torch.optim as optim -from ignite.engine import Engine, Events -from ignite.handlers.fbresearch_logger import FBResearchLogger # Adjust the import path as necessary +from ignite.engine import create_supervised_trainer, Engine, Events +from ignite.handlers.fbresearch_logger import FBResearchLogger +from ignite.utils import setup_logger @pytest.fixture @@ -56,3 +60,47 @@ def test_output_formatting(mock_engine, fb_research_logger, output, expected_pat actual_output = fb_research_logger.logger.info.call_args_list[0].args[0] assert re.search(expected_pattern, actual_output) + + +def test_logger_type_support(): + model = nn.Linear(10, 5) + opt = optim.SGD(model.parameters(), lr=0.001) + criterion = nn.CrossEntropyLoss() + + data = [(torch.rand(4, 10), torch.randint(0, 5, size=(4,))) for _ in range(100)] + + trainer = create_supervised_trainer(model, opt, criterion) + + logger = setup_logger("trainer", level=logging.INFO) + logger = FBResearchLogger(logger=logger, show_output=True) + logger.attach(trainer, name="Train", every=20, optimizer=opt) + + trainer.run(data, max_epochs=4) + trainer.state.output = {"loss": 4.2} + trainer.fire_event(Events.ITERATION_COMPLETED) + trainer.state.output = "4.2" + trainer.fire_event(Events.ITERATION_COMPLETED) + trainer.state.output = [4.2, 4.2] + trainer.fire_event(Events.ITERATION_COMPLETED) + trainer.state.output = (4.2, 4.2) + trainer.fire_event(Events.ITERATION_COMPLETED) + + +def test_fbrlogger_with_output_transform(mock_logger): + trainer = Engine(lambda e, b: 42) + fbr = FBResearchLogger(logger=mock_logger, show_output=True) + fbr.attach(trainer, "Training", output_transform=lambda x: {"loss": x}) + trainer.run(data=[10], epoch_length=1, max_epochs=1) + assert "loss: 42.0000" in fbr.logger.info.call_args_list[-2].args[0] + + +def test_fbrlogger_with_state_attrs(mock_logger): + trainer = Engine(lambda e, b: 42) + fbr = FBResearchLogger(logger=mock_logger, show_output=True) + fbr.attach(trainer, "Training", state_attributes=["alpha", "beta", "gamma"]) + trainer.state.alpha = 3.899 + trainer.state.beta = torch.tensor(12.21) + trainer.state.gamma = torch.tensor([21.0, 6.0]) + trainer.run(data=[10], epoch_length=1, max_epochs=1) + attrs = "alpha: 3.8990 beta: 12.2100 gamma: [21.0000, 6.0000]" + assert attrs in fbr.logger.info.call_args_list[-2].args[0] diff --git a/tests/ignite/handlers/test_lr_finder.py b/tests/ignite/handlers/test_lr_finder.py index e12d951dfbf2..b64b3ab8527b 100644 --- 
a/tests/ignite/handlers/test_lr_finder.py +++ b/tests/ignite/handlers/test_lr_finder.py @@ -3,6 +3,8 @@ from pathlib import Path from unittest.mock import MagicMock +import filelock + import matplotlib import pytest import torch @@ -144,16 +146,27 @@ def dataloader_plot(): @pytest.fixture -def mnist_dataloader(): +def mnist_dataloader(tmp_path_factory): from torch.utils.data import DataLoader from torchvision.datasets import MNIST from torchvision.transforms import Compose, Normalize, ToTensor data_transform = Compose([ToTensor(), Normalize((0.1307,), (0.3081,))]) - train_loader = DataLoader( - MNIST(download=True, root="/tmp", transform=data_transform, train=True), batch_size=256, shuffle=True - ) + root_tmp_dir = tmp_path_factory.getbasetemp().parent + while True: + try: + with filelock.FileLock(root_tmp_dir / "mnist_download.lock", timeout=0.2) as fn: + fn.acquire() + train_loader = DataLoader( + MNIST(download=True, root="/tmp", transform=data_transform, train=True), + batch_size=256, + shuffle=True, + ) + fn.release() + break + except filelock._error.Timeout: + pass yield train_loader @@ -344,7 +357,7 @@ def test_num_iter_is_not_enough(lr_finder, to_save, dummy_engine, dataloader): trainer_with_finder.run(dataloader) assert_output_sizes(lr_finder, dummy_engine) assert dummy_engine.state.iteration != len(dataloader) - assert dummy_engine.state.iteration == 150 + assert dummy_engine.state.iteration == 150 + 1 def test_detach_terminates(lr_finder, to_save, dummy_engine, dataloader): diff --git a/tests/ignite/handlers/test_tqdm_logger.py b/tests/ignite/handlers/test_tqdm_logger.py index 0f9a501ebf82..cae59ac15b4e 100644 --- a/tests/ignite/handlers/test_tqdm_logger.py +++ b/tests/ignite/handlers/test_tqdm_logger.py @@ -33,9 +33,9 @@ def update_fn(engine, batch): def test_pbar_errors(): with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires tqdm to be installed"): with patch.dict("sys.modules", {"tqdm.autonotebook": None}): - ProgressBar() + ProgressBar(ncols=80) - pbar = ProgressBar() + pbar = ProgressBar(ncols=80) with pytest.raises(ValueError, match=r"Logging event abc is not in allowed"): pbar.attach(Engine(lambda e, b: None), event_name=Namespace(name="abc")) @@ -45,7 +45,7 @@ def test_pbar(capsys): loader = [1, 2] engine = Engine(update_fn) - pbar = ProgressBar() + pbar = ProgressBar(ncols=80) pbar.attach(engine, ["a"]) engine.run(loader, max_epochs=n_epochs) @@ -55,9 +55,9 @@ def test_pbar(capsys): err = list(map(lambda x: x.strip(), err)) err = list(filter(None, err)) if get_tqdm_version() < Version("4.49.0"): - expected = "Epoch [2/2]: [1/2] 50%|█████ , a=1 [00:00<00:00]" + expected = "Epoch 8 -*- , a=1 [00:00<00:00]" else: - expected = "Epoch [2/2]: [1/2] 50%|█████ , a=1 [00:00 float: + y_pred_prob = softmax(np_y_pred, axis=1) + y_prob = softmax(np_y, axis=1) + # jensenshannon computes the sqrt of the JS divergence + js_mean = np.mean(np.square(jensenshannon(y_pred_prob, y_prob, axis=1))) + return js_mean + + +def test_zero_sample(): + js_div = JSDivergence() + with pytest.raises( + NotComputableError, match=r"JSDivergence must have at least one example before it can be computed" + ): + js_div.compute() + + +def test_shape_mismatch(): + js_div = JSDivergence() + y_pred = torch.tensor([[2.0, 3.0], [-2.0, 1.0]], dtype=torch.float) + y = torch.tensor([[-2.0, 1.0]], dtype=torch.float) + with pytest.raises(ValueError, match=r"y_pred and y must be in the same shape, got"): + js_div.update((y_pred, y)) + + +def test_invalid_shape(): + js_div = JSDivergence() + 
y_pred = torch.tensor([2.0, 3.0], dtype=torch.float) + y = torch.tensor([4.0, 5.0], dtype=torch.float) + with pytest.raises(ValueError, match=r"y_pred must be in the shape of \(B, C\) or \(B, C, ...\), got"): + js_div.update((y_pred, y)) + + +@pytest.fixture(params=list(range(4))) +def test_case(request): + return [ + (torch.randn((100, 10)), torch.rand((100, 10)), 1), + (torch.rand((100, 500)), torch.randn((100, 500)), 1), + # updated batches + (torch.normal(0.0, 5.0, size=(100, 10)), torch.rand((100, 10)), 16), + (torch.normal(5.0, 3.0, size=(100, 200)), torch.rand((100, 200)), 16), + # image segmentation + (torch.randn((100, 5, 32, 32)), torch.rand((100, 5, 32, 32)), 16), + (torch.rand((100, 5, 224, 224)), torch.randn((100, 5, 224, 224)), 16), + ][request.param] + + +@pytest.mark.parametrize("n_times", range(5)) +def test_compute(n_times, test_case: Tuple[Tensor, Tensor, int]): + y_pred, y, batch_size = test_case + + js_div = JSDivergence() + + js_div.reset() + if batch_size > 1: + n_iters = y.shape[0] // batch_size + 1 + for i in range(n_iters): + idx = i * batch_size + js_div.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) + else: + js_div.update((y_pred, y)) + + res = js_div.compute() + + np_y_pred = y_pred.numpy() + np_y = y.numpy() + + np_res = scipy_js_div(np_y_pred, np_y) + + assert isinstance(res, float) + assert pytest.approx(np_res, rel=1e-4) == res + + +def test_accumulator_detached(): + js_div = JSDivergence() + + y_pred = torch.tensor([[2.0, 3.0], [-2.0, 1.0]], dtype=torch.float) + y = torch.tensor([[-2.0, 1.0], [2.0, 3.0]], dtype=torch.float) + js_div.update((y_pred, y)) + + assert not js_div._sum_of_kl.requires_grad + + +@pytest.mark.usefixtures("distributed") +class TestDistributed: + def test_integration(self): + tol = 1e-4 + n_iters = 100 + batch_size = 10 + n_dims = 100 + + rank = idist.get_rank() + torch.manual_seed(12 + rank) + + device = idist.device() + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(device) + + for metric_device in metric_devices: + y_true = torch.randn((n_iters * batch_size, n_dims)).float().to(device) + y_preds = torch.normal(2.0, 3.0, size=(n_iters * batch_size, n_dims)).float().to(device) + + engine = Engine( + lambda e, i: ( + y_preds[i * batch_size : (i + 1) * batch_size], + y_true[i * batch_size : (i + 1) * batch_size], + ) + ) + + m = JSDivergence(device=metric_device) + m.attach(engine, "js_div") + + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) + + y_preds = idist.all_gather(y_preds) + y_true = idist.all_gather(y_true) + + assert "js_div" in engine.state.metrics + res = engine.state.metrics["js_div"] + + y_true_np = y_true.cpu().numpy() + y_preds_np = y_preds.cpu().numpy() + true_res = scipy_js_div(y_preds_np, y_true_np) + + assert pytest.approx(true_res, rel=tol) == res + + def test_accumulator_device(self): + device = idist.device() + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(device) + for metric_device in metric_devices: + js_div = JSDivergence(device=metric_device) + + for dev in (js_div._device, js_div._sum_of_kl.device): + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" + + y_pred = torch.tensor([[2.0, 3.0], [-2.0, 1.0]]).float() + y = torch.ones(2, 2).float() + js_div.update((y_pred, y)) + + for dev in (js_div._device, js_div._sum_of_kl.device): + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" diff --git 
a/tests/ignite/metrics/test_kl_divergence.py b/tests/ignite/metrics/test_kl_divergence.py new file mode 100644 index 000000000000..6c9512d42310 --- /dev/null +++ b/tests/ignite/metrics/test_kl_divergence.py @@ -0,0 +1,158 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +from scipy.special import softmax +from scipy.stats import entropy +from torch import Tensor + +import ignite.distributed as idist +from ignite.engine import Engine +from ignite.exceptions import NotComputableError +from ignite.metrics import KLDivergence + + +def scipy_kl_div(np_y_pred: np.ndarray, np_y: np.ndarray) -> float: + y_pred_prob = softmax(np_y_pred, axis=1) + y_prob = softmax(np_y, axis=1) + kl_mean = entropy(y_prob, y_pred_prob, axis=1).mean() + return kl_mean + + +def test_zero_sample(): + kl_div = KLDivergence() + with pytest.raises( + NotComputableError, match=r"KLDivergence must have at least one example before it can be computed" + ): + kl_div.compute() + + +def test_shape_mismatch(): + kl_div = KLDivergence() + y_pred = torch.tensor([[2.0, 3.0], [-2.0, 1.0]], dtype=torch.float) + y = torch.tensor([[-2.0, 1.0]], dtype=torch.float) + with pytest.raises(ValueError, match=r"y_pred and y must be in the same shape, got"): + kl_div.update((y_pred, y)) + + +def test_invalid_shape(): + kl_div = KLDivergence() + y_pred = torch.tensor([2.0, 3.0], dtype=torch.float) + y = torch.tensor([4.0, 5.0], dtype=torch.float) + with pytest.raises(ValueError, match=r"y_pred must be in the shape of \(B, C\) or \(B, C, ...\), got"): + kl_div.update((y_pred, y)) + + +@pytest.fixture(params=list(range(4))) +def test_case(request): + return [ + (torch.randn((100, 10)), torch.rand((100, 10)), 1), + (torch.rand((100, 500)), torch.randn((100, 500)), 1), + # updated batches + (torch.normal(0.0, 5.0, size=(100, 10)), torch.rand((100, 10)), 16), + (torch.normal(5.0, 3.0, size=(100, 200)), torch.rand((100, 200)), 16), + # image segmentation + (torch.randn((100, 5, 32, 32)), torch.rand((100, 5, 32, 32)), 16), + (torch.rand((100, 5, 224, 224)), torch.randn((100, 5, 224, 224)), 16), + ][request.param] + + +@pytest.mark.parametrize("n_times", range(5)) +def test_compute(n_times, test_case: Tuple[Tensor, Tensor, int]): + y_pred, y, batch_size = test_case + + kl_div = KLDivergence() + + kl_div.reset() + if batch_size > 1: + n_iters = y.shape[0] // batch_size + 1 + for i in range(n_iters): + idx = i * batch_size + kl_div.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) + else: + kl_div.update((y_pred, y)) + + res = kl_div.compute() + + np_y_pred = y_pred.numpy() + np_y = y.numpy() + + np_res = scipy_kl_div(np_y_pred, np_y) + + assert isinstance(res, float) + assert pytest.approx(np_res, rel=1e-4) == res + + +def test_accumulator_detached(): + kl_div = KLDivergence() + + y_pred = torch.tensor([[2.0, 3.0], [-2.0, 1.0]], dtype=torch.float) + y = torch.tensor([[-2.0, 1.0], [2.0, 3.0]], dtype=torch.float) + kl_div.update((y_pred, y)) + + assert not kl_div._sum_of_kl.requires_grad + + +@pytest.mark.usefixtures("distributed") +class TestDistributed: + def test_integration(self): + tol = 1e-4 + n_iters = 100 + batch_size = 10 + n_dims = 100 + + rank = idist.get_rank() + torch.manual_seed(12 + rank) + + device = idist.device() + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(device) + + for metric_device in metric_devices: + y_true = torch.randn((n_iters * batch_size, n_dims)).float().to(device) + y_preds = torch.normal(2.0, 3.0, size=(n_iters * batch_size, 
n_dims)).float().to(device) + + engine = Engine( + lambda e, i: ( + y_preds[i * batch_size : (i + 1) * batch_size], + y_true[i * batch_size : (i + 1) * batch_size], + ) + ) + + m = KLDivergence(device=metric_device) + m.attach(engine, "kl_div") + + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) + + y_preds = idist.all_gather(y_preds) + y_true = idist.all_gather(y_true) + + assert "kl_div" in engine.state.metrics + res = engine.state.metrics["kl_div"] + + y_true_np = y_true.cpu().numpy() + y_preds_np = y_preds.cpu().numpy() + true_res = scipy_kl_div(y_preds_np, y_true_np) + + assert pytest.approx(true_res, rel=tol) == res + + def test_accumulator_device(self): + device = idist.device() + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(device) + for metric_device in metric_devices: + kl_div = KLDivergence(device=metric_device) + + for dev in (kl_div._device, kl_div._sum_of_kl.device): + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" + + y_pred = torch.tensor([[2.0, 3.0], [-2.0, 1.0]]).float() + y = torch.ones(2, 2).float() + kl_div.update((y_pred, y)) + + for dev in (kl_div._device, kl_div._sum_of_kl.device): + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py index 19cc68cd45cc..0e945bec58cf 100644 --- a/tests/ignite/metrics/test_loss.py +++ b/tests/ignite/metrics/test_loss.py @@ -1,11 +1,12 @@ import os +from typing import Tuple from unittest.mock import MagicMock import pytest import torch from numpy.testing import assert_almost_equal from torch import nn -from torch.nn.functional import nll_loss +from torch.nn.functional import mse_loss, nll_loss import ignite.distributed as idist from ignite.engine import State @@ -314,3 +315,50 @@ def compute(self): (torch.rand(4, 10), torch.randint(0, 3, size=(4,))), ] evaluator.run(data) + + +class CustomMultiMSELoss(nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, y_pred: Tuple[torch.Tensor, torch.Tensor], y_true: Tuple[torch.Tensor, torch.Tensor] + ) -> torch.Tensor: + a_true, b_true = y_true + a_pred, b_pred = y_pred + return mse_loss(a_pred, a_true) + mse_loss(b_pred, b_true) + + +class DummyLoss3(Loss): + def __init__(self, loss_fn, expected_loss, output_transform=lambda x: x, skip_unrolling=False): + super(DummyLoss3, self).__init__(loss_fn, output_transform=output_transform, skip_unrolling=skip_unrolling) + self._expected_loss = expected_loss + self._loss_fn = loss_fn + + def reset(self): + pass + + def compute(self): + pass + + def update(self, output): + y_pred, y_true = output + calculated_loss = self._loss_fn(y_pred=y_pred, y_true=y_true) + assert calculated_loss == self._expected_loss + + +def test_skip_unrolling_loss(): + a_pred = torch.rand(8, 1) + b_pred = torch.rand(8, 1) + y_pred = [a_pred, b_pred] + a_true = torch.rand(8, 1) + b_true = torch.rand(8, 1) + y_true = [a_true, b_true] + + multi_output_mse_loss = CustomMultiMSELoss() + expected_loss = multi_output_mse_loss(y_pred=y_pred, y_true=y_true) + + loss_metric = DummyLoss3(loss_fn=multi_output_mse_loss, expected_loss=expected_loss, skip_unrolling=True) + state = State(output=(y_pred, y_true)) + engine = MagicMock(state=state) + loss_metric.iteration_completed(engine) diff --git a/tests/ignite/metrics/test_maximum_mean_discrepancy.py b/tests/ignite/metrics/test_maximum_mean_discrepancy.py new file mode 100644 index 
000000000000..8cfc5f55567d --- /dev/null +++ b/tests/ignite/metrics/test_maximum_mean_discrepancy.py @@ -0,0 +1,176 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +from torch import Tensor + +import ignite.distributed as idist +from ignite.engine import Engine +from ignite.exceptions import NotComputableError +from ignite.metrics import MaximumMeanDiscrepancy + + +def np_mmd2(x: np.ndarray, y: np.ndarray, var: float = 1.0): + n = x.shape[0] + x = x.reshape(n, -1) + y = y.reshape(n, -1) + + a = np.arange(n) + ii, jj = np.meshgrid(a, a, indexing="ij") + XX = np.exp(-np.square(x[ii] - x[jj]).sum(axis=2) / (var * 2)) + XX = (np.sum(XX) - n) / (n * (n - 1)) + + XY = np.exp(-np.square(x[ii] - y[jj]).sum(axis=2) / (var * 2)) + XY = np.sum(XY) / (n * n) + + YY = np.exp(-np.square(y[ii] - y[jj]).sum(axis=2) / (var * 2)) + YY = (np.sum(YY) - n) / (n * (n - 1)) + + mmd2 = np.clip(XX + YY - XY * 2, 0.0, None) + return mmd2 + + +def test_zero_sample(): + mmd = MaximumMeanDiscrepancy() + with pytest.raises( + NotComputableError, match=r"MaximumMeanDiscrepacy must have at least one batch before it can be computed" + ): + mmd.compute() + + +def test_shape_mismatch(): + mmd = MaximumMeanDiscrepancy() + x = torch.tensor([[2.0, 3.0], [-2.0, 1.0]], dtype=torch.float) + y = torch.tensor([[-2.0, 1.0]], dtype=torch.float) + with pytest.raises(ValueError, match=r"x and y must be in the same shape, got"): + mmd.update((x, y)) + + +def test_invalid_shape(): + mmd = MaximumMeanDiscrepancy() + x = torch.tensor([2.0, 3.0], dtype=torch.float) + y = torch.tensor([4.0, 5.0], dtype=torch.float) + with pytest.raises(ValueError, match=r"x must be in the shape of \(B, ...\), got"): + mmd.update((x, y)) + + +@pytest.fixture(params=list(range(4))) +def test_case(request): + return [ + (torch.randn((100, 10)), torch.rand((100, 10)), 10 ** np.random.uniform(-1.0, 0.0), 1), + (torch.rand((100, 500)), torch.randn((100, 500)), 10 ** np.random.uniform(-1.0, 0.0), 1), + # updated batches + (torch.normal(0.0, 5.0, size=(100, 10)), torch.rand((100, 10)), 10 ** np.random.uniform(-1.0, 0.0), 16), + (torch.normal(5.0, 3.0, size=(100, 200)), torch.rand((100, 200)), 10 ** np.random.uniform(-1.0, 0.0), 16), + # image segmentation + (torch.randn((100, 5, 32, 32)), torch.rand((100, 5, 32, 32)), 10 ** np.random.uniform(-1.0, 0.0), 32), + (torch.rand((100, 5, 224, 224)), torch.randn((100, 5, 224, 224)), 10 ** np.random.uniform(-1.0, 0.0), 32), + ][request.param] + + +@pytest.mark.parametrize("n_times", range(5)) +def test_compute(n_times, test_case: Tuple[Tensor, Tensor, float, int]): + x, y, var, batch_size = test_case + + mmd = MaximumMeanDiscrepancy(var=var) + mmd.reset() + + if batch_size > 1: + np_mmd2_sum = 0.0 + n_iters = y.shape[0] // batch_size + 1 + for i in range(n_iters): + idx = i * batch_size + x_batch, y_batch = x[idx : idx + batch_size], y[idx : idx + batch_size] + mmd.update((x_batch, y_batch)) + + np_mmd2_sum += np_mmd2(x_batch.cpu().numpy(), y_batch.cpu().numpy(), var) + + np_res = np.sqrt(np_mmd2_sum / n_iters) + else: + mmd.update((x, y)) + np_res = np.sqrt(np_mmd2(x.cpu().numpy(), y.cpu().numpy(), var)) + + res = mmd.compute() + + assert isinstance(res, float) + assert pytest.approx(np_res, abs=1e-4) == res + + +def test_accumulator_detached(): + mmd = MaximumMeanDiscrepancy() + + x = torch.tensor([[2.0, 3.0], [-2.0, 1.0]], dtype=torch.float) + y = torch.tensor([[-2.0, 1.0], [2.0, 3.0]], dtype=torch.float) + mmd.update((x, y)) + + assert not any(acc.requires_grad for acc in (mmd._xx_sum, 
mmd._yy_sum, mmd._xy_sum)) + + +@pytest.mark.usefixtures("distributed") +class TestDistributed: + def test_integration(self): + tol = 1e-4 + n_iters = 100 + batch_size = 10 + n_dims = 100 + + rank = idist.get_rank() + torch.manual_seed(12 + rank) + + device = idist.device() + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(device) + + for metric_device in metric_devices: + y = torch.randn((n_iters * batch_size, n_dims)).float().to(device) + x = torch.normal(2.0, 3.0, size=(n_iters * batch_size, n_dims)).float().to(device) + + def data_loader(i): + return x[i * batch_size : (i + 1) * batch_size], y[i * batch_size : (i + 1) * batch_size] + + engine = Engine(lambda e, i: data_loader(i)) + + m = MaximumMeanDiscrepancy(device=metric_device) + m.attach(engine, "mmd") + + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) + + x = idist.all_gather(x) + y = idist.all_gather(y) + + assert "mmd" in engine.state.metrics + res = engine.state.metrics["mmd"] + + # compute numpy mmd + true_res = 0.0 + for i in range(n_iters): + x_batch, y_batch = data_loader(i) + x_np = x_batch.cpu().numpy() + y_np = y_batch.cpu().numpy() + true_res += np_mmd2(x_np, y_np) + + true_res = np.sqrt(true_res / n_iters) + assert pytest.approx(true_res, abs=tol) == res + + def test_accumulator_device(self): + device = idist.device() + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(device) + for metric_device in metric_devices: + mmd = MaximumMeanDiscrepancy(device=metric_device) + + devices = (mmd._device, mmd._xx_sum.device, mmd._yy_sum.device, mmd._xy_sum.device) + for dev in devices: + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" + + x = torch.tensor([[2.0, 3.0], [-2.0, 1.0]]).float() + y = torch.ones(2, 2).float() + mmd.update((x, y)) + + devices = (mmd._device, mmd._xx_sum.device, mmd._yy_sum.device, mmd._xy_sum.device) + for dev in devices: + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" diff --git a/tests/ignite/metrics/test_metric.py b/tests/ignite/metrics/test_metric.py index f9db11b1a370..645a40b7ac69 100644 --- a/tests/ignite/metrics/test_metric.py +++ b/tests/ignite/metrics/test_metric.py @@ -1416,3 +1416,55 @@ def wrapper(x, **kwargs): assert (output == expected).all(), (output, expected) else: assert output == expected, (output, expected) + + +class DummyMetric5(Metric): + def __init__(self, true_output, output_transform=lambda x: x, skip_unrolling=False): + super(DummyMetric5, self).__init__(output_transform=output_transform, skip_unrolling=skip_unrolling) + self.true_output = true_output + + def reset(self): + pass + + def compute(self): + pass + + def update(self, output): + assert output == self.true_output + + +def test_skip_unrolling(): + # y_pred and y are ouputs recieved from a multi_output model + a_pred = torch.rand(8, 1) + b_pred = torch.rand(8, 1) + y_pred = [a_pred, b_pred] + a_true = torch.rand(8, 1) + b_true = torch.rand(8, 1) + y_true = [a_true, b_true] + + metric = DummyMetric5(true_output=(y_pred, y_true), skip_unrolling=True) + state = State(output=(y_pred, y_true)) + engine = MagicMock(state=state) + metric.iteration_completed(engine) + + +class DummyMetric6(Metric): + def reset(self): + pass + + def compute(self): + pass + + def update(self, output): + pass + + def __call__(self, value): + pass + + +def test_access_to_metric_dunder_attributes(): + metric = DummyMetric6() + import inspect + + # 
`inspect.signature` accesses `__signature__` attribute of the metric. + assert "value" in inspect.signature(metric).parameters.keys() diff --git a/tests/ignite/metrics/test_metric_group.py b/tests/ignite/metrics/test_metric_group.py new file mode 100644 index 000000000000..237df966e059 --- /dev/null +++ b/tests/ignite/metrics/test_metric_group.py @@ -0,0 +1,118 @@ +import pytest +import torch + +from ignite import distributed as idist +from ignite.engine import Engine +from ignite.metrics import Accuracy, MetricGroup, Precision + +torch.manual_seed(41) + + +def test_update(): + precision = Precision() + accuracy = Accuracy() + + group = MetricGroup({"precision": Precision(), "accuracy": Accuracy()}) + + y_pred = torch.randint(0, 2, (100,)) + y = torch.randint(0, 2, (100,)) + + precision.update((y_pred, y)) + accuracy.update((y_pred, y)) + group.update((y_pred, y)) + + assert precision.state_dict() == group.metrics["precision"].state_dict() + assert accuracy.state_dict() == group.metrics["accuracy"].state_dict() + + +def test_output_transform(): + def drop_first(output): + y_pred, y = output + return (y_pred[1:], y[1:]) + + precision = Precision(output_transform=drop_first) + accuracy = Accuracy(output_transform=drop_first) + + group = MetricGroup( + {"precision": Precision(output_transform=drop_first), "accuracy": Accuracy(output_transform=drop_first)} + ) + + y_pred = torch.randint(0, 2, (100,)) + y = torch.randint(0, 2, (100,)) + + precision.update(drop_first(drop_first((y_pred, y)))) + accuracy.update(drop_first(drop_first((y_pred, y)))) + group.update(drop_first((y_pred, y))) + + assert precision.state_dict() == group.metrics["precision"].state_dict() + assert accuracy.state_dict() == group.metrics["accuracy"].state_dict() + + +def test_compute(): + precision = Precision() + accuracy = Accuracy() + + group = MetricGroup({"precision": Precision(), "accuracy": Accuracy()}) + + for _ in range(3): + y_pred = torch.randint(0, 2, (100,)) + y = torch.randint(0, 2, (100,)) + + precision.update((y_pred, y)) + accuracy.update((y_pred, y)) + group.update((y_pred, y)) + + assert group.compute() == {"precision": precision.compute(), "accuracy": accuracy.compute()} + + precision.reset() + accuracy.reset() + group.reset() + + assert precision.state_dict() == group.metrics["precision"].state_dict() + assert accuracy.state_dict() == group.metrics["accuracy"].state_dict() + + +@pytest.mark.usefixtures("distributed") +class TestDistributed: + def test_integration(self): + rank = idist.get_rank() + torch.manual_seed(12 + rank) + + n_epochs = 3 + n_iters = 5 + batch_size = 10 + device = idist.device() + + y_true = torch.randint(0, 2, size=(n_iters * batch_size,)).to(device) + y_pred = torch.randint(0, 2, (n_iters * batch_size,)).to(device) + + def update(_, i): + return ( + y_pred[i * batch_size : (i + 1) * batch_size], + y_true[i * batch_size : (i + 1) * batch_size], + ) + + engine = Engine(update) + + precision = Precision() + precision.attach(engine, "precision") + + accuracy = Accuracy() + accuracy.attach(engine, "accuracy") + + group = MetricGroup({"eval_metrics.accuracy": Accuracy(), "eval_metrics.precision": Precision()}) + group.attach(engine, "eval_metrics") + + data = list(range(n_iters)) + engine.run(data=data, max_epochs=n_epochs) + + assert "eval_metrics" in engine.state.metrics + assert "eval_metrics.accuracy" in engine.state.metrics + assert "eval_metrics.precision" in engine.state.metrics + + assert engine.state.metrics["eval_metrics"] == { + "eval_metrics.accuracy": 
engine.state.metrics["accuracy"], + "eval_metrics.precision": engine.state.metrics["precision"], + } + assert engine.state.metrics["eval_metrics.accuracy"] == engine.state.metrics["accuracy"] + assert engine.state.metrics["eval_metrics.precision"] == engine.state.metrics["precision"] diff --git a/tests/ignite/metrics/test_mutual_information.py b/tests/ignite/metrics/test_mutual_information.py new file mode 100644 index 000000000000..18d58d300bfc --- /dev/null +++ b/tests/ignite/metrics/test_mutual_information.py @@ -0,0 +1,145 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +from scipy.special import softmax +from scipy.stats import entropy +from torch import Tensor + +import ignite.distributed as idist + +from ignite.engine import Engine +from ignite.exceptions import NotComputableError +from ignite.metrics import MutualInformation + + +def np_mutual_information(np_y_pred: np.ndarray) -> float: + prob = softmax(np_y_pred, axis=1) + marginal_ent = entropy(np.mean(prob, axis=0)) + conditional_ent = np.mean(entropy(prob, axis=1)) + return max(0.0, marginal_ent - conditional_ent) + + +def test_zero_sample(): + mi = MutualInformation() + with pytest.raises( + NotComputableError, match=r"MutualInformation must have at least one example before it can be computed" + ): + mi.compute() + + +def test_invalid_shape(): + mi = MutualInformation() + y_pred = torch.randn(10).float() + with pytest.raises(ValueError, match=r"y_pred must be in the shape of \(B, C\) or \(B, C, ...\), got"): + mi.update((y_pred, None)) + + +@pytest.fixture(params=list(range(4))) +def test_case(request): + return [ + (torch.randn((100, 10)).float(), torch.randint(0, 10, size=[100]), 1), + (torch.rand((100, 500)).float(), torch.randint(0, 500, size=[100]), 1), + # updated batches + (torch.normal(0.0, 5.0, size=(100, 10)).float(), torch.randint(0, 10, size=[100]), 16), + (torch.normal(5.0, 3.0, size=(100, 200)).float(), torch.randint(0, 200, size=[100]), 16), + # image segmentation + (torch.randn((100, 5, 32, 32)).float(), torch.randint(0, 5, size=(100, 32, 32)), 16), + (torch.randn((100, 5, 224, 224)).float(), torch.randint(0, 5, size=(100, 224, 224)), 16), + ][request.param] + + +@pytest.mark.parametrize("n_times", range(5)) +def test_compute(n_times, test_case: Tuple[Tensor, Tensor, int]): + mi = MutualInformation() + + y_pred, y, batch_size = test_case + + mi.reset() + if batch_size > 1: + n_iters = y.shape[0] // batch_size + 1 + for i in range(n_iters): + idx = i * batch_size + mi.update((y_pred[idx : idx + batch_size], y[idx : idx + batch_size])) + else: + mi.update((y_pred, y)) + + np_res = np_mutual_information(y_pred.numpy()) + res = mi.compute() + + assert isinstance(res, float) + assert pytest.approx(np_res, rel=1e-4) == res + + +def test_accumulator_detached(): + mi = MutualInformation() + + y_pred = torch.tensor([[2.0, 3.0], [-2.0, -1.0]], requires_grad=True) + y = torch.zeros(2) + mi.update((y_pred, y)) + + assert not mi._sum_of_probabilities.requires_grad + + +@pytest.mark.usefixtures("distributed") +class TestDistributed: + def test_integration(self): + tol = 1e-4 + n_iters = 100 + batch_size = 10 + n_cls = 50 + device = idist.device() + rank = idist.get_rank() + torch.manual_seed(12 + rank) + + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(device) + + for metric_device in metric_devices: + y_true = torch.randint(0, n_cls, size=[n_iters * batch_size], dtype=torch.long).to(device) + y_preds = torch.normal(0.0, 3.0, size=(n_iters * 
batch_size, n_cls), dtype=torch.float).to(device) + + engine = Engine( + lambda e, i: ( + y_preds[i * batch_size : (i + 1) * batch_size], + y_true[i * batch_size : (i + 1) * batch_size], + ) + ) + + m = MutualInformation(device=metric_device) + m.attach(engine, "mutual_information") + + data = list(range(n_iters)) + engine.run(data=data, max_epochs=1) + + y_preds = idist.all_gather(y_preds) + y_true = idist.all_gather(y_true) + + assert "mutual_information" in engine.state.metrics + res = engine.state.metrics["mutual_information"] + + true_res = np_mutual_information(y_preds.cpu().numpy()) + + assert pytest.approx(true_res, rel=tol) == res + + def test_accumulator_device(self): + device = idist.device() + metric_devices = [torch.device("cpu")] + if device.type != "xla": + metric_devices.append(device) + for metric_device in metric_devices: + mi = MutualInformation(device=metric_device) + + devices = (mi._device, mi._sum_of_probabilities.device) + for dev in devices: + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" + + y_pred = torch.tensor([[2.0, 3.0], [-2.0, -1.0]], requires_grad=True) + y = torch.zeros(2) + mi.update((y_pred, y)) + + devices = (mi._device, mi._sum_of_probabilities.device) + for dev in devices: + assert dev == metric_device, f"{type(dev)}:{dev} vs {type(metric_device)}:{metric_device}" diff --git a/tests/ignite/test_utils.py b/tests/ignite/test_utils.py index c4c65a29d696..4b00fb8c67ab 100644 --- a/tests/ignite/test_utils.py +++ b/tests/ignite/test_utils.py @@ -1,4 +1,5 @@ import logging +import platform import sys from collections import namedtuple @@ -7,7 +8,7 @@ from packaging.version import Version from ignite.engine import Engine, Events -from ignite.utils import convert_tensor, deprecated, hash_checkpoint, setup_logger, to_onehot +from ignite.utils import _to_str_list, convert_tensor, deprecated, hash_checkpoint, setup_logger, to_onehot def test_convert_tensor(): @@ -54,6 +55,29 @@ def test_convert_tensor(): convert_tensor(12345) +@pytest.mark.parametrize( + "input_data,expected", + [ + (42, ["42.0000"]), + ([{"a": 15, "b": torch.tensor([2.0])}], ["a: 15.0000", "b: [2.0000]"]), + ({"a": 10, "b": 2.33333}, ["a: 10.0000", "b: 2.3333"]), + ({"x": torch.tensor(0.1234), "y": [1, 2.3567]}, ["x: 0.1234", "y: 1.0000, 2.3567"]), + (({"nested": [3.1415, torch.tensor(0.0001)]},), ["nested: 3.1415, 0.0001"]), + ( + {"large_vector": torch.tensor(range(20))}, + ["large_vector: [0.0000, 1.0000, 2.0000, 3.0000, 4.0000, 5.0000, 6.0000, 7.0000, 8.0000, 9.0000, ...]"], + ), + ({"large_matrix": torch.randn(5, 5)}, ["large_matrix: Shape[5, 5]"]), + ({"empty": []}, ["empty: "]), + ([], []), + ({"none": None}, ["none: "]), + ({1: 100, 2: 200}, ["1: 100.0000", "2: 200.0000"]), + ], +) +def test__to_str_list(input_data, expected): + assert _to_str_list(input_data) == expected + + def test_to_onehot(): indices = torch.tensor([0, 1, 2, 3], dtype=torch.long) actual = to_onehot(indices, 4) @@ -174,6 +198,29 @@ def test_override_setup_logger(capsys): logging.shutdown() +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_setup_logger_encoding(encoding, dirname): + fp = dirname / "log.txt" + logger = setup_logger(name="logger", filepath=fp, encoding=encoding, reset=True) + test_words = ["say hello", "say 你好", "say こんにちわ", "say 안녕하세요", "say привет"] + for w in test_words: + logger.info(w) + logging.shutdown() + + with open(fp, "r", encoding=encoding) as h: + data = h.readlines() + + if platform.system() == "Windows" and encoding is None: + 
flatten_data = "\n".join(data) + assert test_words[0] in flatten_data + for word in test_words[1:]: + assert word not in flatten_data + else: + assert len(data) == len(test_words) + for expected, output in zip(test_words, data): + assert expected in output + + def test_deprecated(): # Test on function without docs, @deprecated without reasons @deprecated("0.4.2", "0.6.0") diff --git a/tests/run_cpu_tests.sh b/tests/run_cpu_tests.sh index 2297be94219d..8d387f5542e7 100644 --- a/tests/run_cpu_tests.sh +++ b/tests/run_cpu_tests.sh @@ -1,22 +1,31 @@ #!/bin/bash - +source "$(dirname "$0")/common_test_functionality.sh" set -xeu -if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then - skip_distrib_opt=(-m "not distributed and not tpu and not multinode_distributed") -else - skip_distrib_opt=(-m "") -fi +skip_distrib_tests=${SKIP_DISTRIB_TESTS:-0} +use_last_failed=${USE_LAST_FAILED:-0} +match_tests_expression=${1:-""} -MATCH_TESTS_EXPRESSION=${1:-""} -CUDA_VISIBLE_DEVICES="" pytest --tx 4*popen//python=python --cov ignite --cov-report term-missing --cov-report xml -vvv tests "${skip_distrib_opt[@]}" -k "$MATCH_TESTS_EXPRESSION" +run_tests \ + --core_args "--tx 4*popen//python=python -vvv tests/ignite" \ + --cache_dir ".cpu-not-distrib" \ + --skip_distrib_tests "${skip_distrib_tests}" \ + --use_coverage 1 \ + --match_tests_expression "${match_tests_expression}" \ + --use_last_failed ${use_last_failed} # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 -if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then +if [ "${skip_distrib_tests}" -eq "1" ]; then exit 0 fi -export WORLD_SIZE=2 -CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx $WORLD_SIZE*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" -unset WORLD_SIZE +# Run 2 processes with --dist=each +run_tests \ + --core_args "-m distributed -vvv tests/ignite" \ + --world_size 2 \ + --cache_dir ".cpu-distrib" \ + --skip_distrib_tests 0 \ + --use_coverage 1 \ + --match_tests_expression "${match_tests_expression}" \ + --use_last_failed ${use_last_failed} diff --git a/tests/run_gpu_tests.sh b/tests/run_gpu_tests.sh index 3146443a531d..26497f19c83e 100644 --- a/tests/run_gpu_tests.sh +++ b/tests/run_gpu_tests.sh @@ -1,35 +1,47 @@ #!/bin/bash +source "$(dirname "$0")/common_test_functionality.sh" +set -xeu -if [ -z "$1" ]; then - ngpus=1 -else - ngpus=$1 -fi - -MATCH_TESTS_EXPRESSION=${2:-""} +skip_distrib_tests=${SKIP_DISTRIB_TESTS:-1} +use_last_failed=${USE_LAST_FAILED:-0} +ngpus=${1:-1} -if [ -z "$MATCH_TESTS_EXPRESSION" ]; then +match_tests_expression=${2:-""} +if [ -z "$match_tests_expression" ]; then cuda_pattern="cuda" else - cuda_pattern="cuda and $MATCH_TESTS_EXPRESSION" + cuda_pattern="cuda and $match_tests_expression" fi -set -xeu - -pytest --cov ignite --cov-report term-missing --cov-report xml -vvv tests/ -k "$cuda_pattern" +run_tests \ + --core_args "-vvv tests/ignite" \ + --cache_dir ".gpu-cuda" \ + --skip_distrib_tests "${skip_distrib_tests}" \ + --use_coverage 1 \ + --match_tests_expression "${cuda_pattern}" \ + --use_last_failed ${use_last_failed} # https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02 -if [ "${SKIP_DISTRIB_TESTS:-0}" -eq "1" ]; then +if [ "${skip_distrib_tests}" -eq "1" ]; then exit 0 fi -pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k "$MATCH_TESTS_EXPRESSION" +run_tests \ + --core_args "-vvv -m distributed 
tests/ignite" \ + --cache_dir ".gpu-distrib" \ + --skip_distrib_tests 0 \ + --use_coverage 1 \ + --match_tests_expression "${match_tests_expression}" \ + --use_last_failed ${use_last_failed} if [ ${ngpus} -gt 1 ]; then - - export WORLD_SIZE=${ngpus} - pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml --dist=each --tx ${WORLD_SIZE}*popen//python=python tests -m distributed -vvv -k "$MATCH_TESTS_EXPRESSION" - unset WORLD_SIZE - + run_tests \ + --core_args "-vvv -m distributed tests/ignite" \ + --world_size "${ngpus}" \ + --cache_dir ".gpu-distrib-multi" \ + --skip_distrib_tests 0 \ + --use_coverage 1 \ + --match_tests_expression "${match_tests_expression}" \ + --use_last_failed ${use_last_failed} fi diff --git a/tests/run_multinode_tests_in_docker.sh b/tests/run_multinode_tests_in_docker.sh index 0dca1b603278..041284bb97c2 100644 --- a/tests/run_multinode_tests_in_docker.sh +++ b/tests/run_multinode_tests_in_docker.sh @@ -36,7 +36,7 @@ RUN pip install --no-cache-dir mock pytest pytest-xdist scikit-learn scikit-imag EOF docker_python_version=`docker run --rm -i $docker_image python -c "import sys; print(str(sys.version_info[0]) + \".\" + str(sys.version_info[1]), end=\"\")"` -cmd="pytest --dist=each --tx $nproc_per_node*popen//python${docker_python_version} -m multinode_distributed -vvv tests" +cmd="pytest --dist=each --tx $nproc_per_node*popen//python${docker_python_version} -m multinode_distributed -vvv tests/ignite" export MASTER_ADDR=node0 export MASTER_PORT=9999 diff --git a/tests/run_tpu_tests.sh b/tests/run_tpu_tests.sh index 0877de858aed..6fd695f2e277 100644 --- a/tests/run_tpu_tests.sh +++ b/tests/run_tpu_tests.sh @@ -1,10 +1,20 @@ #!/bin/bash - +source "$(dirname "$0")/common_test_functionality.sh" set -xeu +use_last_failed=${USE_LAST_FAILED:-0} + +run_tests \ + --core_args "-vvv -m tpu tests/ignite" \ + --cache_dir ".tpu" \ + --use_coverage 1 \ + --use_last_failed ${use_last_failed} -pytest --cov ignite --cov-report term-missing --cov-report xml tests/ -vvv -m tpu if [ -z ${NUM_TPU_WORKERS+x} ]; then export NUM_TPU_WORKERS=1 - pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml tests/ -vvv -m tpu + run_tests \ + --core_args "-vvv -m tpu tests/ignite" \ + --cache_dir ".tpu-multi" \ + --use_coverage 1 \ + --use_last_failed ${use_last_failed} fi