diff --git a/.circleci/config.yml b/.circleci/config.yml index 988d585a20c6..df787f80659d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -187,11 +187,6 @@ jobs: export mnist4_cmd='CUDA_VISIBLE_DEVICES=0 python examples/mnist/mnist_save_resume_engine.py --epochs=2 --resume_from=/tmp/mnist_save_resume/checkpoint_1.pt' docker exec -it pthd /bin/bash -c "$mnist4_cmd" - - run: - name: Codecov upload - command: | - bash <(curl -s https://codecov.io/bash) -Z -F gpu - one_gpu_windows_tests: <<: *one_gpu_windows @@ -258,11 +253,6 @@ jobs: export test_cmd='bash tests/run_gpu_tests.sh 2' docker exec -it pthd /bin/bash -c "${test_cmd}" - - run: - name: Codecov upload - command: | - bash <(curl -s https://codecov.io/bash) -Z -F gpu-2 - two_gpus_check_dist_cifar10_example: <<: *two_gpus @@ -353,11 +343,6 @@ jobs: export test_cmd='CUDA_VISIBLE_DEVICES="" pytest --cov ignite --cov-append --cov-report term-missing --cov-report xml -vvv tests/ -m distributed -k hvd' docker exec -it pthd /bin/bash -c "${test_cmd}" - - run: - name: Codecov upload - command: | - bash <(curl -s https://codecov.io/bash) -Z -F gpu-2-hvd - - run: name: "Check CIFAR10 using horovodrun" command: | diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index f9cefe850596..d5a6e8b0bd57 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -28,55 +28,42 @@ jobs: fail-fast: true matrix: pytorch-channel: [pytorch, pytorch-nightly] + env: + AGENT_TOOLSDIRECTORY: /tmp/python steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - - name: Get year & week number - id: get-date - run: | - echo "date=$(/bin/date "+%Y-%U")" >> $GITHUB_OUTPUT - - name: Get pip cache dir - id: pip-cache + - name: Clean python tool path run: | - pip install -U pip || python -m pip install -U pip - echo "pip_cache=$(pip cache dir)" >> $GITHUB_OUTPUT + rm -rf ${AGENT_TOOLSDIRECTORY} - - uses: actions/cache@v3 + - uses: 
actions/setup-python@v4 with: - path: | - ${{ steps.pip-cache.outputs.pip_cache }} - key: ${{ steps.get-date.outputs.date }}-pytorch-${{ runner.os }}-3.8-${{ matrix.pytorch-channel }}-${{ hashFiles('requirements-dev.txt') }} - restore-keys: | - ${{ steps.get-date.outputs.date }}-pytorch-${{ runner.os }}-3.8-${{ matrix.pytorch-channel }}- - - - run: pip install pip wheel setuptools -Uqq + python-version: 3.9 - name: Install PyTorch # https://pytorch.org/get-started/locally/ if: ${{ matrix.pytorch-channel == 'pytorch' }} run: | - pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117 nvidia-smi python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())" + pip list - name: Install PyTorch (nightly) # https://pytorch.org/get-started/locally/ if: ${{ matrix.pytorch-channel == 'pytorch-nightly' }} run: | - pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116 + pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117 nvidia-smi python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())" - python -c "import torch; exit(not ('.dev' in torch.__version__))" + pip list - name: Install dependencies run: | pip install -r requirements-dev.txt - python setup.py install + pip install -e . 
- name: Run 1 Node 2 GPUs Unit Tests run: | diff --git a/.github/workflows/hvd-tests.yml b/.github/workflows/hvd-tests.yml index 3c0914ca9227..628ccfce3230 100644 --- a/.github/workflows/hvd-tests.yml +++ b/.github/workflows/hvd-tests.yml @@ -25,6 +25,7 @@ concurrency: jobs: horovod-tests: runs-on: ubuntu-latest + timeout-minutes: 60 strategy: matrix: python-version: [3.8] diff --git a/.github/workflows/pytorch-version-tests.yml b/.github/workflows/pytorch-version-tests.yml index dd38014a2a4a..1dbf6df7786f 100644 --- a/.github/workflows/pytorch-version-tests.yml +++ b/.github/workflows/pytorch-version-tests.yml @@ -16,40 +16,41 @@ jobs: matrix: python-version: [3.8, 3.9, "3.10"] pytorch-version: - [1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.7.1, 1.6.0, 1.5.1, 1.4.0, 1.3.1] + [1.13.1, 1.12.1, 1.11.0, 1.10.0, 1.9.1, 1.8.1, 1.7.1, 1.6.0, 1.5.1, 1.4.0] exclude: - - pytorch-version: 1.3.1 - python-version: 3.8 - - pytorch-version: 1.3.1 - python-version: 3.9 - pytorch-version: 1.4.0 python-version: 3.9 + - pytorch-version: 1.4.0 + python-version: 3.10 + - pytorch-version: 1.5.1 python-version: 3.9 + - pytorch-version: 1.5.1 + python-version: 3.10 + - pytorch-version: 1.6.0 python-version: 3.9 + - pytorch-version: 1.6.0 + python-version: 3.10 + # disabling python 3.9 support with PyTorch 1.7.1 and 1.8.1, to stop repeated pytorch-version test fail. 
# https://github.com/pytorch/ignite/issues/2383 - pytorch-version: 1.7.1 python-version: 3.9 - - pytorch-version: 1.8.1 - python-version: 3.9 - - pytorch-version: 1.3.1 - python-version: 3.10 - - pytorch-version: 1.4.0 - python-version: 3.10 - - pytorch-version: 1.5.1 - python-version: 3.10 - - pytorch-version: 1.6.0 - python-version: 3.10 - pytorch-version: 1.7.1 python-version: 3.10 + + - pytorch-version: 1.8.1 + python-version: 3.9 - pytorch-version: 1.8.1 python-version: 3.10 + - pytorch-version: 1.9.1 python-version: 3.10 + - pytorch-version: 1.10.0 python-version: 3.10 + - pytorch-version: 1.11.0 python-version: 3.10 @@ -86,17 +87,22 @@ jobs: - name: Install dependencies shell: bash -l {0} + if: ${{ matrix.pytorch-version != '1.4.0' }} run: | conda install pytorch=${{ matrix.pytorch-version }} torchvision cpuonly python=${{ matrix.python-version }} -c pytorch pip install -r requirements-dev.txt python setup.py install - - name: Install appropriate Pillow for PyTorch 1.3.1 + # There is no more torchvision 0.5.0 binaries in anaconda pytorch channel: + # https://anaconda.org/pytorch/torchvision/files + - name: Install appropriate dependencies for PyTorch 1.4.0 shell: bash -l {0} - if: ${{ matrix.pytorch-version == '1.3.1' }} - run: | - pip install --upgrade 'Pillow<7' - python -c "import torchvision" + if: ${{ matrix.pytorch-version == '1.4.0' }} + run: | + conda install pytorch=${{ matrix.pytorch-version }} cpuonly python=${{ matrix.python-version }} -c pytorch + pip install torchvision==0.5.0 + pip install -r requirements-dev.txt + python setup.py install - name: Download MNIST uses: pytorch-ignite/download-mnist-github-action@master diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 699f233ee2cc..2c409f7227a4 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -40,18 +40,18 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest] - python-version: [3.8, 3.9, "3.10"] + python-version: [3.8, 
3.9, "3.10", "3.11"] pytorch-channel: [pytorch, pytorch-nightly] include: # includes a single build on windows - os: windows-latest pytorch-channel: pytorch - python-version: 3.8 + python-version: 3.9 skip-distrib-tests: 1 # includes a single build on macosx - os: macos-latest pytorch-channel: pytorch - python-version: 3.8 + python-version: 3.9 skip-distrib-tests: 1 steps: @@ -184,3 +184,8 @@ jobs: #train mkdir -p ~/.cache/torch/checkpoints/ && wget "https://download.pytorch.org/models/vgg16-397923af.pth" -O ~/.cache/torch/checkpoints/vgg16-397923af.pth python examples/fast_neural_style/neural_style.py train --epochs 1 --cuda 0 --dataset test --dataroot . --image_size 32 --style_image examples/fast_neural_style/images/style_images/mosaic.jpg --style_size 32 + - name: Run SR Example + if: ${{ matrix.os == 'ubuntu-latest' }} + run: | + # Super-Resolution + python examples/super_resolution/main.py --upscale_factor 3 --crop_size 180 --batch_size 4 --test_batch_size 100 --n_epochs 1 --lr 0.001 --threads 2 --debug diff --git a/docker/README.md b/docker/README.md index 32660e43383d..bca8f0add8cb 100644 --- a/docker/README.md +++ b/docker/README.md @@ -48,6 +48,8 @@ Available Tensor Operations: - `docker pull pytorchignite/hvd-apex-vision:latest` - [hvd/Dockerfile.hvd-apex-nlp](hvd/Dockerfile.hvd-apex-nlp): base Horovod apex with useful NLP libraries - `docker pull pytorchignite/hvd-apex-nlp:latest` + +**Deprecated images** (no version updates) - [msdp/Dockerfile.msdp-apex-base](msdp/Dockerfile.msdp-apex): multi-stage MSDeepSpeed build with latest Pytorch, Ignite image with minimal dependencies - `docker pull pytorchignite/msdp-apex:latest` - [msdp/Dockerfile.msdp-apex-vision](msdp/Dockerfile.msdp-apex-vision): base MSDeepSpeed build with useful computer vision libraries @@ -58,7 +60,7 @@ Available Tensor Operations: ## How to use ```bash -docker run -it -v $PWD:/workspace/project --network=host --shm-size 16G pytorchignite/base:latest /bin/bash +docker run -it -v 
$PWD:/workspace/project --network=host --ipc=host pytorchignite/base:latest /bin/bash ``` ## Building the image yourself diff --git a/docker/docker.cfg b/docker/docker.cfg index 76c9fa057d94..a5fcf2f5127a 100644 --- a/docker/docker.cfg +++ b/docker/docker.cfg @@ -1,4 +1,4 @@ [DEFAULT] -build_docker_image_pytorch_version = 1.12.1-cuda11.3-cudnn8 -build_docker_image_hvd_version = v0.25.0 -build_docker_image_msdp_version = v0.7.2 +build_docker_image_pytorch_version = 1.13.1-cuda11.6-cudnn8 +build_docker_image_hvd_version = v0.27.0 +build_docker_image_msdp_version = v0.8.1 diff --git a/docker/hvd/Dockerfile.hvd-apex b/docker/hvd/Dockerfile.hvd-apex index fe46284c8d3b..a465abdaba24 100644 --- a/docker/hvd/Dockerfile.hvd-apex +++ b/docker/hvd/Dockerfile.hvd-apex @@ -6,14 +6,9 @@ ARG PTH_VERSION # 1/Building apex with pytorch:*-devel FROM pytorch/pytorch:${PTH_VERSION}-devel AS apex-hvd-builder -ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" +ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0 8.6" ENV TORCH_CUDA_ARCH_LIST=$ARG_TORCH_CUDA_ARCH_LIST - -# Renew nvidia signing key -# https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ -RUN apt-key del 7fa2af80 && \ - rm /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +ENV CUDA_HOME=/usr/local/cuda # Install git RUN apt-get update && apt-get install -y --no-install-recommends git && \ @@ -25,14 +20,15 @@ RUN echo "Setup NVIDIA Apex" && \ rm -rf $tmp_apex_path && \ git clone https://github.com/NVIDIA/apex $tmp_apex_path && \ cd $tmp_apex_path && \ - pip wheel --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . + pip install packaging && \ + pip wheel -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . 
ARG HVD_VERSION # Build Horovod RUN apt-get update && apt-get install -y git && \ git clone --recursive --depth 1 --branch ${HVD_VERSION} https://github.com/horovod/horovod.git /horovod && \ - conda install -y cmake nccl=2.11 -c conda-forge && \ + conda install -y cmake nccl -c conda-forge && \ cd /horovod && \ HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_PYTORCH=1 pip wheel --no-cache-dir . && \ rm -rf /var/lib/apt/lists/* @@ -80,7 +76,7 @@ RUN mkdir -p pytorch-ignite-examples && \ rm -rf .git # Horovod -RUN conda install -y nccl=2.11 -c conda-forge +RUN conda install -y nccl -c conda-forge ENV LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH diff --git a/docker/hvd/Dockerfile.hvd-base b/docker/hvd/Dockerfile.hvd-base index 29b7893fdb6f..3bdec5efdc43 100644 --- a/docker/hvd/Dockerfile.hvd-base +++ b/docker/hvd/Dockerfile.hvd-base @@ -7,16 +7,10 @@ FROM pytorch/pytorch:${PTH_VERSION}-devel as builder ARG HVD_VERSION -# Renew nvidia signing key -# https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ -RUN apt-key del 7fa2af80 && \ - rm /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub - # Build Horovod RUN apt-get update && apt-get install -y git && \ git clone --recursive --depth 1 --branch ${HVD_VERSION} https://github.com/horovod/horovod.git /horovod && \ - conda install -y cmake nccl=2.11 -c conda-forge && \ + conda install -y cmake nccl -c conda-forge && \ cd /horovod && \ HOROVOD_GPU_OPERATIONS=NCCL HOROVOD_NCCL_LINK=SHARED HOROVOD_WITHOUT_MPI=1 HOROVOD_WITH_PYTORCH=1 pip wheel --no-cache-dir . 
&& \ rm -rf /var/lib/apt/lists/* @@ -59,7 +53,7 @@ RUN mkdir -p pytorch-ignite-examples && \ rm -rf .git # Horovod -RUN conda install -y nccl=2.11 -c conda-forge +RUN conda install -y nccl -c conda-forge ENV LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH diff --git a/docker/main/Dockerfile.apex b/docker/main/Dockerfile.apex index 8e13901c158f..d39445071646 100644 --- a/docker/main/Dockerfile.apex +++ b/docker/main/Dockerfile.apex @@ -6,14 +6,9 @@ ARG PTH_VERSION # 1/Building apex with pytorch:*-devel FROM pytorch/pytorch:${PTH_VERSION}-devel AS apex-builder -ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" +ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0 8.6" ENV TORCH_CUDA_ARCH_LIST=$ARG_TORCH_CUDA_ARCH_LIST - -# Renew nvidia signing key -# https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ -RUN apt-key del 7fa2af80 && \ - rm /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +ENV CUDA_HOME=/usr/local/cuda # Install git RUN apt-get update && apt-get install -y --no-install-recommends git && \ @@ -25,7 +20,8 @@ RUN echo "Setup NVIDIA Apex" && \ rm -rf $tmp_apex_path && \ git clone https://github.com/NVIDIA/apex $tmp_apex_path && \ cd $tmp_apex_path && \ - pip wheel --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . + pip install packaging && \ + pip wheel -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . 
# 2/ Build the runtime image FROM pytorch/pytorch:${PTH_VERSION}-runtime diff --git a/docker/msdp/Dockerfile.msdp-apex b/docker/msdp/Dockerfile.msdp-apex index 588738ca367c..32dce0fea00f 100644 --- a/docker/msdp/Dockerfile.msdp-apex +++ b/docker/msdp/Dockerfile.msdp-apex @@ -6,14 +6,9 @@ ARG PTH_VERSION # 1/Building apex with pytorch:*-devel FROM pytorch/pytorch:${PTH_VERSION}-devel AS apex-msdp-builder -ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" +ARG ARG_TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0 8.6" ENV TORCH_CUDA_ARCH_LIST=$ARG_TORCH_CUDA_ARCH_LIST - -# Renew nvidia signing key -# https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/ -RUN apt-key del 7fa2af80 && \ - rm /etc/apt/sources.list.d/nvidia-ml.list && \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +ENV CUDA_HOME=/usr/local/cuda # Install git RUN apt-get update && apt-get install -y --no-install-recommends git && \ @@ -25,7 +20,8 @@ RUN echo "Setup NVIDIA Apex" && \ rm -rf $tmp_apex_path && \ git clone https://github.com/NVIDIA/apex $tmp_apex_path && \ cd $tmp_apex_path && \ - pip wheel --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . + pip install packaging && \ + pip wheel -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . 
# For pip --use-feature option RUN python -m pip install --upgrade pip diff --git a/docker/push_all.sh b/docker/push_all.sh index ab45baf85623..fd3c00e9232b 100644 --- a/docker/push_all.sh +++ b/docker/push_all.sh @@ -43,16 +43,17 @@ do done -image_name="msdp-apex" -image_tag=`docker run --rm -i pytorchignite/${image_name}:latest python -c "import torch; import ignite; print(torch.__version__ + \"-\" + ignite.__version__, end=\"\")"` +# DEPRECATED due to no activity +# image_name="msdp-apex" +# image_tag=`docker run --rm -i pytorchignite/${image_name}:latest python -c "import torch; import ignite; print(torch.__version__ + \"-\" + ignite.__version__, end=\"\")"` -for image_name in "msdp-apex" "msdp-apex-vision" "msdp-apex-nlp" -do +# for image_name in "msdp-apex" "msdp-apex-vision" "msdp-apex-nlp" +# do - docker push pytorchignite/${image_name}:latest - docker push pytorchignite/${image_name}:${image_tag} +# docker push pytorchignite/${image_name}:latest +# docker push pytorchignite/${image_name}:${image_tag} -done +# done # If use locally, mind to clean dangling images # docker images | grep 'pytorchignite\|' | awk '{print $3}' | xargs docker rmi -f diff --git a/docs/requirements.txt b/docs/requirements.txt index ee631c90e6bd..874e5773ba39 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ -sphinx==4.2.0 +sphinx==5.0.0 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinxcontrib-katex sphinx-copybutton==0.4.0 docutils<0.18 sphinx_togglebutton -sphinx_design \ No newline at end of file +sphinx_design diff --git a/examples/contrib/mnist/mnist_with_neptune_logger.py b/examples/contrib/mnist/mnist_with_neptune_logger.py index eff1a0418ffe..63f31eb9ed6f 100644 --- a/examples/contrib/mnist/mnist_with_neptune_logger.py +++ b/examples/contrib/mnist/mnist_with_neptune_logger.py @@ -2,7 +2,7 @@ MNIST example with training and validation monitoring using Neptune. 
Requirements: - Neptune: `pip install neptune-client` + Neptune: `pip install neptune` Usage: @@ -11,11 +11,11 @@ python mnist_with_neptune_logger.py ``` - Go to https://neptune.ai and explore your experiment. + Go to https://neptune.ai and explore your run. Note: - You can see an example experiment here: - https://ui.neptune.ai/o/shared/org/pytorch-ignite-integration/e/PYTOR-26/charts + You can view example runs here: + https://app.neptune.ai/o/common/org/pytorch-ignite-integration/ """ from argparse import ArgumentParser @@ -100,17 +100,18 @@ def compute_metrics(engine): npt_logger = NeptuneLogger( api_token="ANONYMOUS", - project_name="shared/pytorch-ignite-integration", + project="common/pytorch-ignite-integration", name="ignite-mnist-example", - params={ - "train_batch_size": train_batch_size, - "val_batch_size": val_batch_size, - "epochs": epochs, - "lr": lr, - "momentum": momentum, - }, ) + npt_logger.experiment["params"] = { + "train_batch_size": train_batch_size, + "val_batch_size": val_batch_size, + "epochs": epochs, + "lr": lr, + "momentum": momentum, + } + npt_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED(every=100), diff --git a/examples/notebooks/TextCNN.ipynb b/examples/notebooks/TextCNN.ipynb index 2a9eb56ba9d4..82fef682f7db 100644 --- a/examples/notebooks/TextCNN.ipynb +++ b/examples/notebooks/TextCNN.ipynb @@ -6,7 +6,7 @@ "id": "RfRKTxQO51bK" }, "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/pytorch/ignite/blob/master/examples/notebooks/TextCNN.ipynb)" + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pytorch/ignite/blob/master/examples/notebooks/TextCNN.ipynb)" ] }, { @@ -42,7 +42,7 @@ "source": [ "## Required Dependencies \n", "\n", - "In this example we only need torchtext and spacy package, assuming that `torch` and `ignite` are already installed. 
We can install it using `pip`:\n", + "In this example we only need `torchtext` and `spacy` package, assuming that `torch` and `ignite` are already installed. We can install it using `pip`:\n", "\n", "`pip install torchtext==0.9.1 spacy`\n", "\n", @@ -1002,4 +1002,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/examples/references/classification/imagenet/NOTES_ClearML.md b/examples/references/classification/imagenet/NOTES_ClearML.md deleted file mode 100644 index 6274191c9544..000000000000 --- a/examples/references/classification/imagenet/NOTES_ClearML.md +++ /dev/null @@ -1,208 +0,0 @@ -# Experiments tracking with ClearML - -[Allegro ClearML](https://allegro.ai/clearml/docs/) is a full system open source ML / DL experiment manager and ML-Ops solution. -It is composed of a server, Python SDK and web UI. **Allegro ClearML** enables data scientists and data engineers -to effortlessly track, manage, compare and collaborate on their experiments as well as easily manage their -training workloads on remote machines. - -## Install ClearML - -Install [clearml](https://github.com/allegroai/clearml) by executing the following command: - -```bash -pip install --upgrade clearml -``` - -## Install requirements - -```bash -pip install -r requirements.txt -``` - -We need to also install Nvidia/APEX and libraries for opencv. -**Important**, please, check the content of `experiments/setup_opencv.sh` before running the script. - -```bash -sh experiments/setup_apex.sh - -sh experiments/setup_opencv.sh -``` - -#### Download ImageNet dataset - -Since 10/2019, we need to register an account in order to download the dataset. 
-To download the dataset, use the following form : http://www.image-net.org/download.php - -## Setup the environment variables - -### Setup the dataset path - -To configure the path to already existing ImageNet dataset, please specify `DATASET_PATH` environment variable - -```bash -export DATASET_PATH=/path/to/imagenet -# export DATASET_PATH=$PWD/input/imagenet -``` - -## Run the experiment code - -In **ClearML**, when you run the experiment code, `clearml` stores the experiment in [clearml-server](https://github.com/allegroai/clearml-server). - -By default, `clearml` works with the demo **ClearML Server** ([https://demoapp.trains.allegro.ai/dashboard](https://demoapp.trains.allegro.ai/dashboard)), -which is open to anyone (although once a week it is refreshing and deleting all data). You can also set up your own [self-hosted](https://github.com/allegroai/clearml-server) **ClearML Server**. - -After the experiment code runs once, you can [reproduce the experiment](#reproducing-the-experiment) using the -**ClearML Web-App (UI)**, which is part of `clearml-server`. You only need to run the code once to store it -in `clearml-server`. - -### Setup - -This setup is a specific for this code and is not required in general usage of ClearML. -We setup an output path as a local storage: - -```bash -export CLEARML_OUTPUT_PATH=/path/to/output/clearml -# e.g export CLEARML_OUTPUT_PATH=$PWD/output/clearml -``` - -This environment variable helps to choose ClearML as experiment tracking system among all others. - -### ClearML fileserver setup - -The configuration to upload artifact must be done by modifying the `clearml` configuration file `~/clearml.conf` -generated by `clearml-init`. According to the -[documentation](https://allegro.ai/docs/examples/reporting/artifacts/), the `output_uri` argument can be -configured in `sdk.development.default_output_uri` to fileserver uri. If server is self-hosted, `ClearML` fileserver uri is -`http://localhost:8081`. 
- -For more details, see https://allegro.ai/docs/examples/reporting/artifacts/ - -### Run the code - -#### Training on single node and single GPU - -Please, make sure to adapt training data loader batch size to your GPU type. By default, batch size is 64 per process. - -Execute the following command: - -```bash -export CLEARML_OUTPUT_PATH=/path/to/output/clearml -# e.g export CLEARML_OUTPUT_PATH=$PWD/output/clearml -export PYTHONPATH=$PWD/code:$PYTHONPATH - -py_config_runner ./code/scripts/training.py ./configs/train/baseline_resnet50.py -``` - -#### Training on single node and multiple GPUs - -Please, make sure to adapt training data loader batch size to your GPU type. By default, batch size is 64 per process. - -```bash -export CLEARML_OUTPUT_PATH=/path/to/output/clearml -# e.g export CLEARML_OUTPUT_PATH=$PWD/output/clearml -export PYTHONPATH=$PWD/code:$PYTHONPATH - -python -m torch.distributed.launch --nproc 2 --use_env -m py_config_runner ./code/scripts/training.py ./configs/train/baseline_resnet50.py -``` - -In **ClearML Web-App** a new project named _"ImageNet Training"_ will be created, -with an experiment named _"baseline_resnet50"_ inside. - -In your local environment, the console output includes the URL of the experiment's **RESULTS** page. - -You can now view your experiment in **ClearML** by clicking the link or copying the URL into your browser. -It opens the results in the experiment's details pane, in the **ClearML Web-App (UI)**. - -#### ClearML automatic Logging - -When the experiment code runs, **ClearML** automatically logs your environment, code, and the outputs. -Which means that you don't need to change your code. 
- -All you need is 2 lines of integration at the top of your main script - -```python -from clearml import Task -Task.init("ImageNet Training", "baseline_resnet50") -``` - -Once it's there, the following will be automatically logged by **ClearML**: - -- **Resource Monitoring** CPU/GPU utilization, temperature, IO, network, etc -- **Development Environment** Python environment, Git (repo, branch, commit) including uncommitted changes -- **Configuration** Including configuration files, command line arguments (ArgParser), and general dictionaries -- Full **stdout** and **stderr** automatic logging -- Model snapshots, with optional automatic upload to central storage. -- Artifacts log & store, including shared folders, S3, GS, Azure, and Http/s -- Matplotlib / Seaborn / TensorBoard / TensorBoardX scalars, metrics, histograms, images, audio, video, etc - -Additionally, **ClearML** supports explicit logging by adding calls to the **ClearML** Python client `Logger` -class methods in the code. For more information, -see [Explicit Reporting](https://allegro.ai/docs/examples/examples_explicit_reporting/) in the **ClearML** documentation. - -## Track the experiment and visualize the results - -In the **ClearML Web-App (UI)**, track the experiment and visualize results in the experiment's details pane, -which is organized in tabs and provides the following information; - -- Source code, uncommitted changes, Python packages and versions, and other information, in the **EXECUTION** tab -- Hyperparameters in the **HYPERPARAMETERS** tab -- Input model, Configuration, Output model, and other artifacts in the **ARTIFACTS** tab -- Experiment Comments and General experiment information in the **INFO** tab -- Results in the **RESULTS** tab, including the log, scalar metric plots, plots of any data, and debug samples - -## Reproducing the experiments - -In **ClearML**, reproduce experiments using `clearml-agent` for remote execution. 
Rerun the same experiment, -by making an exact copy of it (a clone), and remotely execute the cloned experiment. - -First, install `clearml-agent` and then configure it to work with your self-hosted **ClearML Server**. - -Once `clearml-agent` is installed and configured, run `clearml-agent daemon`. -In **ClearML**, we call these _workers_, they pop experiments from a job execution queue and execute them. -Every machine with a _clearml-agent daemon_, becomes a registered _worker_ in your **clearml-server** cluster. - -Using the **ClearML Web-App** you can easily send experiments to be remotely executed on one of these machines. - -More details can be found on the _clearml-agent_ [github](https://github.com/allegroai/clearml-agent/) - -### Install and configure clearml-agent - -1. Install `clearml-agent` - - pip install clearml-agent - -1. Configure `clearml-agent` by running the setup wizard - - clearml-agent init - -### Remotely execute the experiment - -1. Start a **ClearML** worker. Run a `clearml-agent daemon` listening to a queue - - For example, run a `clearml-agent daemon` listening to the `default` queue and using multiple GPUs. - - clearml-agent daemon --gpus 0,1 --queue default - -1. Locate the experiment. In the **ClearML Web-App (UI)**, Projects page, click on the project card - -1. Make a copy of the experiment - - 1. In the experiment table, right-click the experiment - 1. On the sub-menu, select **Clone** - 1. Select the project, type a name for the copy, and type a description, or accept the defaults - 1. Click the **CLONE** button - - The copy of the experiment is created. Its details pane opens. - -1. Send the experiment for remote execution, by enqueuing it in one of the job execution queues - - 1. In the experiment table, right-click the experiment - 1. On the sub-menu, select **Enqueue** - 1. Select the _default_ queue - 1. Click the **ENQUEUE** button - - The experiment's status changes to Pending. 
- -When the experiment reaches the top of the job execution queue, the `clearml-agent deamon` fetches it, -its status changes to Running, and `clearml-agent` executes it while logging and monitoring. -You can track the experiment while it is in progress, and anytime afterwards. diff --git a/examples/references/classification/imagenet/NOTES_MLflow.md b/examples/references/classification/imagenet/NOTES_MLflow.md deleted file mode 100644 index b9074b3c8c50..000000000000 --- a/examples/references/classification/imagenet/NOTES_MLflow.md +++ /dev/null @@ -1,127 +0,0 @@ -# Experiments tracking with MLflow - -User can run ImageNet training using MLflow experiments tracking system on the local machine. - -## Requirements - -We use `conda` and [MLflow](https://github.com/mlflow/mlflow) to -handle experiments/runs and all python dependencies. -Please, install these tools: - -- [MLflow](https://github.com/mlflow/mlflow): `pip install mlflow` -- [conda](https://conda.io/en/latest/miniconda.html) - -We need to also install Nvidia/APEX and libraries for opencv. APEX is automatically installed on the first run. -Manually, all can be installed with the following commands. -**Important**, please, check the content of `experiments/setup_opencv.sh` before running. - -```bash -sh experiments/setup_apex.sh - -sh experiments/setup_opencv.sh -``` - -## Usage - -### Download ImageNet-1k dataset - -Since 10/2019, we need to register an account in order to download the dataset. 
-To download the dataset, use the following form : http://www.image-net.org/download.php - -### Setup dataset path - -To configure the path to already existing ImageNet dataset, please specify `DATASET_PATH` environment variable - -```bash -export DATASET_PATH=/path/to/imagenet -# export DATASET_PATH=$PWD/input/imagenet -``` - -### MLflow setup - -Setup mlflow output path as a local storage (option with remote storage is not supported): - -```bash -export MLFLOW_TRACKING_URI=/path/to/output/mlruns -# e.g export MLFLOW_TRACKING_URI=$PWD/output/mlruns -``` - -Create once "Trainings" experiment - -```bash -mlflow experiments create -n Trainings -``` - -or check existing experiments: - -```bash -mlflow experiments list -``` - -### Training on single node with single GPU - -Please, make sure to adapt training data loader batch size to your GPU type. By default, batch size is 64. - -```bash -export MLFLOW_TRACKING_URI=/path/to/output/mlruns -# e.g export MLFLOW_TRACKING_URI=$PWD/output/mlruns -mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs/train/baseline_r50.py -P num_gpus=1 -``` - -### Training on single node with multiple GPUs - -For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. -By default, batch size is 64 per process. 
- -```bash -export MLFLOW_TRACKING_URI=/path/to/output/mlruns -# e.g export MLFLOW_TRACKING_URI=$PWD/output/mlruns -mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs/train/baseline_r50.py -P num_gpus=2 -``` - -## Training tracking - -### MLflow dashboard - -To visualize experiments and runs, user can start mlflow dashboard: - -```bash -mlflow server --backend-store-uri /path/to/output/mlruns --default-ainfrastructure/path/to/output/mlruns -p 6026 -h 0.0.0.0 -# e.g mlflow server --backend-store-uri $PWD/output/mlruns --default-artifact-root $PWD/output/mlruns -p 6026 -h 0.0.0.0 -``` - -### Tensorboard dashboard - -To visualize experiments and runs, user can start tensorboard: - -```bash -tensorboard --logdir /path/to/output/mlruns/1 -# e.g tensorboard --logdir $PWD/output/mlruns/1 -``` - -where `/1` points to "Training" experiment. - -## Implementation details - -Files tree description: - -``` -code -configs -experiments/mlflow : MLflow related files -notebooks -``` - -### Experiments - -- [conda.yaml](experiments/mlflow/conda.yaml): defines all python dependencies necessary for our experimentations -- [MLproject](experiments/mlflow/MLproject): defines types of experiments we would like to perform by "entry points": - - main : starts single-node multi-GPU training script - -When we execute - -```bash -mlflow run experiments/mlflow --experiment-name=Trainings -P config_path=configs/train/baseline_r50.py -P num_gpus=2 -``` - -it executes `main` entry point from [MLproject](experiments/mlflow/MLproject) and runs provided command. 
diff --git a/examples/references/classification/imagenet/NOTES_Polyaxon.md b/examples/references/classification/imagenet/NOTES_Polyaxon.md deleted file mode 100644 index a761f33146ae..000000000000 --- a/examples/references/classification/imagenet/NOTES_Polyaxon.md +++ /dev/null @@ -1,60 +0,0 @@ -# Experiments tracking with Polyaxon - -User can run ImageNet training using [Polyaxon experiments tracking system](https://polyaxon.com/). - -## Requirements - -In this case we assume, user has [Polyaxon](https://polyaxon.com/) installed on a machine/cluster/cloud and can schedule experiments with `polyaxon-cli`. - -## Usage - -### Setup Polyaxon project - -Create project on the cluster - -```bash -polyaxon project create --name=imagenet --description="Classification on ImageNet" -``` - -Initialize local project - -```bash -polyaxon init imagenet -``` - -Please rename and modify `experiments/plx/xp_training.yml.tmpl` to `experiments/plx/xp_training.yml` -to adapt to your cluster configuration. - -#### Download ImageNet dataset - -Since 10/2019, we need to register an account in order to download the dataset. -To download the dataset, use the following form : http://www.image-net.org/download.php - -### Training on single node with single or multiple GPU - -For optimal devices usage, please, make sure to adapt training data loader batch size to your infrastructure. -By default, batch size is 64 per process. 
Please, adapt `xp_training.yml` to your cluster configuration and run it, for example, as - -```bash -polyaxon run -u -f experiments/plx/xp_training.yml --name="baseline_resnet50" --tags=train,resnet50 -``` - -## Training tracking - -Please, see Polyaxon dashboard usage at https://docs.polyaxon.com/ - -## Implementation details - -Files tree description: - -``` -code -configs -experiments/plx : Polyaxon related files -notebooks -``` - -### Experiments - -File [xp_training.yml.tmpl](experiments/mlflow/xp_training.yml.tmpl) defines all configurations and dependencies -necessary for our experimentations. Part `run.cmd` starts single-node multi-GPU training script. diff --git a/examples/references/classification/imagenet/README.md b/examples/references/classification/imagenet/README.md index 9228d483a022..02efeba9dc57 100644 --- a/examples/references/classification/imagenet/README.md +++ b/examples/references/classification/imagenet/README.md @@ -5,87 +5,60 @@ dataset. Features: -- Distributed training with mixed precision by [nvidia/apex](https://github.com/NVIDIA/apex/) -- Experiments tracking with [MLflow](https://mlflow.org/) or [Polyaxon](https://polyaxon.com/) or [ClearML](https://github.com/allegroai/clearml) +- Distributed training with native automatic mixed precision +- Experiments tracking with [ClearML](https://github.com/allegroai/clearml) -![tb_dashboard](assets/tb_dashboard.png) - -There are three possible options: 1) Experiments tracking with MLflow, 2) Experiments tracking with Polyaxon or 3) Experiments tracking with ClearML. - -Experiments tracking with ClearML / MLflow is more suitable for a local machine with GPU(s). For experiments tracking with Polyaxon -user needs to have Polyaxon installed on a machine/cluster/cloud and can schedule experiments with `polyaxon-cli`. -User can choose one option and skip the descriptions of another option. 
- -- Notes for [experiments tracking with MLflow](NOTES_MLflow.md) -- Notes for [experiments tracking with Polyaxon](NOTES_Polyaxon.md) -- Notes for [experiments tracking with ClearML](NOTES_ClearML.md) +| Model | Training Top-1 Accuracy | Training Top-5 Accuracy | Test Top-1 Accuracy | Test Top-5 Accuracy | +| --------- | ----------------------- | ----------------------- | ------------------- | ------------------- | +| ResNet-50 | 78% | 92% | 77% | 94% | -## Implementation details +Experiment | Model | Training Top-1 Accuracy | Training Top-5 Accuracy | Test Top-1 Accuracy | Test Top-5 Accuracy | ClearML Link +---|---|---|---|---|---|--- +configs/???.py | -Files tree description: +## Setup ``` -code - |___ dataflow : module privides data loaders and various transformers - |___ scripts : executable training script - |___ utils : other helper modules - -configs - |___ train : training python configuration files - -experiments - |___ mlflow : MLflow related files - |___ plx : Polyaxon related files - |___ clearml : requirements.txt to install ClearML python package - -notebooks : jupyter notebooks to check specific parts from code modules +pip install -r requirements.txt ``` -## Code and configs - -### [py_config_runner](https://github.com/vfdev-5/py_config_runner) +### Docker -We use [py_config_runner](https://github.com/vfdev-5/py_config_runner) package to execute python scripts with python configuration files. +For docker users, you can use the following images to run the example: +```bash +docker pull pytorchignite/vision:latest +``` -### Training script +and install other requirements as suggested above -Training script is located [code/scripts](code/scripts/) and contains +## Usage -- `training.py`, single training script with possiblity to use one of MLflow / Polayaxon / ClearML experiments tracking systems. +Please, export the `DATASET_PATH` environment variable for the ImageNet dataset. 
-Training script contains `run` method required by [py_config_runner](https://github.com/vfdev-5/py_config_runner) to -run a script with a configuration. +```bash +export DATASET_PATH=/path/to/imagenet +# e.g. export DATASET_PATH=/data/ where "train", "val", "meta.bin" are located +``` -The split between training script and configuration python file is the following. -Configuration file being a python script defines necessary components for neural network training: +### Training -- Dataflow: training/validation/train evaluation data loaders with custom data augmentations -- Model -- Optimizer -- Criterion -- LR scheduler -- other parameters: device, number of epochs, etc +#### Single GPU -Training script uses these components to setup and run training and validation loops. By default, -processing group with "nccl" backend is initialized for distributed configuration (even for a single GPU). +- Adjust batch size for your GPU type in the configuration file: `configs/baseline_resnet50.py` or `configs/baseline_resnet50.py` -Training script is generic, uses [`ignite.distributed` API](https://pytorch.org/ignite/master/distributed.html), and adapts -training components to provided distributed configuration (e.g. uses DistribtedDataParallel model wrapper, -uses distributed sampling, scales batch size etc). 
+Run the following command: +```bash +CUDA_VISIBLE_DEVICES=0 python -u main.py training configs/baseline_resnet50.py +``` +#### Multiple GPUs ### Configurations +- Adjust total batch size for your GPUs in the configuration file: `configs/baseline_resnet50.py` or `configs/check_baseline_resnet50.py` -- [baseline_resnet50.py](configs/train/baseline_resnet50.py) : trains ResNet50 +```bash +OMP_NUM_THREADS=1 torchrun --nproc_per_node=2 main.py training configs/baseline_resnet50.py +``` -### Results - -| Model | Training Top-1 Accuracy | Training Top-5 Accuracy | Test Top-1 Accuracy | Test Top-5 Accuracy | -| --------- | ----------------------- | ----------------------- | ------------------- | ------------------- | -| ResNet-50 | 78% | 92% | 77% | 94% | ## Acknowledgements -Part of trainings was done within [Tesla GPU Test Drive](https://www.nvidia.com/en-us/data-center/tesla/gpu-test-drive/) -on 2 Nvidia V100 GPUs. - -![tb_dashboard_images](assets/tb_dashboard_images.png) +Trainings were done using credits provided by [trainml.ai](https://trainml.ai) platform.
diff --git a/examples/references/classification/imagenet/assets/tb_dashboard.png b/examples/references/classification/imagenet/assets/tb_dashboard.png deleted file mode 100644 index 63ccf73e5325..000000000000 Binary files a/examples/references/classification/imagenet/assets/tb_dashboard.png and /dev/null differ diff --git a/examples/references/classification/imagenet/assets/tb_dashboard_images.png b/examples/references/classification/imagenet/assets/tb_dashboard_images.png deleted file mode 100644 index 24ef188f7a8b..000000000000 Binary files a/examples/references/classification/imagenet/assets/tb_dashboard_images.png and /dev/null differ diff --git a/examples/references/classification/imagenet/code/dataflow/__init__.py b/examples/references/classification/imagenet/code/dataflow/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/examples/references/classification/imagenet/code/dataflow/dataloaders.py b/examples/references/classification/imagenet/code/dataflow/dataloaders.py deleted file mode 100644 index 8097b724d105..000000000000 --- a/examples/references/classification/imagenet/code/dataflow/dataloaders.py +++ /dev/null @@ -1,67 +0,0 @@ -from typing import Callable, Optional, Tuple - -import cv2 -import numpy as np -from torch.utils.data import DataLoader -from torch.utils.data.dataset import Subset -from torchvision.datasets import ImageNet - -import ignite.distributed as idist - - -def opencv_loader(path): - img = cv2.imread(path) - assert img is not None, f"Image at '{path}' has a problem" - return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - - -def get_train_val_loaders( - root_path: str, - train_transforms: Callable, - val_transforms: Callable, - batch_size: int = 16, - num_workers: int = 8, - val_batch_size: Optional[int] = None, - limit_train_num_samples: Optional[int] = None, - limit_val_num_samples: Optional[int] = None, -) -> Tuple[DataLoader, DataLoader, DataLoader]: - - train_ds = ImageNet( - root_path, split="train", 
transform=lambda sample: train_transforms(image=sample)["image"], loader=opencv_loader - ) - val_ds = ImageNet( - root_path, split="val", transform=lambda sample: val_transforms(image=sample)["image"], loader=opencv_loader - ) - - if limit_train_num_samples is not None: - np.random.seed(limit_train_num_samples) - train_indices = np.random.permutation(len(train_ds))[:limit_train_num_samples] - train_ds = Subset(train_ds, train_indices) - - if limit_val_num_samples is not None: - np.random.seed(limit_val_num_samples) - val_indices = np.random.permutation(len(val_ds))[:limit_val_num_samples] - val_ds = Subset(val_ds, val_indices) - - # random samples for evaluation on training dataset - if len(val_ds) < len(train_ds): - np.random.seed(len(val_ds)) - train_eval_indices = np.random.permutation(len(train_ds))[: len(val_ds)] - train_eval_ds = Subset(train_ds, train_eval_indices) - else: - train_eval_ds = train_ds - - train_loader = idist.auto_dataloader( - train_ds, shuffle=True, batch_size=batch_size, num_workers=num_workers, drop_last=True - ) - - val_batch_size = batch_size * 4 if val_batch_size is None else val_batch_size - val_loader = idist.auto_dataloader( - val_ds, shuffle=False, batch_size=val_batch_size, num_workers=num_workers, drop_last=False - ) - - train_eval_loader = idist.auto_dataloader( - train_eval_ds, shuffle=False, batch_size=val_batch_size, num_workers=num_workers, drop_last=False - ) - - return train_loader, val_loader, train_eval_loader diff --git a/examples/references/classification/imagenet/code/dataflow/transforms.py b/examples/references/classification/imagenet/code/dataflow/transforms.py deleted file mode 100644 index 626eb87d1548..000000000000 --- a/examples/references/classification/imagenet/code/dataflow/transforms.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch - - -def denormalize(t, mean, std, max_pixel_value=255): - assert isinstance(t, torch.Tensor), f"{type(t)}" - assert t.ndim == 3 - d = t.device - mean = torch.tensor(mean, 
device=d).unsqueeze(-1).unsqueeze(-1) - std = torch.tensor(std, device=d).unsqueeze(-1).unsqueeze(-1) - tensor = std * t + mean - tensor *= max_pixel_value - return tensor diff --git a/examples/references/classification/imagenet/code/scripts/training.py b/examples/references/classification/imagenet/code/scripts/training.py deleted file mode 100644 index 5cdb46585b98..000000000000 --- a/examples/references/classification/imagenet/code/scripts/training.py +++ /dev/null @@ -1,331 +0,0 @@ -# This a training script launched with py_config_runner -# It should obligatory contain `run(config, **kwargs)` method - -from pathlib import Path - -import torch -from apex import amp -from py_config_runner.config_utils import assert_config, get_params, TRAINVAL_CONFIG -from py_config_runner.utils import set_seed -from utils import exp_tracking -from utils.handlers import predictions_gt_images_handler - -import ignite -import ignite.distributed as idist -from ignite.contrib.engines import common -from ignite.engine import _prepare_batch, create_supervised_evaluator, Engine, Events -from ignite.metrics import Accuracy, TopKCategoricalAccuracy -from ignite.utils import setup_logger - - -def initialize(config): - - model = config.model.to(config.device) - optimizer = config.optimizer - # Setup Nvidia/Apex AMP - model, optimizer = amp.initialize(model, optimizer, opt_level=getattr(config, "fp16_opt_level", "O2"), num_losses=1) - - # Adapt model to dist conf - model = idist.auto_model(model) - - criterion = config.criterion.to(config.device) - - return model, optimizer, criterion - - -def create_trainer(model, optimizer, criterion, train_sampler, config, logger): - prepare_batch = config.prepare_batch - device = config.device - - # Setup trainer - accumulation_steps = getattr(config, "accumulation_steps", 1) - model_output_transform = getattr(config, "model_output_transform", lambda x: x) - - def train_update_function(engine, batch): - - model.train() - - x, y = prepare_batch(batch, 
device=device, non_blocking=True) - y_pred = model(x) - y_pred = model_output_transform(y_pred) - loss = criterion(y_pred, y) / accumulation_steps - - with amp.scale_loss(loss, optimizer, loss_id=0) as scaled_loss: - scaled_loss.backward() - - if engine.state.iteration % accumulation_steps == 0: - optimizer.step() - optimizer.zero_grad() - - return { - "supervised batch loss": loss.item(), - } - - output_names = getattr(config, "output_names", ["supervised batch loss"]) - lr_scheduler = config.lr_scheduler - - trainer = Engine(train_update_function) - trainer.logger = logger - - to_save = {"model": model, "optimizer": optimizer, "lr_scheduler": lr_scheduler, "trainer": trainer, "amp": amp} - - save_every_iters = getattr(config, "save_every_iters", 1000) - - common.setup_common_training_handlers( - trainer, - train_sampler, - to_save=to_save, - save_every_iters=save_every_iters, - output_path=config.output_path.as_posix(), - lr_scheduler=lr_scheduler, - with_gpu_stats=True, - output_names=output_names, - with_pbars=False, - ) - - common.ProgressBar(persist=False).attach(trainer, metric_names="all") - - return trainer - - -def create_evaluators(model, metrics, config): - model_output_transform = getattr(config, "model_output_transform", lambda x: x) - - evaluator_args = dict( - model=model, - metrics=metrics, - device=config.device, - non_blocking=True, - prepare_batch=config.prepare_batch, - output_transform=lambda x, y, y_pred: (model_output_transform(y_pred), y), - ) - train_evaluator = create_supervised_evaluator(**evaluator_args) - evaluator = create_supervised_evaluator(**evaluator_args) - - common.ProgressBar(persist=False).attach(train_evaluator) - common.ProgressBar(persist=False).attach(evaluator) - - return evaluator, train_evaluator - - -def log_metrics(logger, epoch, elapsed, tag, metrics): - metrics_output = "\n".join([f"\t{k}: {v}" for k, v in metrics.items()]) - logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {elapsed} - {tag} metrics:\n 
{metrics_output}") - - -def log_basic_info(logger, config): - - msg = f"\n- PyTorch version: {torch.__version__}" - msg += f"\n- Ignite version: {ignite.__version__}" - logger.info(msg) - - if idist.get_world_size() > 1: - msg = "\nDistributed setting:" - msg += f"\tbackend: {idist.backend()}" - msg += f"\trank: {idist.get_rank()}" - msg += f"\tworld size: {idist.get_world_size()}" - logger.info(msg) - - -def training(local_rank, config, logger=None): - - if not getattr(config, "use_fp16", True): - raise RuntimeError("This training script uses by default fp16 AMP") - - torch.backends.cudnn.benchmark = True - - set_seed(config.seed + local_rank) - - train_loader, val_loader, train_eval_loader = config.train_loader, config.val_loader, config.train_eval_loader - - # Setup model, optimizer, criterion - model, optimizer, criterion = initialize(config) - - if not hasattr(config, "prepare_batch"): - config.prepare_batch = _prepare_batch - - # Setup trainer for this specific task - trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger) - - if getattr(config, "benchmark_dataflow", False): - benchmark_dataflow_num_iters = getattr(config, "benchmark_dataflow_num_iters", 1000) - DataflowBenchmark(benchmark_dataflow_num_iters, prepare_batch=config.prepare_batch).attach( - trainer, train_loader - ) - - # Setup evaluators - val_metrics = { - "Accuracy": Accuracy(), - "Top-5 Accuracy": TopKCategoricalAccuracy(k=5), - } - - if hasattr(config, "val_metrics") and isinstance(config.val_metrics, dict): - val_metrics.update(config.val_metrics) - - evaluator, train_evaluator = create_evaluators(model, val_metrics, config) - - @trainer.on(Events.EPOCH_COMPLETED(every=getattr(config, "val_interval", 1)) | Events.COMPLETED) - def run_validation(): - epoch = trainer.state.epoch - state = train_evaluator.run(train_eval_loader) - log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) - state = evaluator.run(val_loader) - 
log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) - - if getattr(config, "start_by_validation", False): - trainer.add_event_handler(Events.STARTED, run_validation) - - score_metric_name = "Accuracy" - - if hasattr(config, "es_patience"): - common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) - - # Store 3 best models by validation accuracy: - common.save_best_model_by_val_score( - config.output_path.as_posix(), - evaluator, - model=model, - metric_name=score_metric_name, - n_saved=3, - trainer=trainer, - tag="val", - ) - - if idist.get_rank() == 0: - - tb_logger = common.setup_tb_logging( - config.output_path.as_posix(), - trainer, - optimizer, - evaluators={"training": train_evaluator, "validation": evaluator}, - ) - - exp_tracking_logger = exp_tracking.setup_logging( - trainer, optimizer, evaluators={"training": train_evaluator, "validation": evaluator} - ) - - # Log train/val predictions: - tb_logger.attach( - evaluator, - log_handler=predictions_gt_images_handler( - img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation" - ), - event_name=Events.ITERATION_COMPLETED(once=len(val_loader) // 2), - ) - - tb_logger.attach( - train_evaluator, - log_handler=predictions_gt_images_handler( - img_denormalize_fn=config.img_denormalize, n_images=15, another_engine=trainer, prefix_tag="training" - ), - event_name=Events.ITERATION_COMPLETED(once=len(train_eval_loader) // 2), - ) - - trainer.run(train_loader, max_epochs=config.num_epochs) - - if idist.get_rank() == 0: - tb_logger.close() - exp_tracking_logger.close() - - -def run(config, **kwargs): - """This is the main method to run the training. As this training script is launched with `py_config_runner` - it should obligatory contain `run(config, **kwargs)` method. 
- - """ - - assert torch.cuda.is_available(), torch.cuda.is_available() - assert torch.backends.cudnn.enabled, "Nvidia/Amp requires cudnn backend to be enabled." - - with idist.Parallel(backend="nccl") as parallel: - - logger = setup_logger(name="ImageNet Training", distributed_rank=idist.get_rank()) - - assert_config(config, TRAINVAL_CONFIG) - # The following attributes are automatically added by py_config_runner - assert hasattr(config, "config_filepath") and isinstance(config.config_filepath, Path) - assert hasattr(config, "script_filepath") and isinstance(config.script_filepath, Path) - - if idist.get_rank() == 0 and exp_tracking.has_clearml: - try: - from clearml import Task - except ImportError: - # Backwards-compatibility for legacy Trains SDK - from trains import Task - - task = Task.init("ImageNet Training", config.config_filepath.stem) - task.connect_configuration(config.config_filepath.as_posix()) - - log_basic_info(logger, config) - - config.output_path = Path(exp_tracking.get_output_path()) - # dump python files to reproduce the run - exp_tracking.log_artifact(config.config_filepath.as_posix()) - exp_tracking.log_artifact(config.script_filepath.as_posix()) - exp_tracking.log_params(get_params(config, TRAINVAL_CONFIG)) - - try: - parallel.run(training, config, logger=logger) - except KeyboardInterrupt: - logger.info("Catched KeyboardInterrupt -> exit") - except Exception as e: # noqa - logger.exception("") - raise e - - -class DataflowBenchmark: - def __init__(self, num_iters=100, prepare_batch=None): - - from ignite.handlers import Timer - - device = idist.device() - - def upload_to_gpu(engine, batch): - if prepare_batch is not None: - x, y = prepare_batch(batch, device=device, non_blocking=False) - - self.num_iters = num_iters - self.benchmark_dataflow = Engine(upload_to_gpu) - - @self.benchmark_dataflow.on(Events.ITERATION_COMPLETED(once=num_iters)) - def stop_benchmark_dataflow(engine): - engine.terminate() - - if idist.get_rank() == 0: - - 
@self.benchmark_dataflow.on(Events.ITERATION_COMPLETED(every=num_iters // 100)) - def show_progress_benchmark_dataflow(engine): - print(".", end=" ") - - self.timer = Timer(average=False) - self.timer.attach( - self.benchmark_dataflow, - start=Events.EPOCH_STARTED, - resume=Events.ITERATION_STARTED, - pause=Events.ITERATION_COMPLETED, - step=Events.ITERATION_COMPLETED, - ) - - def attach(self, trainer, train_loader): - - from torch.utils.data import DataLoader - - @trainer.on(Events.STARTED) - def run_benchmark(_): - if idist.get_rank() == 0: - print("-" * 50) - print(" - Dataflow benchmark") - - self.benchmark_dataflow.run(train_loader) - t = self.timer.value() - - if idist.get_rank() == 0: - print(" ") - print(f" Total time ({self.num_iters} iterations) : {t:.5f} seconds") - print(f" time per iteration : {t / self.num_iters} seconds") - - if isinstance(train_loader, DataLoader): - num_images = train_loader.batch_size * self.num_iters - print(f" number of images / s : {num_images / t}") - - print("-" * 50) diff --git a/examples/references/classification/imagenet/code/utils/__init__.py b/examples/references/classification/imagenet/code/utils/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/examples/references/classification/imagenet/code/utils/exp_tracking.py b/examples/references/classification/imagenet/code/utils/exp_tracking.py deleted file mode 100644 index d1509848015d..000000000000 --- a/examples/references/classification/imagenet/code/utils/exp_tracking.py +++ /dev/null @@ -1,148 +0,0 @@ -# Module for common exp tracking methods - -import os -from pathlib import Path - -import torch - -import ignite -import ignite.distributed as idist -from ignite.contrib.engines import common - -try: - import polyaxon.tracking # noqa: F401 - - has_plx = True -except ImportError: - try: - import polyaxon_client.tracking # noqa: F401 - - if "POLYAXON_RUN_OUTPUTS_PATH" not in os.environ: - raise ImportError("Not in Polyaxon cluster") - - 
has_plx = True - except ImportError: - has_plx = False - -try: - import mlflow - - if "MLFLOW_TRACKING_URI" not in os.environ: - raise ImportError("MLFLOW_TRACKING_URI should be defined") - - has_mlflow = True -except ImportError: - has_mlflow = False - - -try: - try: - import clearml # noqa: F401 - except ImportError: - import trains # noqa: F401 - - if "CLEARML_OUTPUT_PATH" not in os.environ: - raise ImportError("CLEARML_OUTPUT_PATH should be defined") - - has_clearml = True - clearml_output_path = None -except ImportError: - has_clearml = False - - -def _plx_get_output_path(): - from polyaxon_client.tracking import get_outputs_path - - return get_outputs_path() - - -@idist.one_rank_only() -def _plx_log_artifact(fp): - from polyaxon_client.tracking import Experiment - - plx_exp = Experiment() - plx_exp.log_artifact(fp) - - -@idist.one_rank_only() -def _plx_log_params(params_dict): - from polyaxon_client.tracking import Experiment - - plx_exp = Experiment() - plx_exp.log_inputs(**{"pytorch version": torch.__version__, "ignite version": ignite.__version__}) - plx_exp.log_inputs(**params_dict) - - -def _mlflow_get_output_path(): - return mlflow.get_artifact_uri() - - -@idist.one_rank_only() -def _mlflow_log_artifact(fp): - mlflow.log_artifact(fp) - - -@idist.one_rank_only() -def _mlflow_log_params(params_dict): - mlflow.log_params({"pytorch version": torch.__version__, "ignite version": ignite.__version__}) - mlflow.log_params(params_dict) - - -def _clearml_get_output_path(): - global clearml_output_path - - if clearml_output_path is None: - from datetime import datetime - - output_path = Path(os.environ["CLEARML_OUTPUT_PATH"]) - output_path = output_path / "clearml" / datetime.now().strftime("%Y%m%d-%H%M%S") - clearml_output_path = output_path - - return clearml_output_path.as_posix() - - -@idist.one_rank_only() -def _clearml_log_artifact(fp): - try: - from clearml import Task - except ImportError: - # Backwards-compatibility for legacy Trains SDK - from trains 
import Task - - task = Task.current_task() - task.upload_artifact(Path(fp).name, fp) - - -@idist.one_rank_only() -def _clearml_log_params(params_dict): - try: - from clearml import Task - except ImportError: - # Backwards-compatibility for legacy Trains SDK - from trains import Task - - task = Task.current_task() - task.connect(params_dict) - - -if has_plx: - get_output_path = _plx_get_output_path - log_params = _plx_log_params - setup_logging = common.setup_plx_logging - log_artifact = _plx_log_artifact -elif has_mlflow: - get_output_path = _mlflow_get_output_path - log_params = _mlflow_log_params - setup_logging = common.setup_mlflow_logging - log_artifact = _mlflow_log_artifact -elif has_clearml: - get_output_path = _clearml_get_output_path - log_params = _clearml_log_params - setup_logging = common.setup_clearml_logging - log_artifact = _clearml_log_artifact -else: - raise RuntimeError( - "No experiment tracking system is setup. " - "Please, setup either MLflow, Polyaxon or ClearML. " - "For more details see NOTES_*.md" - ) diff --git a/examples/references/classification/imagenet/code/utils/handlers.py b/examples/references/classification/imagenet/code/utils/handlers.py deleted file mode 100644 index b64b027e4b11..000000000000 --- a/examples/references/classification/imagenet/code/utils/handlers.py +++ /dev/null @@ -1,33 +0,0 @@ -import torch -from dataflow.vis import make_grid - - -def predictions_gt_images_handler(img_denormalize_fn, n_images=None, another_engine=None, prefix_tag=None): - def wrapper(engine, logger, event_name): - batch = engine.state.batch - output = engine.state.output - x, y = batch - y_pred = output[0] - - if y.shape == y_pred.shape and y.ndim == 4: - # Case of y of shape (B, C, H, W) - y = torch.argmax(y, dim=1) - - y_pred = torch.argmax(y_pred, dim=1).byte() - - if n_images is not None: - x = x[:n_images, ...] - y = y[:n_images, ...] - y_pred = y_pred[:n_images, ...] 
- - grid_pred_gt = make_grid(x, y_pred, img_denormalize_fn, batch_gt=y) - - state = engine.state if another_engine is None else another_engine.state - global_step = state.get_event_attrib_value(event_name) - - tag = "predictions_with_gt" - if prefix_tag is not None: - tag = f"{prefix_tag}: {tag}" - logger.writer.add_image(tag=tag, img_tensor=grid_pred_gt, global_step=global_step, dataformats="HWC") - - return wrapper diff --git a/examples/references/classification/imagenet/configs/train/baseline_resnet50.py b/examples/references/classification/imagenet/configs/baseline_resnet50.py similarity index 91% rename from examples/references/classification/imagenet/configs/train/baseline_resnet50.py rename to examples/references/classification/imagenet/configs/baseline_resnet50.py index f141d35f3f79..d1520c46db9e 100644 --- a/examples/references/classification/imagenet/configs/train/baseline_resnet50.py +++ b/examples/references/classification/imagenet/configs/baseline_resnet50.py @@ -7,8 +7,7 @@ import torch.optim as optim import torch.optim.lr_scheduler as lrs from albumentations.pytorch import ToTensorV2 as ToTensor -from dataflow.dataloaders import get_train_val_loaders -from dataflow.transforms import denormalize +from dataflow import denormalize, get_train_val_loaders from torchvision.models.resnet import resnet50 import ignite.distributed as idist @@ -25,14 +24,12 @@ benchmark_dataflow = True benchmark_dataflow_num_iters = 100 -fp16_opt_level = "O2" -val_interval = 2 - train_crop_size = 224 val_crop_size = 320 batch_size = 64 * idist.get_world_size() # total batch size -num_workers = 10 +num_workers = 8 +val_interval = 2 # ############################## @@ -73,6 +70,8 @@ batch_size=batch_size, num_workers=num_workers, val_batch_size=batch_size, + limit_train_num_samples=batch_size * 6 if debug else None, + limit_val_num_samples=batch_size * 6 if debug else None, ) # Image denormalization function to plot predictions with images @@ -82,7 +81,7 @@ # Setup Model # 
############################## -model = resnet50(pretrained=False) +model = resnet50(weights=None) # ############################## diff --git a/examples/references/classification/imagenet/configs/train/check_baseline_resnet50.py b/examples/references/classification/imagenet/configs/check_baseline_resnet50.py similarity index 90% rename from examples/references/classification/imagenet/configs/train/check_baseline_resnet50.py rename to examples/references/classification/imagenet/configs/check_baseline_resnet50.py index f08fe8dce25d..e3a0e386f83d 100644 --- a/examples/references/classification/imagenet/configs/train/check_baseline_resnet50.py +++ b/examples/references/classification/imagenet/configs/check_baseline_resnet50.py @@ -7,8 +7,7 @@ import torch.optim as optim import torch.optim.lr_scheduler as lrs from albumentations.pytorch import ToTensorV2 as ToTensor -from dataflow.dataloaders import get_train_val_loaders -from dataflow.transforms import denormalize +from dataflow import denormalize, get_train_val_loaders from torchvision.models.resnet import resnet50 import ignite.distributed as idist @@ -19,21 +18,19 @@ seed = 19 device = "cuda" -debug = False +debug = True # config to measure time passed to prepare batches and report measured time before the training benchmark_dataflow = True benchmark_dataflow_num_iters = 100 -fp16_opt_level = "O2" -val_interval = 2 -start_by_validation = True - train_crop_size = 224 val_crop_size = 320 batch_size = 64 * idist.get_world_size() # total batch size -num_workers = 10 +num_workers = 8 +val_interval = 2 +start_by_validation = True # ############################## @@ -74,6 +71,8 @@ batch_size=batch_size, num_workers=num_workers, val_batch_size=batch_size, + limit_train_num_samples=batch_size * 6 if debug else None, + limit_val_num_samples=batch_size * 6 if debug else None, ) # Image denormalization function to plot predictions with images @@ -83,14 +82,14 @@ # Setup Model # ############################## -model = 
resnet50(pretrained=False) +model = resnet50(weights=None) # ############################## # Setup Solver # ############################## -num_epochs = 1 +num_epochs = 2 criterion = nn.CrossEntropyLoss() diff --git a/examples/references/classification/imagenet/dataflow.py b/examples/references/classification/imagenet/dataflow.py new file mode 100644 index 000000000000..4d422d9e26a0 --- /dev/null +++ b/examples/references/classification/imagenet/dataflow.py @@ -0,0 +1,105 @@ +from pathlib import Path +from typing import Callable, Optional, Tuple + +import cv2 + +import torch +from torch.utils.data import DataLoader +from torch.utils.data.dataset import Subset +from torchvision.datasets import ImageFolder + +import ignite.distributed as idist +from ignite.utils import convert_tensor + + +def opencv_loader(path): + img = cv2.imread(path) + assert img is not None, f"Image at '{path}' has a problem" + return cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + +def get_dataloader(dataset, sampler=None, shuffle=False, limit_num_samples=None, **kwargs): + + if limit_num_samples is not None: + g = torch.Generator().manual_seed(limit_num_samples) + indices = torch.randperm(len(dataset), generator=g)[:limit_num_samples] + dataset = Subset(dataset, indices) + + return idist.auto_dataloader(dataset, sampler=sampler, shuffle=(sampler is None) and shuffle, **kwargs) + + +def get_train_val_loaders( + root_path: str, + train_transforms: Callable, + val_transforms: Callable, + batch_size: int = 16, + num_workers: int = 8, + val_batch_size: Optional[int] = None, + limit_train_num_samples: Optional[int] = None, + limit_val_num_samples: Optional[int] = None, +) -> Tuple[DataLoader, DataLoader, DataLoader]: + + train_ds = ImageFolder( + Path(root_path) / "train", + transform=lambda sample: train_transforms(image=sample)["image"], + loader=opencv_loader, + ) + val_ds = ImageFolder( + Path(root_path) / "val", transform=lambda sample: val_transforms(image=sample)["image"], loader=opencv_loader + ) 
+ + if len(val_ds) < len(train_ds): + g = torch.Generator().manual_seed(len(train_ds)) + train_eval_indices = torch.randperm(len(train_ds), generator=g)[: len(val_ds)] + train_eval_ds = Subset(train_ds, train_eval_indices) + else: + train_eval_ds = train_ds + + val_batch_size = batch_size * 4 if val_batch_size is None else val_batch_size + + train_loader = get_dataloader( + train_ds, + shuffle=True, + batch_size=batch_size, + num_workers=num_workers, + drop_last=True, + limit_num_samples=limit_train_num_samples, + ) + + val_loader = get_dataloader( + val_ds, + shuffle=False, + batch_size=val_batch_size, + num_workers=num_workers, + drop_last=False, + limit_num_samples=limit_val_num_samples, + ) + + train_eval_loader = get_dataloader( + train_eval_ds, + shuffle=False, + batch_size=val_batch_size, + num_workers=num_workers, + drop_last=False, + limit_num_samples=limit_val_num_samples, + ) + + return train_loader, val_loader, train_eval_loader + + +def denormalize(t, mean, std, max_pixel_value=255): + assert isinstance(t, torch.Tensor), f"{type(t)}" + assert t.ndim == 3 + d = t.device + mean = torch.tensor(mean, device=d).unsqueeze(-1).unsqueeze(-1) + std = torch.tensor(std, device=d).unsqueeze(-1).unsqueeze(-1) + tensor = std * t + mean + tensor *= max_pixel_value + return tensor + + +def prepare_batch(batch, device, non_blocking): + x, y = batch[0], batch[1] + x = convert_tensor(x, device, non_blocking=non_blocking) + y = convert_tensor(y, device, non_blocking=non_blocking) + return x, y diff --git a/examples/references/classification/imagenet/experiments/clearml/requirements.txt b/examples/references/classification/imagenet/experiments/clearml/requirements.txt deleted file mode 100644 index 0230566c6048..000000000000 --- a/examples/references/classification/imagenet/experiments/clearml/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -clearml diff --git a/examples/references/classification/imagenet/experiments/mlflow/MLproject 
b/examples/references/classification/imagenet/experiments/mlflow/MLproject deleted file mode 100644 index 7a85b3a9a42d..000000000000 --- a/examples/references/classification/imagenet/experiments/mlflow/MLproject +++ /dev/null @@ -1,11 +0,0 @@ -name: ImageNet - -conda_env: conda.yaml - -entry_points: - - main: - parameters: - config_path: path - num_gpus: float - command: "sh ../setup_apex.sh && export PYTHONPATH=$PWD/../../code:$PYTHONPATH && python -m torch.distributed.launch --nproc_per_node={num_gpus} --use_env -m py_config_runner ../../code/scripts/training.py {config_path}" diff --git a/examples/references/classification/imagenet/experiments/mlflow/conda.yaml b/examples/references/classification/imagenet/experiments/mlflow/conda.yaml deleted file mode 100644 index 212ec199d591..000000000000 --- a/examples/references/classification/imagenet/experiments/mlflow/conda.yaml +++ /dev/null @@ -1,19 +0,0 @@ -name: imagenet_env -channels: - - pytorch -dependencies: - - python=3.7 - - numpy - - cudatoolkit - - pytorch - - torchvision - - pip - - pip: - - mlflow - - albumentations - - tqdm - - tensorboardX - - py_config_runner - - pynvml - - pytorch-ignite - - git+https://github.com/vfdev-5/ImageDatasetViz.git diff --git a/examples/references/classification/imagenet/experiments/plx/xp_training.yml.tmpl b/examples/references/classification/imagenet/experiments/plx/xp_training.yml.tmpl deleted file mode 100644 index 590fe4ae0049..000000000000 --- a/examples/references/classification/imagenet/experiments/plx/xp_training.yml.tmpl +++ /dev/null @@ -1,57 +0,0 @@ ---- -version: 1 -kind: experiment - -tags: ["training", "resnet50"] - -# Setup running node: -environment: - node_selector: - polyaxon: multigpu - resources: - gpu: - requests: 2 - limits: 2 - -# Setup running environment: -build: - image: pytorch/pytorch:1.5-cuda10.1-cudnn7-devel - build_steps: - - # For opencv - - apt-get update && - TZ=America/New_York DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata && - 
apt-get -y install --no-install-recommends libglib2.0 libsm6 libxext6 libxrender-dev git - - # Install Nvidia/APEX - - git clone https://github.com/NVIDIA/apex /tmp/apex && cd /tmp/apex && - export TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5" && - pip install --upgrade --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . - - # Install ignite and other useful packages - - pip install --upgrade --pre pytorch-ignite && - pip install git+https://github.com/vfdev-5/ImageDatasetViz.git albumentations && - pip install polyaxon-client tqdm tensorboardX py_config_runner pynvml - - -declarations: - config_file: "baseline_resnet50.py" - script_file: "training.py" - num_gpus: 2 - -run: - cmd: - - export LC_ALL=C.UTF-8 && export LANG=C.UTF-8 - - export PYTHONPATH=$PYTHONPATH:$PWD/code/ - - # Required env variables to define dataset placement - - export DATASET_PATH=/path/to/ImageNet-1k/ - - - export config_file=$PWD/configs/train/{{config_file}} - - export script_file=$PWD/code/scripts/{{script_file}} - - # Copy configuration file to the output - - cp $config_file $POLYAXON_RUN_OUTPUTS_PATH - - cp $script_file $POLYAXON_RUN_OUTPUTS_PATH - - - python -m torch.distributed.launch --nproc_per_node={{num_gpus}} -m py_config_runner $script_file $config_file \ No newline at end of file diff --git a/examples/references/classification/imagenet/experiments/setup_apex.sh b/examples/references/classification/imagenet/experiments/setup_apex.sh deleted file mode 100644 index e4c5bf6b2b31..000000000000 --- a/examples/references/classification/imagenet/experiments/setup_apex.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - - -tmp_apex_path="/tmp/apex" - -python -c "import apex" -res=$? 
- -if [ "$res" -eq "1" ]; then - - echo "Setup NVIDIA Apex" - rm -rf $tmp_apex_path - git clone https://github.com/NVIDIA/apex $tmp_apex_path - cd $tmp_apex_path - export TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5" - pip install --upgrade --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . - -fi diff --git a/examples/references/classification/imagenet/experiments/setup_opencv.sh b/examples/references/classification/imagenet/experiments/setup_opencv.sh deleted file mode 100644 index 94d6da9f2868..000000000000 --- a/examples/references/classification/imagenet/experiments/setup_opencv.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - - -python -c "import cv2" -res=$? - -if [ "$res" -eq "1" ]; then - echo "Install libglib2.0 libsm6 libxext6 libxrender-dev for opencv" - apt-get update - ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime - apt-get install -y tzdata - dpkg-reconfigure --frontend noninteractive tzdata - apt-get -y install --no-install-recommends libglib2.0 libsm6 libxext6 libxrender-dev -fi diff --git a/examples/references/classification/imagenet/main.py b/examples/references/classification/imagenet/main.py new file mode 100644 index 000000000000..8a001260d972 --- /dev/null +++ b/examples/references/classification/imagenet/main.py @@ -0,0 +1,432 @@ +import os +from functools import partial +from pathlib import Path + +import fire +import torch + +try: + from torch.cuda.amp import autocast, GradScaler +except ImportError: + raise RuntimeError("Please, use recent PyTorch version, e.g. 
>=1.6.0") + +import dataflow as data +import utils +import vis +from py_config_runner import ConfigObject, get_params, InferenceConfigSchema, TrainvalConfigSchema + +import ignite.distributed as idist +from ignite.contrib.engines import common +from ignite.engine import Engine, Events +from ignite.handlers import Checkpoint, Timer +from ignite.metrics import Accuracy, Frequency, TopKCategoricalAccuracy +from ignite.utils import manual_seed, setup_logger + + +def training(local_rank, config, logger, with_clearml): + + rank = idist.get_rank() + manual_seed(config.seed + local_rank) + + train_loader = config.train_loader + val_loader = config.val_loader + train_eval_loader = config.train_eval_loader + + model, optimizer, criterion = utils.initialize(config) + + # Setup trainer for this specific task + trainer = create_trainer(model, optimizer, criterion, train_loader.sampler, config, logger, with_clearml) + + # Setup evaluators + accuracy = Accuracy() + val_metrics = { + "Accuracy": accuracy, + "Top-5 Accuracy": TopKCategoricalAccuracy(k=5), + "Error": (1.0 - accuracy) * 100, + } + + if ("val_metrics" in config) and isinstance(config.val_metrics, dict): + val_metrics.update(config.val_metrics) + + evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val") + train_evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="train") + + val_interval = config.get("val_interval", 1) + + # Run validation on every val_interval epoch, in the end of the training + # and in the begining if config.start_by_validation is True + event = Events.EPOCH_COMPLETED(every=val_interval) + if config.num_epochs % val_interval != 0: + event |= Events.COMPLETED + if config.get("start_by_validation", False): + event |= Events.STARTED + + @trainer.on(event) + def run_validation(): + epoch = trainer.state.epoch + state = train_evaluator.run(train_eval_loader) + utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Train", state.metrics) + state = 
evaluator.run(val_loader) + utils.log_metrics(logger, epoch, state.times["COMPLETED"], "Test", state.metrics) + + score_metric_name = "Accuracy" + if "es_patience" in config: + common.add_early_stopping_by_val_score(config.es_patience, evaluator, trainer, metric_name=score_metric_name) + + # Store 2 best models by validation accuracy: + common.gen_save_best_models_by_val_score( + save_handler=utils.get_save_handler(config.output_path.as_posix(), with_clearml), + evaluator=evaluator, + models=model, + metric_name=score_metric_name, + n_saved=2, + trainer=trainer, + tag="val", + ) + + # Setup Tensorboard logger + if rank == 0: + tb_logger = common.setup_tb_logging( + config.output_path.as_posix(), + trainer, + optimizer, + evaluators={"training": train_evaluator, "validation": evaluator}, + ) + + # Log validation predictions as images + # We define a custom event filter to log less frequently the images (to reduce storage size) + # - we plot images with masks of the middle validation batch + # - once every 3 validations and + # - at the end of the training + def custom_event_filter(_, val_iteration): + c1 = val_iteration == 1 + c2 = trainer.state.epoch % (config.get("val_interval", 1) * 3) == 0 + c2 |= trainer.state.epoch == config.num_epochs + return c1 and c2 + + # Image denormalization function to plot predictions with images + mean = config.get("mean", (0.485, 0.456, 0.406)) + std = config.get("std", (0.229, 0.224, 0.225)) + img_denormalize = partial(data.denormalize, mean=mean, std=std) + + tb_logger.attach( + evaluator, + log_handler=vis.predictions_gt_images_handler( + img_denormalize_fn=img_denormalize, n_images=12, another_engine=trainer, prefix_tag="validation" + ), + event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter), + ) + + tb_logger.attach( + train_evaluator, + log_handler=vis.predictions_gt_images_handler( + img_denormalize_fn=img_denormalize, n_images=12, another_engine=trainer, prefix_tag="training" + ), + 
event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter), + ) + + trainer.run(train_loader, max_epochs=config.num_epochs) + + if idist.get_rank() == 0: + tb_logger.close() + + +def create_trainer(model, optimizer, criterion, train_sampler, config, logger, with_clearml): + device = config.device + prepare_batch = data.prepare_batch + + # Setup trainer + accumulation_steps = config.get("accumulation_steps", 1) + model_output_transform = config.get("model_output_transform", lambda x: x) + + with_amp = config.get("with_amp", True) + scaler = GradScaler(enabled=with_amp) + + def training_step(engine, batch): + model.train() + x, y = prepare_batch(batch, device=device, non_blocking=True) + with autocast(enabled=with_amp): + y_pred = model(x) + y_pred = model_output_transform(y_pred) + loss = criterion(y_pred, y) / accumulation_steps + + output = {"supervised batch loss": loss.item(), "num_samples": len(x)} + + scaler.scale(loss).backward() + if engine.state.iteration % accumulation_steps == 0: + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + + return output + + trainer = Engine(training_step) + trainer.logger = logger + + throughput_metric = Frequency(output_transform=lambda x: x["num_samples"]) + throughput_metric.attach(trainer, name="Throughput") + + timer = Timer(average=True) + timer.attach( + trainer, + resume=Events.ITERATION_STARTED, + pause=Events.ITERATION_COMPLETED, + step=Events.ITERATION_COMPLETED, + ) + + @trainer.on(Events.ITERATION_COMPLETED(every=20)) + def log_progress(): + metrics = dict(trainer.state.metrics) + epoch_length = trainer.state.epoch_length + + metrics["ETA (seconds)"] = int((epoch_length - (trainer.state.iteration % epoch_length)) * timer.value()) + + metrics_str = ", ".join([f"{k}: {v}" for k, v in metrics.items()]) + metrics_format = ( + f"[{trainer.state.epoch}/{trainer.state.max_epochs}] " + + f"Iter={trainer.state.iteration % epoch_length}/{epoch_length}: " + + f"{metrics_str}" + ) + 
trainer.logger.info(metrics_format) + + output_names = [ + "supervised batch loss", + ] + lr_scheduler = config.lr_scheduler + + to_save = { + "model": model, + "optimizer": optimizer, + "lr_scheduler": lr_scheduler, + "trainer": trainer, + "amp": scaler, + } + + save_every_iters = config.get("save_every_iters", 1000) + + common.setup_common_training_handlers( + trainer, + train_sampler, + to_save=to_save, + save_every_iters=save_every_iters, + save_handler=utils.get_save_handler(config.output_path.as_posix(), with_clearml), + lr_scheduler=lr_scheduler, + output_names=output_names, + # with_pbars=not with_clearml, + with_pbars=False, + log_every_iters=1, + ) + + resume_from = config.get("resume_from", None) + if resume_from is not None: + checkpoint_fp = Path(resume_from) + assert checkpoint_fp.exists(), f"Checkpoint '{checkpoint_fp.as_posix()}' is not found" + logger.info(f"Resume from a checkpoint: {checkpoint_fp.as_posix()}") + checkpoint = torch.load(checkpoint_fp.as_posix(), map_location="cpu") + Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint) + + return trainer + + +def create_evaluator(model, metrics, config, with_clearml, tag="val"): + model_output_transform = config.get("model_output_transform", lambda x: x) + with_amp = config.get("with_amp", True) + prepare_batch = data.prepare_batch + + @torch.no_grad() + def evaluate_step(engine, batch): + model.eval() + with autocast(enabled=with_amp): + x, y = prepare_batch(batch, device=config.device, non_blocking=True) + y_pred = model(x) + y_pred = model_output_transform(y_pred) + return y_pred, y + + evaluator = Engine(evaluate_step) + + for name, metric in metrics.items(): + metric.attach(evaluator, name) + + if idist.get_rank() == 0 and (not with_clearml): + common.ProgressBar(desc=f"Evaluation ({tag})", persist=False).attach(evaluator) + + return evaluator + + +def setup_experiment_tracking(config, with_clearml, task_type="training"): + from datetime import datetime + + assert task_type in 
("training", "testing"), task_type + + output_path = "" + if idist.get_rank() == 0: + if with_clearml: + from clearml import Task + + schema = TrainvalConfigSchema if task_type == "training" else InferenceConfigSchema + + task = Task.init("ImageNet Training", config.config_filepath.stem, task_type=task_type) + task.connect_configuration(config.config_filepath.as_posix()) + + task.upload_artifact(config.script_filepath.name, config.script_filepath.as_posix()) + task.upload_artifact(config.config_filepath.name, config.config_filepath.as_posix()) + task.connect(get_params(config, schema)) + + output_path = Path(os.environ.get("CLEARML_OUTPUT_PATH", "/tmp")) + output_path = output_path / "clearml" / datetime.now().strftime("%Y%m%d-%H%M%S") + else: + import shutil + + output_path = Path(os.environ.get("OUTPUT_PATH", "/tmp/output-imagenet")) + output_path = output_path / task_type / config.config_filepath.stem + output_path = output_path / datetime.now().strftime("%Y%m%d-%H%M%S") + output_path.mkdir(parents=True, exist_ok=True) + + shutil.copyfile(config.script_filepath.as_posix(), output_path / config.script_filepath.name) + shutil.copyfile(config.config_filepath.as_posix(), output_path / config.config_filepath.name) + + output_path = output_path.as_posix() + return Path(idist.broadcast(output_path, src=0)) + + +def run_training(config_filepath, backend="nccl", with_clearml=True): + """Main entry to run training experiment + + Args: + config_filepath (str): training configuration .py file + backend (str): distributed backend: nccl, gloo or None to run without distributed config + with_clearml (bool): if True, uses ClearML as experiment tracking system + """ + assert torch.cuda.is_available(), torch.cuda.is_available() + assert torch.backends.cudnn.enabled + torch.backends.cudnn.benchmark = True + + config_filepath = Path(config_filepath) + assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found" + + with idist.Parallel(backend=backend) as 
parallel: + + logger = setup_logger(name="ImageNet Training", distributed_rank=idist.get_rank()) + + config = ConfigObject(config_filepath) + TrainvalConfigSchema.validate(config) + config.script_filepath = Path(__file__) + + output_path = setup_experiment_tracking(config, with_clearml=with_clearml) + config.output_path = output_path + + utils.log_basic_info(logger, get_params(config, TrainvalConfigSchema)) + + try: + parallel.run(training, config, logger=logger, with_clearml=with_clearml) + except KeyboardInterrupt: + logger.info("Catched KeyboardInterrupt -> exit") + except Exception as e: # noqa + logger.exception("") + raise e + + +def get_model_weights(config, logger, with_clearml): + + path = "" + if with_clearml: + from clearml import Model + + if idist.get_rank() > 0: + idist.barrier() + else: + model_id = config.weights_path + + logger.info(f"Loading trained model: {model_id}") + model = Model(model_id) + assert model is not None, f"{model_id}" + path = model.get_local_copy() + idist.barrier() + path = idist.broadcast(path, src=0) + else: + path = config.weights_path + logger.info(f"Loading {path}") + + assert Path(path).exists(), f"{path} is not found" + return torch.load(path) + + +def evaluation(local_rank, config, logger, with_clearml): + + rank = idist.get_rank() + device = idist.device() + manual_seed(config.seed + local_rank) + + data_loader = config.data_loader + model = config.model.to(device) + + # Load weights: + state_dict = get_model_weights(config, logger, with_clearml) + model.load_state_dict(state_dict) + + # Adapt model to dist config + model = idist.auto_model(model) + + # Setup evaluators + val_metrics = { + "Accuracy": Accuracy(), + "Top-5 Accuracy": TopKCategoricalAccuracy(k=5), + } + + if ("val_metrics" in config) and isinstance(config.val_metrics, dict): + val_metrics.update(config.val_metrics) + + evaluator = create_evaluator(model, val_metrics, config, with_clearml, tag="val") + + # Setup Tensorboard logger + if rank == 0: + 
tb_logger = common.TensorboardLogger(log_dir=config.output_path.as_posix()) + tb_logger.attach_output_handler(evaluator, event_name=Events.COMPLETED, tag="validation", metric_names="all") + + state = evaluator.run(data_loader) + utils.log_metrics(logger, 0, state.times["COMPLETED"], "Validation", state.metrics) + + if idist.get_rank() == 0: + tb_logger.close() + + +def run_evaluation(config_filepath, backend="nccl", with_clearml=True): + """Main entry to run model's evaluation: + - compute validation metrics + + Args: + config_filepath (str): evaluation configuration .py file + backend (str): distributed backend: nccl, gloo, horovod or None to run without distributed config + with_clearml (bool): if True, uses ClearML as experiment tracking system + """ + assert torch.cuda.is_available(), torch.cuda.is_available() + assert torch.backends.cudnn.enabled + torch.backends.cudnn.benchmark = True + + config_filepath = Path(config_filepath) + assert config_filepath.exists(), f"File '{config_filepath.as_posix()}' is not found" + + with idist.Parallel(backend=backend) as parallel: + logger = setup_logger(name="ImageNet Evaluation", distributed_rank=idist.get_rank()) + + config = ConfigObject(config_filepath) + InferenceConfigSchema.validate(config) + config.script_filepath = Path(__file__) + + output_path = setup_experiment_tracking(config, with_clearml=with_clearml, task_type="testing") + config.output_path = output_path + + utils.log_basic_info(logger, get_params(config, InferenceConfigSchema)) + + try: + parallel.run(evaluation, config, logger=logger, with_clearml=with_clearml) + except KeyboardInterrupt: + logger.info("Catched KeyboardInterrupt -> exit") + except Exception as e: # noqa + logger.exception("") + raise e + + +if __name__ == "__main__": + + fire.Fire({"training": run_training, "eval": run_evaluation}) diff --git a/examples/references/classification/imagenet/requirements.txt b/examples/references/classification/imagenet/requirements.txt index 
d110af83c572..3d39aac0c05b 100644 --- a/examples/references/classification/imagenet/requirements.txt +++ b/examples/references/classification/imagenet/requirements.txt @@ -1,11 +1,12 @@ albumentations -image-dataset-viz numpy -opencv-python -py_config_runner +opencv-python-headless +fire pytorch-ignite -pillow tensorboard torch torchvision tqdm +clearml +image-dataset-viz +py_config_runner>=0.2.0,<1.0.0 \ No newline at end of file diff --git a/examples/references/classification/imagenet/utils.py b/examples/references/classification/imagenet/utils.py new file mode 100644 index 000000000000..799a6069afd4 --- /dev/null +++ b/examples/references/classification/imagenet/utils.py @@ -0,0 +1,59 @@ +import torch + +import ignite +import ignite.distributed as idist +from ignite.handlers import DiskSaver + + +def initialize(config): + + device = idist.device() + + model = config.model.to(device) + optimizer = config.optimizer + + # Adapt model to dist config + model = idist.auto_model(model) + optimizer = idist.auto_optim(optimizer) + criterion = config.criterion.to(device) + + return model, optimizer, criterion + + +def log_basic_info(logger, config): + logger.info(f"- PyTorch version: {torch.__version__}") + logger.info(f"- Ignite version: {ignite.__version__}") + if torch.cuda.is_available(): + # explicitly import cudnn as + # torch.backends.cudnn can not be pickled with hvd spawning procs + from torch.backends import cudnn + + logger.info(f"- GPU Device: {torch.cuda.get_device_name(idist.get_local_rank())}") + logger.info(f"- CUDA version: {torch.version.cuda}") + logger.info(f"- CUDNN version: {cudnn.version()}") + + logger.info("\n") + logger.info("Configuration:") + for key, value in config.items(): + logger.info(f"\t{key}: {value}") + logger.info("\n") + + if idist.get_world_size() > 1: + logger.info("\nDistributed setting:") + logger.info(f"\tbackend: {idist.backend()}") + logger.info(f"\tworld size: {idist.get_world_size()}") + logger.info("\n") + + +def 
log_metrics(logger, epoch, elapsed, tag, metrics): + metrics_output = "\n".join([f"\t{k}: {v}" for k, v in metrics.items()]) + logger.info(f"\nEpoch {epoch} - Evaluation time (seconds): {elapsed:.2f} - {tag} metrics:\n {metrics_output}") + + +def get_save_handler(output_path, with_clearml): + if with_clearml: + from ignite.contrib.handlers.clearml_logger import ClearMLSaver + + return ClearMLSaver(dirname=output_path) + + return DiskSaver(output_path) diff --git a/examples/references/classification/imagenet/code/dataflow/vis.py b/examples/references/classification/imagenet/vis.py similarity index 66% rename from examples/references/classification/imagenet/code/dataflow/vis.py rename to examples/references/classification/imagenet/vis.py index 9a1363b92720..68926afb5310 100644 --- a/examples/references/classification/imagenet/code/dataflow/vis.py +++ b/examples/references/classification/imagenet/vis.py @@ -67,3 +67,34 @@ def make_grid( out_image[0:h, i * w : (i + 1) * w, :] = render_datapoint(img, target, text_size=12) return out_image + + +def predictions_gt_images_handler(img_denormalize_fn, n_images=None, another_engine=None, prefix_tag=None): + def wrapper(engine, logger, event_name): + batch = engine.state.batch + output = engine.state.output + x, y = batch + y_pred = output[0] + + if y.shape == y_pred.shape and y.ndim == 4: + # Case of y of shape (B, C, H, W) + y = torch.argmax(y, dim=1) + + y_pred = torch.argmax(y_pred, dim=1).byte() + + if n_images is not None: + x = x[:n_images, ...] + y = y[:n_images, ...] + y_pred = y_pred[:n_images, ...] 
+ + grid_pred_gt = make_grid(x, y_pred, img_denormalize_fn, batch_gt=y) + + state = engine.state if another_engine is None else another_engine.state + global_step = state.get_event_attrib_value(event_name) + + tag = "predictions_with_gt" + if prefix_tag is not None: + tag = f"{prefix_tag}: {tag}" + logger.writer.add_image(tag=tag, img_tensor=grid_pred_gt, global_step=global_step, dataformats="HWC") + + return wrapper diff --git a/examples/references/segmentation/pascal_voc2012/README.md b/examples/references/segmentation/pascal_voc2012/README.md index d96e81fffa17..011ead1f281c 100644 --- a/examples/references/segmentation/pascal_voc2012/README.md +++ b/examples/references/segmentation/pascal_voc2012/README.md @@ -8,7 +8,11 @@ Features: - Distributed training with native automatic mixed precision - Experiments tracking with [ClearML](https://github.com/allegroai/clearml) -ClearML Server: TODO: ADD THE LINK +Experiment | Model | Dataset | Val Avg IoU | ClearML Link +---|---|---|---|--- +configs/baseline_dplv3_resnet101.py | DeepLabV3 Resnet101 | VOC Only | 0.659161 | [link](https://app.clear.ml/projects/0e9a3a92d3134283b7d5572d516d60c5/experiments/a7254f084a9e47ca9380dfd739f89520/output/execution) +configs/baseline_dplv3_resnet101_sbd.py | DeepLabV3 Resnet101 | VOC+SBD | 0.6853087 | [link](https://app.clear.ml/projects/0e9a3a92d3134283b7d5572d516d60c5/experiments/dc4cee3377a74d19bc2d0e0e4d638c1f/output/execution) + ## Setup @@ -22,7 +26,7 @@ For docker users, you can use the following images to run the example: ```bash docker pull pytorchignite/vision:latest ``` -or +or ```bash docker pull pytorchignite/hvd-vision:latest ``` @@ -31,7 +35,7 @@ and install other requirements as suggested above ### Using Horovod as distributed framework -We do not add `horovod` as a requirement into `requirements.txt`. Please, install it manually following the official guides or +We do not add `horovod` as a requirement into `requirements.txt`. 
Please, install it manually following the official guides or use `pytorchignite/hvd-vision:latest` docker image. ### (Optional) Download Pascal VOC2012 and SDB datasets @@ -73,7 +77,7 @@ export SBD_DATASET_PATH=/path/to/SBD/ Run the following command: ```bash CUDA_VISIBLE_DEVICES=0 python -u main.py training configs/baseline_dplv3_resnet101_sbd.py -# or without SBD +# or without SBD # CUDA_VISIBLE_DEVICES=0 python -u main.py training configs/baseline_dplv3_resnet101.py ``` @@ -82,9 +86,9 @@ CUDA_VISIBLE_DEVICES=0 python -u main.py training configs/baseline_dplv3_resnet1 - Adjust total batch size for your GPUs in the configuration file: `configs/baseline_dplv3_resnet101_sbd.py` or `configs/baseline_dplv3_resnet101.py` ```bash -python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py training configs/baseline_dplv3_resnet101_sbd.py -# or without SBD -# python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py training configs/baseline_dplv3_resnet101.py +torchrun --nproc_per_node=2 main.py training configs/baseline_dplv3_resnet101_sbd.py +# or without SBD +# torchrun --nproc_per_node=2 main.py training configs/baseline_dplv3_resnet101.py ``` #### Using Horovod as distributed framework @@ -108,7 +112,7 @@ CUDA_VISIBLE_DEVICES=0 python -u main.py eval configs/eval_baseline_dplv3_resnet #### Multiple GPUs ```bash -python -u -m torch.distributed.launch --nproc_per_node=2 --use_env main.py eval configs/eval_baseline_dplv3_resnet101_sbd.py +torchrun --nproc_per_node=2 main.py eval configs/eval_baseline_dplv3_resnet101_sbd.py ``` #### Using Horovod as distributed framework @@ -120,5 +124,5 @@ horovodrun -np=2 python -u main.py eval configs/eval_baseline_dplv3_resnet101_sb ## Acknowledgements -Trainings were done using credits provided by AWS for open-source development via NumFOCUS +Trainings were done using credits provided by AWS for open-source development via NumFOCUS and using [trainml.ai](trainml.ai) platform. 
diff --git a/examples/references/segmentation/pascal_voc2012/dataflow.py b/examples/references/segmentation/pascal_voc2012/dataflow.py index 77a36e3c9809..b3b462f7c64a 100644 --- a/examples/references/segmentation/pascal_voc2012/dataflow.py +++ b/examples/references/segmentation/pascal_voc2012/dataflow.py @@ -116,8 +116,8 @@ def get_train_noval_sbdataset(root_path, return_meta=False): def get_dataloader(dataset, sampler=None, shuffle=False, limit_num_samples=None, **kwargs): if limit_num_samples is not None: - np.random.seed(limit_num_samples) - indices = np.random.permutation(len(dataset))[:limit_num_samples] + g = torch.Generator().manual_seed(limit_num_samples) + indices = torch.randperm(len(dataset), generator=g)[:limit_num_samples] dataset = Subset(dataset, indices) return idist.auto_dataloader(dataset, sampler=sampler, shuffle=(sampler is None) and shuffle, **kwargs) @@ -144,7 +144,8 @@ def get_train_val_loaders( train_ds = train_ds + sbd_train_ds if len(val_ds) < len(train_ds): - train_eval_indices = np.random.permutation(len(train_ds))[: len(val_ds)] + g = torch.Generator().manual_seed(len(train_ds)) + train_eval_indices = torch.randperm(len(train_ds), generator=g)[: len(val_ds)] train_eval_ds = Subset(train_ds, train_eval_indices) else: train_eval_ds = train_ds diff --git a/examples/references/segmentation/pascal_voc2012/main.py b/examples/references/segmentation/pascal_voc2012/main.py index 53729fcfb9a7..257b14dea031 100644 --- a/examples/references/segmentation/pascal_voc2012/main.py +++ b/examples/references/segmentation/pascal_voc2012/main.py @@ -138,7 +138,7 @@ def custom_event_filter(_, val_iteration): tb_logger.attach( evaluator, log_handler=vis.predictions_gt_images_handler( - img_denormalize_fn=img_denormalize, n_images=15, another_engine=trainer, prefix_tag="validation" + img_denormalize_fn=img_denormalize, n_images=8, another_engine=trainer, prefix_tag="validation" ), event_name=Events.ITERATION_COMPLETED(event_filter=custom_event_filter), ) @@ 
-162,14 +162,20 @@ def compute_and_log_cm(cm_metric, iteration): from clearml import Task clearml_logger = Task.current_task().get_logger() - clearml_logger.report_confusion_matrix( - title="Final Confusion Matrix", - series="cm-preds-gt", - matrix=cm, - iteration=iteration, - xlabels=data.VOCSegmentationOpencv.target_names, - ylabels=data.VOCSegmentationOpencv.target_names, - ) + + try: + clearml_logger.report_confusion_matrix( + title="Final Confusion Matrix", + matrix=cm, + iteration=iteration, + xlabels=data.VOCSegmentationOpencv.target_names, + ylabels=data.VOCSegmentationOpencv.target_names, + extra_layout=None, + ) + except NameError: + # Temporary clearml bug work-around: + # https://github.com/allegroai/clearml/pull/936 + pass def create_trainer(model, optimizer, criterion, train_sampler, config, logger, with_clearml): diff --git a/examples/references/segmentation/pascal_voc2012/utils.py b/examples/references/segmentation/pascal_voc2012/utils.py index 8fcb5060c01b..799a6069afd4 100644 --- a/examples/references/segmentation/pascal_voc2012/utils.py +++ b/examples/references/segmentation/pascal_voc2012/utils.py @@ -14,21 +14,7 @@ def initialize(config): # Adapt model to dist config model = idist.auto_model(model) - - if idist.backend() == "horovod": - accumulation_steps = config.get("accumulation_steps", 1) - # Can not use auto_optim with Horovod: https://github.com/horovod/horovod/issues/2670 - import horovod.torch as hvd - - optimizer = hvd.DistributedOptimizer( - optimizer, named_parameters=model.named_parameters(), backward_passes_per_step=accumulation_steps - ) - hvd.broadcast_optimizer_state(optimizer, root_rank=0) - if accumulation_steps > 1: - # disable manual grads accumulation as it is already done on optimizer's side - config.accumulation_steps = 1 - else: - optimizer = idist.auto_optim(optimizer) + optimizer = idist.auto_optim(optimizer) criterion = config.criterion.to(device) return model, optimizer, criterion diff --git 
a/examples/references/segmentation/pascal_voc2012/vis.py b/examples/references/segmentation/pascal_voc2012/vis.py index 0a30539df408..baf98a082f3d 100644 --- a/examples/references/segmentation/pascal_voc2012/vis.py +++ b/examples/references/segmentation/pascal_voc2012/vis.py @@ -121,11 +121,11 @@ def wrapper(engine, logger, event_name): grid_pred_gt = make_grid(x, y_pred, img_denormalize_fn, batch_gt_mask=y) state = engine.state if another_engine is None else another_engine.state - global_step = state.get_event_attrib_value(event_name) + global_step = state.epoch tag = "predictions_with_gt" if prefix_tag is not None: - tag = f"{prefix_tag}: {tag}" + tag = f"{prefix_tag}: {tag} - epoch={global_step}" logger.writer.add_image(tag=tag, img_tensor=grid_pred_gt, global_step=global_step, dataformats="HWC") return wrapper diff --git a/examples/siamese_network/README.md b/examples/siamese_network/README.md new file mode 100644 index 000000000000..86e5d86519fc --- /dev/null +++ b/examples/siamese_network/README.md @@ -0,0 +1,10 @@ +# Siamese Network example on CIFAR10 dataset + +This example is ported over from [pytorch/examples/siamese_network](https://github.com/pytorch/examples/tree/main/siamese_network) + +Usage: + +``` +pip install -r requirements.txt +python siamese_network.py +``` diff --git a/examples/siamese_network/requirements.txt b/examples/siamese_network/requirements.txt new file mode 100644 index 000000000000..965bac10af9d --- /dev/null +++ b/examples/siamese_network/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +pytorch-ignite \ No newline at end of file diff --git a/examples/siamese_network/siamese_network.py b/examples/siamese_network/siamese_network.py new file mode 100644 index 000000000000..d0a1bfb7e3de --- /dev/null +++ b/examples/siamese_network/siamese_network.py @@ -0,0 +1,311 @@ +import argparse + +import numpy as np +import torch +import torch.nn as nn +import torch.optim as optim +import torchvision +from torch.optim.lr_scheduler import 
StepLR +from torch.utils.data import DataLoader, Dataset +from torchvision import datasets + +from ignite.contrib.handlers import ProgressBar +from ignite.engine import Engine, Events +from ignite.handlers.param_scheduler import LRScheduler +from ignite.metrics import Accuracy, RunningAverage +from ignite.utils import manual_seed + + +class SiameseNetwork(nn.Module): + # update Siamese Network implementation in accordance with the dataset + """ + Siamese network for image similarity estimation. + The same weight-shared network is applied to each input (anchor, positive and negative image). + The ResNet features of each input are passed through a small fully connected head. + The head ends with a ReLU and produces one 10-dimensional embedding per input. + FaceNet is a well-known variant of the Siamese network. + This implementation varies from FaceNet as we use the `ResNet-34` model from + "Deep Residual Learning for Image Recognition" + as our feature extractor. + In addition we use CIFAR10 dataset along with TripletMarginLoss + """ + + def __init__(self): + super(SiameseNetwork, self).__init__() + # get resnet model + self.resnet = torchvision.models.resnet34(weights=None) + fc_in_features = self.resnet.fc.in_features + + # replace the FC layer of the resnet model with an identity to expose its features + self.resnet.fc = nn.Identity() + + # fully connected head that maps the resnet features to a 10-dimensional embedding + self.fc = nn.Sequential( + nn.Linear(fc_in_features, 256), + nn.ReLU(inplace=True), + nn.Linear(256, 10), + nn.ReLU(inplace=True), + ) + + # initialise relu activation + self.relu = nn.ReLU() + + # initialize the weights + self.resnet.apply(self.init_weights) + self.fc.apply(self.init_weights) + + def init_weights(self, m): + if isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + m.bias.data.fill_(0.01) + + def forward_once(self, x): + output = self.resnet(x) + output = output.view(output.size()[0], -1) + return output + + def forward(self, input1, input2, input3): + + # pass the input through resnet + output1 = 
self.forward_once(input1) + output2 = self.forward_once(input2) + output3 = self.forward_once(input3) + + # pass the output of resnet through the fully connected head + output1 = self.fc(output1) + output2 = self.fc(output2) + output3 = self.fc(output3) + + return output1, output2, output3 + + +class MatcherDataset(Dataset): + # following class implements data downloading and handles preprocessing + def __init__(self, root, train, download=False): + super(MatcherDataset, self).__init__() + + # get CIFAR10 dataset + self.dataset = datasets.CIFAR10(root, train=train, download=download) + + # convert data from numpy array to Tensor + self.data = torch.from_numpy(self.dataset.data) + + # shift the dimensions of dataset to match the initial input layer dimensions + self.data = torch.movedim(self.data, (0, 1, 2, 3), (0, 2, 3, 1)) + + # convert targets list to torch Tensor + self.dataset.targets = torch.tensor(self.dataset.targets) + + self.group_examples() + + def group_examples(self): + """ + To ease the accessibility of data based on the class, we will use `group_examples` to group + examples based on class. The data classes have already been mapped to numeric values and + so are the target outputs for each training input + + Every key in `grouped_examples` corresponds to a class in CIFAR10 dataset. For every key in + `grouped_examples`, every value will conform to all of the indices for the CIFAR10 + dataset examples that correspond to that key. + """ + + # get the targets from CIFAR10 dataset + np_arr = np.array(self.dataset.targets) + + # group examples based on class + self.grouped_examples = {} + for i in range(0, 10): + self.grouped_examples[i] = np.where((np_arr == i))[0] + + def __len__(self): + return self.data.shape[0] + + def __getitem__(self, index): + """ + For every sample in the batch we select 3 images. First one is the anchor image + which is the image obtained from the current index. We also obtain the label of + anchor image. 
+ + Now we select two random images, one belonging to the same class as that of the + anchor image (named as positive_image) and the other belonging to a different class + than that of the anchor image (named as negative_image). We return the anchor image, + positive image, negative image and anchor label. + """ + + # obtain the anchor image + anchor_image = self.data[index].float() + + # obtain the class label of the anchor image + anchor_label = self.dataset.targets[index] + anchor_label = int(anchor_label.item()) + + # find a label which is different from anchor_label + labels = list(range(0, 10)) + labels.remove(anchor_label) + neg_index = torch.randint(0, 9, (1,)).item() + neg_label = labels[neg_index] + + # get a random index from the range of indices + random_index = torch.randint(0, len(self.grouped_examples[anchor_label]), (1,)).item() + + # get the index of image in actual data using the anchor label and random index + positive_index = self.grouped_examples[anchor_label][random_index] + + # choosing a random image using positive_index + positive_image = self.data[positive_index].float() + + # get a random index from the range of indices + random_index = torch.randint(0, len(self.grouped_examples[neg_label]), (1,)).item() + + # get the index of image in actual data using the negative label and random index + negative_index = self.grouped_examples[neg_label][random_index] + + # choosing a random image using negative_index + negative_image = self.data[negative_index].float() + + return anchor_image, positive_image, negative_image, anchor_label + + +def pairwise_distance(input1, input2): + dist = input1 - input2 + dist = torch.pow(dist, 2) + return dist + + +def calculate_loss(input1, input2): + output = pairwise_distance(input1, input2) + loss = torch.sum(output, 1) + loss = torch.sqrt(loss) + return loss + + +def run(args, model, device, optimizer, train_loader, test_loader, lr_scheduler): + + # using Triplet Margin Loss + criterion = 
nn.TripletMarginLoss(p=2, margin=2.8) + + # define model training step + def train_step(engine, batch): + model.train() + anchor_image, positive_image, negative_image, anchor_label = batch + anchor_image = anchor_image.to(device) + positive_image, negative_image = positive_image.to(device), negative_image.to(device) + anchor_label = anchor_label.to(device) + optimizer.zero_grad() + anchor_out, positive_out, negative_out = model(anchor_image, positive_image, negative_image) + loss = criterion(anchor_out, positive_out, negative_out) + loss.backward() + optimizer.step() + return loss.item() + + # define model testing step + def test_step(engine, batch): + model.eval() + with torch.no_grad(): + anchor_image, _, _, anchor_label = batch + anchor_image = anchor_image.to(device) + anchor_label = anchor_label.to(device) + other_image = [] + other_label = [] + y_true = [] + for i in range(anchor_image.shape[0]): + index = torch.randint(0, anchor_image.shape[0], (1,)).item() + img = anchor_image[index] + label = anchor_label[index] + other_image.append(img) + other_label.append(label) + if anchor_label[i] == other_label[i]: + y_true.append(1) + else: + y_true.append(0) + other = torch.stack(other_image) + other_label = torch.tensor(other_label) + other, other_label = other.to(device), other_label.to(device) + anchor_out, other_out, _ = model(anchor_image, other, other) + test_loss = calculate_loss(anchor_out, other_out) + y_pred = torch.where(test_loss < 3, 1, 0) + y_true = torch.tensor(y_true) + return [y_pred, y_true] + + # create engines for trainer and evaluator + trainer = Engine(train_step) + evaluator = Engine(test_step) + + # attach running-average loss to the trainer and accuracy to the evaluator + RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") + Accuracy(output_transform=lambda x: x).attach(evaluator, "accuracy") + + # attach progress bar to trainer with loss + pbar1 = ProgressBar() + pbar1.attach(trainer, metric_names=["loss"]) + + # attach progress bar 
to evaluator + pbar2 = ProgressBar() + pbar2.attach(evaluator) + + # attach LR Scheduler to trainer engine + trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler) + + # event handler triggers evaluator every `log_interval` epochs + @trainer.on(Events.EPOCH_COMPLETED(every=args.log_interval)) + def test(engine): + state = evaluator.run(test_loader) + print(f'Test Accuracy: {state.metrics["accuracy"]}') + + # run the trainer + trainer.run(train_loader, max_epochs=args.epochs) + + +def main(): + # adds training defaults and support for terminal arguments + parser = argparse.ArgumentParser(description="PyTorch Siamese network Example") + parser.add_argument( + "--batch-size", type=int, default=256, metavar="N", help="input batch size for training (default: 256)" + ) + parser.add_argument( + "--test-batch-size", type=int, default=256, metavar="N", help="input batch size for testing (default: 256)" + ) + parser.add_argument("--epochs", type=int, default=10, metavar="N", help="number of epochs to train (default: 10)") + parser.add_argument("--lr", type=float, default=1.0, metavar="LR", help="learning rate (default: 1.0)") + parser.add_argument( + "--gamma", type=float, default=0.95, metavar="M", help="Learning rate step gamma (default: 0.95)" + ) + parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training") + parser.add_argument("--no-mps", action="store_true", default=False, help="disables macOS GPU training") + parser.add_argument("--dry-run", action="store_true", default=False, help="quickly check a single pass") + parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)") + parser.add_argument( + "--log-interval", + type=int, + default=1, + metavar="N", + help="how many epochs to wait between evaluation runs", + ) + parser.add_argument("--save-model", action="store_true", default=False, help="For Saving the current Model") + parser.add_argument("--num-workers", type=int, default=4, 
help="number of processes generating parallel batches") + args = parser.parse_args() + + # set manual seed + manual_seed(args.seed) + + # set device (honours --no-cuda) + device = torch.device("cuda") if torch.cuda.is_available() and not args.no_cuda else torch.device("cpu") + + # data loading + train_dataset = MatcherDataset("../data", train=True, download=True) + test_dataset = MatcherDataset("../data", train=False) + train_loader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size, num_workers=args.num_workers) + test_loader = DataLoader(test_dataset, batch_size=args.test_batch_size, num_workers=args.num_workers) + + # set model parameters + model = SiameseNetwork().to(device) + optimizer = optim.Adadelta(model.parameters(), lr=args.lr) + scheduler = StepLR(optimizer, step_size=15, gamma=args.gamma) + lr_scheduler = LRScheduler(scheduler) + + # call run function + run(args, model, device, optimizer, train_loader, test_loader, lr_scheduler) + + +if __name__ == "__main__": + main() diff --git a/examples/super_resolution/README.md b/examples/super_resolution/README.md new file mode 100644 index 000000000000..d874747dc1cd --- /dev/null +++ b/examples/super_resolution/README.md @@ -0,0 +1,51 @@ +# Super-Resolution using an efficient sub-pixel convolutional neural network + +ported from [pytorch-examples](https://github.com/pytorch/examples/tree/main/super_resolution) + +This example illustrates how to use the efficient sub-pixel convolution layer described in ["Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network" - Shi et al.](https://arxiv.org/abs/1609.05158) for increasing spatial resolution within your network for tasks such as superresolution. 
+ +``` +usage: main.py [-h] --upscale_factor UPSCALE_FACTOR [--crop_size CROPSIZE] [--batch_size BATCHSIZE] + [--test_batch_size TESTBATCHSIZE] [--n_epochs NEPOCHS] [--lr LR] + [--cuda] [--mps] [--threads THREADS] [--seed SEED] [--debug] + +PyTorch Super Res Example + +optional arguments: + -h, --help show this help message and exit + --upscale_factor super resolution upscale factor + --crop_size cropped size of the images for training + --batch_size training batch size + --test_batch_size testing batch size + --n_epochs number of epochs to train for + --lr Learning Rate. Default=0.01 + --cuda use cuda + --mps enable GPU on macOS + --threads number of threads for data loader to use. Default=4 + --seed random seed to use. Default=123 + --debug debug mode for testing +``` + +This example trains a super-resolution network on the [Caltech101 dataset](https://pytorch.org/vision/main/generated/torchvision.datasets.Caltech101.html). A snapshot of the model is saved after every epoch with filename `model_epoch_<epoch_number>.pth`. + +## Example Usage: + +### Train + +`python main.py --upscale_factor 3 --crop_size 180 --batch_size 4 --test_batch_size 100 --n_epochs 30 --lr 0.001` + +### Super-Resolve + +`python super_resolve.py --input_image <input_image>.jpg --model model_epoch_30.pth --output_filename out.png` + +### Super-resolve example on a Cifar-10 image + +#### Input Image +![Cifar input image](./images/input_cifar.png) + +#### Output Images +| Output image from Model | Output from bicubic sampling | +|-------------------------------|------------------------------------| +| ![Cifar output image](./images/out_cifar.png) | ![Cifar output from bicubic sampling](./images/bicubic_image_cifar.png)| + + diff --git a/examples/super_resolution/images/bicubic_image_cifar.png b/examples/super_resolution/images/bicubic_image_cifar.png new file mode 100644 index 000000000000..b5bd4d9cf1b4 Binary files /dev/null and b/examples/super_resolution/images/bicubic_image_cifar.png differ diff --git 
a/examples/super_resolution/images/input_cifar.png b/examples/super_resolution/images/input_cifar.png new file mode 100644 index 000000000000..217b7e67d385 Binary files /dev/null and b/examples/super_resolution/images/input_cifar.png differ diff --git a/examples/super_resolution/images/out_cifar.png b/examples/super_resolution/images/out_cifar.png new file mode 100644 index 000000000000..9517aae801e2 Binary files /dev/null and b/examples/super_resolution/images/out_cifar.png differ diff --git a/examples/super_resolution/main.py b/examples/super_resolution/main.py new file mode 100644 index 000000000000..816d1caea7f2 --- /dev/null +++ b/examples/super_resolution/main.py @@ -0,0 +1,150 @@ +import argparse + +import torch +import torch.nn as nn +import torch.optim as optim +import torchvision +from model import Net +from torch.utils.data import DataLoader +from torchvision.transforms.functional import center_crop, resize, to_tensor + +from ignite.contrib.handlers import ProgressBar + +from ignite.engine import Engine, Events +from ignite.handlers import BasicTimeProfiler +from ignite.metrics import PSNR + +# Training settings +parser = argparse.ArgumentParser(description="PyTorch Super Res Example") +parser.add_argument("--crop_size", type=int, default=256, help="cropped size of the images for training") +parser.add_argument("--upscale_factor", type=int, required=True, help="super resolution upscale factor") +parser.add_argument("--batch_size", type=int, default=64, help="training batch size") +parser.add_argument("--test_batch_size", type=int, default=10, help="testing batch size") +parser.add_argument("--n_epochs", type=int, default=2, help="number of epochs to train for") +parser.add_argument("--lr", type=float, default=0.01, help="Learning Rate. 
Default=0.01") +parser.add_argument("--cuda", action="store_true", help="use cuda?") +parser.add_argument("--mps", action="store_true", default=False, help="enables macOS GPU training") +parser.add_argument("--threads", type=int, default=4, help="number of threads for data loader to use") +parser.add_argument("--seed", type=int, default=123, help="random seed to use. Default=123") +parser.add_argument("--debug", action="store_true", help="use debug") + +opt = parser.parse_args() + +print(opt) + +if opt.cuda and not torch.cuda.is_available(): + raise Exception("No GPU found, please run without --cuda") +if not opt.mps and torch.backends.mps.is_available(): + raise Exception("Found mps device, please run with --mps to enable macOS GPU") + +torch.manual_seed(opt.seed) +use_mps = opt.mps and torch.backends.mps.is_available() + +if opt.cuda: + device = torch.device("cuda") +elif use_mps: + device = torch.device("mps") +else: + device = torch.device("cpu") + +print("===> Loading datasets") + + +class SRDataset(torch.utils.data.Dataset): + def __init__(self, dataset, scale_factor, crop_size=256): + self.dataset = dataset + self.scale_factor = scale_factor + self.crop_size = crop_size - (crop_size % scale_factor) # keep the HR size a multiple of the scale + + def __getitem__(self, index): + image, _ = self.dataset[index] + img = image.convert("YCbCr") + hr_image, _, _ = img.split() + hr_image = center_crop(hr_image, self.crop_size) + lr_image = hr_image.copy() + if self.scale_factor != 1: + size = self.crop_size // self.scale_factor + lr_image = resize(lr_image, [size, size]) + hr_image = to_tensor(hr_image) + lr_image = to_tensor(lr_image) + return lr_image, hr_image + + def __len__(self): + return len(self.dataset) + + +trainset = torchvision.datasets.Caltech101(root="./data", download=True) +testset = torchvision.datasets.Caltech101(root="./data", download=False) + +trainset_sr = SRDataset(trainset, scale_factor=opt.upscale_factor, crop_size=opt.crop_size) +testset_sr = SRDataset(testset, scale_factor=opt.upscale_factor, 
crop_size=opt.crop_size) + +training_data_loader = DataLoader(dataset=trainset_sr, num_workers=opt.threads, batch_size=opt.batch_size, shuffle=True) +testing_data_loader = DataLoader(dataset=testset_sr, num_workers=opt.threads, batch_size=opt.test_batch_size) + +print("===> Building model") +model = Net(upscale_factor=opt.upscale_factor).to(device) +criterion = nn.MSELoss() + +optimizer = optim.Adam(model.parameters(), lr=opt.lr) + + +def train_step(engine, batch): + model.train() + input, target = batch[0].to(device), batch[1].to(device) + + optimizer.zero_grad() + loss = criterion(model(input), target) + loss.backward() + optimizer.step() + + return loss.item() + + +def validation_step(engine, batch): + model.eval() + with torch.no_grad(): + x, y = batch[0].to(device), batch[1].to(device) + y_pred = model(x) + + return y_pred, y + + +trainer = Engine(train_step) +evaluator = Engine(validation_step) +psnr = PSNR(data_range=1) +psnr.attach(evaluator, "psnr") +validate_every = 1 + +if opt.debug: + epoch_length = 10 + validate_epoch_length = 1 +else: + epoch_length = len(training_data_loader) + validate_epoch_length = len(testing_data_loader) + + +@trainer.on(Events.EPOCH_COMPLETED(every=validate_every)) +def log_validation(): + evaluator.run(testing_data_loader, epoch_length=validate_epoch_length) + metrics = evaluator.state.metrics + print(f"Epoch: {trainer.state.epoch}, Avg. 
PSNR: {metrics['psnr']} dB") + + +@trainer.on(Events.EPOCH_COMPLETED) +def checkpoint(): + model_out_path = "model_epoch_{}.pth".format(trainer.state.epoch) + torch.save(model, model_out_path) + print("Checkpoint saved to {}".format(model_out_path)) + + +# Attach basic profiler +basic_profiler = BasicTimeProfiler() +basic_profiler.attach(trainer) + +ProgressBar().attach(trainer, output_transform=lambda x: {"loss": x}) + +trainer.run(training_data_loader, opt.n_epochs, epoch_length=epoch_length) + +results = basic_profiler.get_results() +basic_profiler.print_results(results) diff --git a/examples/super_resolution/model.py b/examples/super_resolution/model.py new file mode 100644 index 000000000000..1f80c95d0643 --- /dev/null +++ b/examples/super_resolution/model.py @@ -0,0 +1,29 @@ +import torch.nn as nn +import torch.nn.init as init + + +class Net(nn.Module): + def __init__(self, upscale_factor): + super(Net, self).__init__() + + self.relu = nn.ReLU() + self.conv1 = nn.Conv2d(1, 64, (5, 5), (1, 1), (2, 2)) + self.conv2 = nn.Conv2d(64, 64, (3, 3), (1, 1), (1, 1)) + self.conv3 = nn.Conv2d(64, 32, (3, 3), (1, 1), (1, 1)) + self.conv4 = nn.Conv2d(32, upscale_factor ** 2, (3, 3), (1, 1), (1, 1)) + self.pixel_shuffle = nn.PixelShuffle(upscale_factor) + + self._initialize_weights() + + def forward(self, x): + x = self.relu(self.conv1(x)) + x = self.relu(self.conv2(x)) + x = self.relu(self.conv3(x)) + x = self.pixel_shuffle(self.conv4(x)) + return x + + def _initialize_weights(self): + init.orthogonal_(self.conv1.weight, init.calculate_gain("relu")) + init.orthogonal_(self.conv2.weight, init.calculate_gain("relu")) + init.orthogonal_(self.conv3.weight, init.calculate_gain("relu")) + init.orthogonal_(self.conv4.weight) diff --git a/examples/super_resolution/super_resolve.py b/examples/super_resolution/super_resolve.py new file mode 100644 index 000000000000..05c841037692 --- /dev/null +++ b/examples/super_resolution/super_resolve.py @@ -0,0 +1,41 @@ +import argparse + 
+import numpy as np +import torch +from PIL import Image +from torchvision.transforms.functional import to_tensor + +# Inference settings +parser = argparse.ArgumentParser(description="PyTorch Super Res Example") +parser.add_argument("--input_image", type=str, required=True, help="input image to use") +parser.add_argument("--model", type=str, required=True, help="model file to use") +parser.add_argument("--output_filename", type=str, required=True, help="where to save the output image") +parser.add_argument("--cuda", action="store_true", help="use cuda") +opt = parser.parse_args() + +print(opt) +img = Image.open(opt.input_image).convert("YCbCr") +y, cb, cr = img.split() + +model = torch.load(opt.model, map_location="cpu") +input = to_tensor(y).view(1, -1, y.size[1], y.size[0]) + +if opt.cuda: + model = model.cuda() + input = input.cuda() + +model.eval() +with torch.no_grad(): + out = model(input) +out = out.cpu() +out_img_y = out[0].detach().numpy() +out_img_y *= 255.0 +out_img_y = out_img_y.clip(0, 255) +out_img_y = Image.fromarray(np.uint8(out_img_y[0]), mode="L") + +out_img_cb = cb.resize(out_img_y.size, Image.BICUBIC) +out_img_cr = cr.resize(out_img_y.size, Image.BICUBIC) +out_img = Image.merge("YCbCr", [out_img_y, out_img_cb, out_img_cr]).convert("RGB") + +out_img.save(opt.output_filename) +print("output image saved to ", opt.output_filename) diff --git a/ignite/__init__.py b/ignite/__init__.py index 0185adce1238..1eec926df03d 100644 --- a/ignite/__init__.py +++ b/ignite/__init__.py @@ -6,4 +6,4 @@ import ignite.metrics import ignite.utils -__version__ = "0.5.0" +__version__ = "0.4.12" diff --git a/ignite/contrib/engines/common.py b/ignite/contrib/engines/common.py index b22f52c18760..f75eb4ae120b 100644 --- a/ignite/contrib/engines/common.py +++ b/ignite/contrib/engines/common.py @@ -48,6 +48,7 @@ def setup_common_training_handlers( with_pbars: bool = True, with_pbar_on_iters: bool = True, log_every_iters: int = 100, + device: Optional[Union[str, torch.device]] = None, stop_on_nan: bool = True, 
clear_cuda_cache: bool = True, save_handler: Optional[Union[Callable, BaseSaveHandler]] = None, @@ -91,7 +92,10 @@ def setup_common_training_handlers( class to use to store ``to_save``. See :class:`~ignite.handlers.checkpoint.Checkpoint` for more details. Argument is mutually exclusive with ``output_path``. kwargs: optional keyword args to be passed to construct :class:`~ignite.handlers.checkpoint.Checkpoint`. + device: deprecated argument, it will be removed in v0.5.0. """ + if device is not None: + warnings.warn("Argument device is unused and deprecated. It will be removed in v0.5.0") if idist.get_world_size() > 1: _setup_common_distrib_training_handlers( @@ -580,6 +584,7 @@ def gen_save_best_models_by_val_score( n_saved: int = 3, trainer: Optional[Engine] = None, tag: str = "val", + score_sign: float = 1.0, **kwargs: Any, ) -> Checkpoint: """Method adds a handler to ``evaluator`` to save ``n_saved`` of best models based on the metric @@ -602,6 +607,8 @@ def gen_save_best_models_by_val_score( n_saved: number of best models to store trainer: trainer engine to fetch the epoch when saving the best model. tag: score name prefix: `{tag}_{metric_name}`. By default, tag is "val". + score_sign: sign of the score: 1.0 or -1.0. For error-like metrics, e.g. smaller is better, + a negative score sign should be used (objects with larger score are retained). Default, 1.0. kwargs: optional keyword args to be passed to construct :class:`~ignite.handlers.checkpoint.Checkpoint`. 
Returns: @@ -623,7 +630,7 @@ def gen_save_best_models_by_val_score( n_saved=n_saved, global_step_transform=global_step_transform, score_name=f"{tag}_{metric_name.lower()}", - score_function=Checkpoint.get_default_score_fn(metric_name), + score_function=get_default_score_fn(metric_name, score_sign=score_sign), **kwargs, ) evaluator.add_event_handler(Events.COMPLETED, best_model_handler) @@ -639,6 +646,7 @@ def save_best_model_by_val_score( n_saved: int = 3, trainer: Optional[Engine] = None, tag: str = "val", + score_sign: float = 1.0, **kwargs: Any, ) -> Checkpoint: """Method adds a handler to ``evaluator`` to save on a disk ``n_saved`` of best models based on the metric @@ -654,6 +662,9 @@ def save_best_model_by_val_score( n_saved: number of best models to store trainer: trainer engine to fetch the epoch when saving the best model. tag: score name prefix: `{tag}_{metric_name}`. By default, tag is "val". + score_sign: sign of the score: 1.0 or -1.0. For error-like metrics, e.g. smaller is better, + a negative score sign should be used (objects with larger score are retained). Default, 1.0. + kwargs: optional keyword args to be passed to construct :class:`~ignite.handlers.checkpoint.Checkpoint`. Returns: @@ -667,12 +678,17 @@ def save_best_model_by_val_score( n_saved=n_saved, trainer=trainer, tag=tag, + score_sign=score_sign, **kwargs, ) def add_early_stopping_by_val_score( - patience: int, evaluator: Engine, trainer: Engine, metric_name: str + patience: int, + evaluator: Engine, + trainer: Engine, + metric_name: str, + score_sign: float = 1.0, ) -> EarlyStopping: """Method setups early stopping handler based on the score (named by `metric_name`) provided by `evaluator`. Metric value should increase in order to keep training and not early stop. @@ -683,11 +699,15 @@ def add_early_stopping_by_val_score( trainer: trainer engine to stop the run if no improvement. metric_name: metric name to use for score evaluation. 
This metric should be present in `evaluator.state.metrics`. + score_sign: sign of the score: 1.0 or -1.0. For error-like metrics, e.g. smaller is better, + a negative score sign should be used (objects with larger score are retained). Default, 1.0. Returns: A :class:`~ignite.handlers.early_stopping.EarlyStopping` handler. """ - es_handler = EarlyStopping(patience=patience, score_function=get_default_score_fn(metric_name), trainer=trainer) + es_handler = EarlyStopping( + patience=patience, score_function=get_default_score_fn(metric_name, score_sign=score_sign), trainer=trainer + ) evaluator.add_event_handler(Events.COMPLETED, es_handler) return es_handler diff --git a/ignite/contrib/handlers/__init__.py b/ignite/contrib/handlers/__init__.py index 2db80fd2fd9a..0a6fe3edd5cd 100644 --- a/ignite/contrib/handlers/__init__.py +++ b/ignite/contrib/handlers/__init__.py @@ -1,4 +1,5 @@ from ignite.contrib.handlers.clearml_logger import ClearMLLogger +from ignite.contrib.handlers.custom_events import CustomPeriodicEvent from ignite.contrib.handlers.mlflow_logger import MLflowLogger from ignite.contrib.handlers.neptune_logger import NeptuneLogger from ignite.contrib.handlers.polyaxon_logger import PolyaxonLogger diff --git a/ignite/contrib/handlers/clearml_logger.py b/ignite/contrib/handlers/clearml_logger.py index 199b21cc4e79..1bfbb1abc00a 100644 --- a/ignite/contrib/handlers/clearml_logger.py +++ b/ignite/contrib/handlers/clearml_logger.py @@ -125,29 +125,16 @@ def __init__(self, **kwargs: Any): if self.bypass_mode(): warnings.warn("ClearMLSaver: running in bypass mode") - class _Stub(object): - def __call__(self, *_: Any, **__: Any) -> "_Stub": - return self - - def __getattr__(self, attr: str) -> "_Stub": - if attr in ("name", "id"): - return "" # type: ignore[return-value] - return self - - def __setattr__(self, attr: str, val: Any) -> None: - pass - - self._task = _Stub() - else: - # Try to retrieve current the ClearML Task before trying to create a new one - 
self._task = Task.current_task() - if self._task is None: - self._task = Task.init( - project_name=kwargs.get("project_name"), - task_name=kwargs.get("task_name"), - task_type=kwargs.get("task_type", Task.TaskTypes.training), - **experiment_kwargs, - ) + # Try to retrieve current the ClearML Task before trying to create a new one + self._task = Task.current_task() + + if self._task is None: + self._task = Task.init( + project_name=kwargs.get("project_name"), + task_name=kwargs.get("task_name"), + task_type=kwargs.get("task_type", Task.TaskTypes.training), + **experiment_kwargs, + ) self.clearml_logger = self._task.get_logger() @@ -156,13 +143,20 @@ def __setattr__(self, attr: str, val: Any) -> None: @classmethod def set_bypass_mode(cls, bypass: bool) -> None: """ - Will bypass all outside communication, and will drop all logs. + Set ``clearml.Task`` to offline mode. + Will bypass all outside communication, and will save all data and logs to a local session folder. Should only be used in "standalone mode", when there is no access to the *clearml-server*. Args: bypass: If ``True``, all outside communication is skipped. + Data and logs will be stored in a local session folder. + For more information, please refer to `ClearML docs + `_. """ + from clearml import Task + setattr(cls, "_bypass", bypass) + Task.set_offline(offline_mode=bypass) @classmethod def bypass_mode(cls) -> bool: @@ -172,12 +166,32 @@ def bypass_mode(cls) -> bool: Note: `GITHUB_ACTIONS` env will automatically set bypass_mode to ``True`` unless overridden specifically with ``ClearMLLogger.set_bypass_mode(False)``. + For more information, please refer to `ClearML docs + `_. Return: - If True, all outside communication is skipped. + If True, ``clearml.Task`` is on offline mode, and all outside communication is skipped. """ return getattr(cls, "_bypass", bool(os.environ.get("CI"))) + def __getattr__(self, attr: Any) -> Any: + """ + Calls the corresponding method of ``clearml.Logger``. 
+ + Args: + attr: methods of the ``clearml.Logger`` class. + """ + return getattr(self.clearml_logger, attr) + + def get_task(self) -> Any: + """ + Returns the task context that the logger is reporting. + + Return: + Returns the current task, equivalent to ``clearml.Task.current_task()``. + """ + return self._task + def close(self) -> None: self.clearml_logger.flush() diff --git a/ignite/contrib/handlers/custom_events.py b/ignite/contrib/handlers/custom_events.py new file mode 100644 index 000000000000..2b3e7313de57 --- /dev/null +++ b/ignite/contrib/handlers/custom_events.py @@ -0,0 +1,125 @@ +import warnings + +from ignite.engine import EventEnum, Events, State + + +class CustomPeriodicEvent: + """DEPRECATED. Use filtered events instead. + Handler to define a custom periodic events as a number of elapsed iterations/epochs + for an engine. + + When custom periodic event is created and attached to an engine, the following events are fired: + 1) K iterations is specified: + - `Events.ITERATIONS__STARTED` + - `Events.ITERATIONS__COMPLETED` + + 1) K epochs is specified: + - `Events.EPOCHS__STARTED` + - `Events.EPOCHS__COMPLETED` + + + Examples: + + .. code-block:: python + + from ignite.engine import Engine, Events + from ignite.contrib.handlers import CustomPeriodicEvent + + # Let's define an event every 1000 iterations + cpe1 = CustomPeriodicEvent(n_iterations=1000) + cpe1.attach(trainer) + + # Let's define an event every 10 epochs + cpe2 = CustomPeriodicEvent(n_epochs=10) + cpe2.attach(trainer) + + @trainer.on(cpe1.Events.ITERATIONS_1000_COMPLETED) + def on_every_1000_iterations(engine): + # run a computation after 1000 iterations + # ... + print(engine.state.iterations_1000) + + @trainer.on(cpe2.Events.EPOCHS_10_STARTED) + def on_every_10_epochs(engine): + # run a computation every 10 epochs + # ... 
+ print(engine.state.epochs_10) + + + Args: + n_iterations (int, optional): number of iterations of the custom periodic event + n_epochs (int, optional): number of epochs of the custom periodic event. Argument is optional, but only one, + either n_iterations or n_epochs should be defined. + + """ + + def __init__(self, n_iterations=None, n_epochs=None): + + warnings.warn( + "CustomPeriodicEvent is deprecated since 0.4.0 and will be removed in 0.5.0. Use filtered events instead.", + DeprecationWarning, + ) + + if n_iterations is not None: + if not isinstance(n_iterations, int): + raise TypeError("Argument n_iterations should be an integer") + if n_iterations < 1: + raise ValueError("Argument n_iterations should be positive") + + if n_epochs is not None: + if not isinstance(n_epochs, int): + raise TypeError("Argument n_epochs should be an integer") + if n_epochs < 1: + raise ValueError("Argument n_epochs should be positive") + + if (n_iterations is None and n_epochs is None) or (n_iterations and n_epochs): + raise ValueError("Either n_iterations or n_epochs should be defined") + + if n_iterations: + prefix = "iterations" + self.state_attr = "iteration" + self.period = n_iterations + + if n_epochs: + prefix = "epochs" + self.state_attr = "epoch" + self.period = n_epochs + + self.custom_state_attr = "{}_{}".format(prefix, self.period) + event_name = "{}_{}".format(prefix.upper(), self.period) + setattr( + self, + "Events", + EventEnum("Events", " ".join(["{}_STARTED".format(event_name), "{}_COMPLETED".format(event_name)])), + ) + + # Update State.event_to_attr + for e in self.Events: + State.event_to_attr[e] = self.custom_state_attr + + # Create aliases + self._periodic_event_started = getattr(self.Events, "{}_STARTED".format(event_name)) + self._periodic_event_completed = getattr(self.Events, "{}_COMPLETED".format(event_name)) + + def _on_started(self, engine): + setattr(engine.state, self.custom_state_attr, 0) + + def _on_periodic_event_started(self, engine): + if 
getattr(engine.state, self.state_attr) % self.period == 1: + setattr(engine.state, self.custom_state_attr, getattr(engine.state, self.custom_state_attr) + 1) + engine.fire_event(self._periodic_event_started) + + def _on_periodic_event_completed(self, engine): + if getattr(engine.state, self.state_attr) % self.period == 0: + engine.fire_event(self._periodic_event_completed) + + def attach(self, engine): + engine.register_events(*self.Events) + + engine.add_event_handler(Events.STARTED, self._on_started) + engine.add_event_handler( + getattr(Events, "{}_STARTED".format(self.state_attr.upper())), self._on_periodic_event_started + ) + engine.add_event_handler( + getattr(Events, "{}_COMPLETED".format(self.state_attr.upper())), self._on_periodic_event_completed + ) diff --git a/ignite/contrib/handlers/neptune_logger.py b/ignite/contrib/handlers/neptune_logger.py index 01b2f94cf1e3..94e0c00a238e 100644 --- a/ignite/contrib/handlers/neptune_logger.py +++ b/ignite/contrib/handlers/neptune_logger.py @@ -1,11 +1,13 @@ """Neptune logger and its helper handlers.""" import tempfile +import warnings from typing import Any, Callable, List, Mapping, Optional, Union import torch from torch.optim import Optimizer import ignite.distributed as idist +from ignite import __version__ from ignite.contrib.handlers.base_logger import ( BaseLogger, BaseOptimizerParamsHandler, @@ -26,40 +28,26 @@ "global_step_from_engine", ] +_INTEGRATION_VERSION_KEY = "source_code/integrations/neptune-pytorch-ignite" + class NeptuneLogger(BaseLogger): """ - `Neptune `_ handler to log metrics, model/optimizer parameters, gradients during the training - and validation. It can also log model checkpoints to Neptune server. + `Neptune `_ handler to log metrics, model/optimizer parameters and gradients during training + and validation. It can also log model checkpoints to Neptune. .. code-block:: bash - pip install neptune-client + pip install neptune Args: - api_token: Required in online mode. 
Neptune API token, found on https://neptune.ai. - project_name: Required in online mode. Qualified name of a project in a form of - "namespace/project_name" for example "tom/minst-classification". - If None, the value of NEPTUNE_PROJECT environment variable will be taken. - You need to create the project in https://neptune.ai first. - offline_mode: Optional default False. If offline_mode=True no logs will be send to neptune. - Usually used for debug purposes. - experiment_name: Optional. Editable name of the experiment. - Name is displayed in the experiment’s Details (Metadata section) and in experiments view as a column. - upload_source_files: Optional. List of source files to be uploaded. - Must be list of str or single str. Uploaded sources are displayed in the experiment’s Source code tab. - If None is passed, Python file from which experiment was created will be uploaded. - Pass empty list (`[]`) to upload no files. Unix style pathname pattern expansion is supported. - For example, you can pass `*.py` to upload all python source files from the current directory. - For recursion lookup use `**/*.py` (for Python 3.5 and later). For more information see glob library. - params: Optional. Parameters of the experiment. After experiment creation params are read-only. - Parameters are displayed in the experiment’s Parameters section and each key-value pair can be - viewed in experiments view as a column. - properties: Optional default is `{}`. Properties of the experiment. - They are editable after experiment is created. Properties are displayed in the experiment’s Details and - each key-value pair can be viewed in experiments view as a column. - tags: Optional default `[]`. Must be list of str. Tags of the experiment. - Tags are displayed in the experiment’s Details and can be viewed in experiments view as a column. + api_token: Neptune API token, found on https://neptune.ai -> User menu -> "Get your API token". 
+ If None, the value of the NEPTUNE_API_TOKEN environment variable is used. To keep your token + secure, you should set it to the environment variable rather than including it in your code. + project: Name of a Neptune project, in the form "workspace-name/project-name". + For example "tom/mnist-classification". + If None, the value of the NEPTUNE_PROJECT environment variable is used. + **kwargs: Other arguments to be passed to the `init_run()` function. Examples: .. code-block:: python @@ -67,26 +55,27 @@ class NeptuneLogger(BaseLogger): from ignite.contrib.handlers.neptune_logger import * # Create a logger - # We are using the api_token for the anonymous user neptuner but you can use your own. + # Note: We are using the API token for anonymous logging. You can pass your own token, or save it as an + # environment variable and leave out the api_token argument. npt_logger = NeptuneLogger( api_token="ANONYMOUS", - project_name="shared/pytorch-ignite-integration", - experiment_name="cnn-mnist", # Optional, - params={"max_epochs": 10}, # Optional, - tags=["pytorch-ignite","minst"] # Optional + project="common/pytorch-ignite-integration", + name="cnn-mnist", # Optional, + tags=["pytorch-ignite", "minst"], # Optional ) - # Attach the logger to the trainer to log training loss at each iteration + # Attach the logger to the trainer to log training loss at each iteration. npt_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED, tag="training", - output_transform=lambda loss: {'loss': loss} + output_transform=lambda loss: {"loss": loss}, ) - # Attach the logger to the evaluator on the training dataset and log NLL, Accuracy metrics after each epoch - # We setup `global_step_transform=global_step_from_engine(trainer)` to take the epoch + # Attach the logger to the evaluator on the training dataset and log NLL + # and accuracy metrics after each epoch. 
+ # We set up `global_step_transform=global_step_from_engine(trainer)` to take the epoch # of the `trainer` instead of `train_evaluator`. npt_logger.attach_output_handler( train_evaluator, @@ -96,43 +85,46 @@ class NeptuneLogger(BaseLogger): global_step_transform=global_step_from_engine(trainer), ) - # Attach the logger to the evaluator on the validation dataset and log NLL, Accuracy metrics after - # each epoch. We setup `global_step_transform=global_step_from_engine(trainer)` to take the epoch of the + # Attach the logger to the evaluator on the validation dataset and log NLL and accuracy metrics after + # each epoch. We set up `global_step_transform=global_step_from_engine(trainer)` to take the epoch of the # `trainer` instead of `evaluator`. npt_logger.attach_output_handler( evaluator, event_name=Events.EPOCH_COMPLETED, tag="validation", metric_names=["nll", "accuracy"], - global_step_transform=global_step_from_engine(trainer)), + global_step_transform=global_step_from_engine(trainer), ) - # Attach the logger to the trainer to log optimizer's parameters, e.g. learning rate at each iteration + # Attach the logger to the trainer to log optimizer parameters, such as learning rate at each iteration. npt_logger.attach_opt_params_handler( trainer, event_name=Events.ITERATION_STARTED, optimizer=optimizer, - param_name='lr' # optional + param_name="lr", # optional ) - # Attach the logger to the trainer to log model's weights norm after each iteration + # Attach the logger to the trainer to log model's weights norm after each iteration. 
npt_logger.attach( trainer, event_name=Events.ITERATION_COMPLETED, - log_handler=WeightsScalarHandler(model) + log_handler=WeightsScalarHandler(model), ) - Explore an experiment with neptune tracking here: - https://ui.neptune.ai/o/shared/org/pytorch-ignite-integration/e/PYTOR1-18/charts - You can save model checkpoints to a Neptune server: + Explore runs with Neptune tracking here: + https://app.neptune.ai/o/common/org/pytorch-ignite-integration/ + + You can also save model checkpoints to Neptune: .. code-block:: python from ignite.handlers import Checkpoint + def score_function(engine): return engine.state.metrics["accuracy"] + to_save = {"model": model} handler = Checkpoint( to_save, @@ -140,70 +132,65 @@ def score_function(engine): filename_prefix="best", score_function=score_function, score_name="validation_accuracy", - global_step_transform=global_step_from_engine(trainer) + global_step_transform=global_step_from_engine(trainer), ) validation_evaluator.add_event_handler(Events.COMPLETED, handler) - It is also possible to use the logger as context manager: + It is also possible to use the logger as a context manager: .. code-block:: python from ignite.contrib.handlers.neptune_logger import * - # We are using the api_token for the anonymous user neptuner but you can use your own. 
- - with NeptuneLogger(api_token="ANONYMOUS", - project_name="shared/pytorch-ignite-integration", - experiment_name="cnn-mnist", # Optional, - params={"max_epochs": 10}, # Optional, - tags=["pytorch-ignite","mnist"] # Optional - ) as npt_logger: - + with NeptuneLogger() as npt_logger: trainer = Engine(update_fn) # Attach the logger to the trainer to log training loss at each iteration npt_logger.attach_output_handler( trainer, event_name=Events.ITERATION_COMPLETED, tag="training", - output_transform=lambda loss: {"loss": loss} + output_transform=lambda loss: {"loss": loss}, ) """ def __getattr__(self, attr: Any) -> Any: + return getattr(self.experiment, attr) - import neptune + def __getitem__(self, key: str) -> Any: + return self.experiment[key] - return getattr(neptune, attr) + def __setitem__(self, key: str, val: Any) -> Any: + self.experiment[key] = val - def __init__(self, *args: Any, **kwargs: Any) -> None: + def __init__(self, api_token: Optional[str] = None, project: Optional[str] = None, **kwargs: Any) -> None: try: - import neptune + try: + # neptune-client<1.0.0 package structure + with warnings.catch_warnings(): + # ignore the deprecation warnings + warnings.simplefilter("ignore") + import neptune.new as neptune + except ImportError: + # neptune>=1.0.0 package structure + import neptune except ImportError: raise ModuleNotFoundError( - "This contrib module requires neptune-client to be installed. " - "You may install neptune with command: \n pip install neptune-client \n" - ) - - if kwargs.get("offline_mode", False): - self.mode = "offline" - neptune.init( - project_qualified_name="dry-run/project", - backend=neptune.OfflineBackend(), + "This contrib module requires the Neptune client library to be installed. 
" + "Install neptune with the command: \n pip install neptune \n" ) - else: - self.mode = "online" - neptune.init(api_token=kwargs.get("api_token"), project_qualified_name=kwargs.get("project_name")) - kwargs["name"] = kwargs.pop("experiment_name", None) - self._experiment_kwargs = { - k: v for k, v in kwargs.items() if k not in ["api_token", "project_name", "offline_mode"] - } + run = neptune.init_run( + api_token=api_token, + project=project, + **kwargs, + ) + run[_INTEGRATION_VERSION_KEY] = __version__ - self.experiment = neptune.create_experiment(**self._experiment_kwargs) + self.experiment = run def close(self) -> None: - self.stop() + self.experiment.stop() def _create_output_handler(self, *args: Any, **kwargs: Any) -> "OutputHandler": return OutputHandler(*args, **kwargs) @@ -213,7 +200,7 @@ def _create_opt_params_handler(self, *args: Any, **kwargs: Any) -> "OptimizerPar class OutputHandler(BaseOutputHandler): - """Helper handler to log engine's output and/or metrics + """Helper handler to log engine's output and/or metrics. Args: tag: common title for all produced plots. 
For example, "training" @@ -355,7 +342,7 @@ def __call__(self, engine: Engine, logger: NeptuneLogger, event_name: Union[str, ) for key, value in metrics.items(): - logger.log_metric(key, x=global_step, y=value) + logger[key].append(value, step=global_step) class OptimizerParamsHandler(BaseOptimizerParamsHandler): @@ -412,7 +399,7 @@ def __call__(self, engine: Engine, logger: NeptuneLogger, event_name: Union[str, } for k, v in params.items(): - logger.log_metric(k, x=global_step, y=v) + logger[k].append(v, step=global_step) class WeightsScalarHandler(BaseWeightsScalarHandler): @@ -515,11 +502,8 @@ def __call__(self, engine: Engine, logger: NeptuneLogger, event_name: Union[str, continue name = name.replace(".", "/") - logger.log_metric( - f"{tag_prefix}weights_{self.reduction.__name__}/{name}", - x=global_step, - y=self.reduction(p.data), - ) + key = f"{tag_prefix}weights_{self.reduction.__name__}/{name}" + logger[key].append(self.reduction(p.data), step=global_step) class GradsScalarHandler(BaseWeightsScalarHandler): @@ -622,9 +606,8 @@ def __call__(self, engine: Engine, logger: NeptuneLogger, event_name: Union[str, continue name = name.replace(".", "/") - logger.log_metric( - f"{tag_prefix}grads_{self.reduction.__name__}/{name}", x=global_step, y=self.reduction(p.grad) - ) + key = f"{tag_prefix}grads_{self.reduction.__name__}/{name}" + logger[key].append(self.reduction(p.grad), step=global_step) class NeptuneSaver(BaseSaveHandler): @@ -634,6 +617,10 @@ class NeptuneSaver(BaseSaveHandler): neptune_logger: an instance of NeptuneLogger class. + .. Note :: + + NeptuneSaver is currently not supported on Windows. + Examples: .. 
code-block:: python @@ -689,12 +676,30 @@ def __init__(self, neptune_logger: NeptuneLogger): def __call__(self, checkpoint: Mapping, filename: str, metadata: Optional[Mapping] = None) -> None: # wont work on XLA + # Imports for BC compatibility + try: + # neptune-client<1.0.0 package structure + with warnings.catch_warnings(): + # ignore the deprecation warnings + warnings.simplefilter("ignore") + from neptune.new.types import File + except ImportError: + # neptune>=1.0.0 package structure + from neptune.types import File + with tempfile.NamedTemporaryFile() as tmp: # we can not use tmp.name to open tmp.file twice on Win32 # https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile torch.save(checkpoint, tmp.file) - self._logger.log_artifact(tmp.name, filename) + + # rewind the buffer + tmp.file.seek(0) + + # hold onto the file stream for uploading. + # NOTE: This won't load the whole file in memory and upload + # the stream in smaller chunks. + self._logger[filename].upload(File.from_stream(tmp.file)) @idist.one_rank_only(with_barrier=True) def remove(self, filename: str) -> None: - self._logger.delete_artifacts(filename) + del self._logger.experiment[filename] diff --git a/ignite/distributed/utils.py b/ignite/distributed/utils.py index 88ddca2287c8..0d885b467917 100644 --- a/ignite/distributed/utils.py +++ b/ignite/distributed/utils.py @@ -336,7 +336,7 @@ def all_reduce( Returns: torch.Tensor or number - .. versionchanged:: 0.5.0 + .. versionchanged:: 0.4.11 added ``group`` """ if _need_to_sync and isinstance(_model, _SerialModel): @@ -362,7 +362,7 @@ def all_gather( torch.Tensor of shape ``(world_size, )`` if input is a number or List of strings if input is a string - .. versionchanged:: 0.5.0 + .. versionchanged:: 0.4.11 added ``group`` """ if _need_to_sync and isinstance(_model, _SerialModel): @@ -461,9 +461,7 @@ def new_group(ranks: List[int], **kwargs: Any) -> Any: ranks = [0, 1] group = idist.new_group(ranks) - .. 
versionadded:: 0.5.0 - ``backend`` now accepts `horovod` distributed framework. - + .. versionadded:: 0.4.11 """ if _need_to_sync and isinstance(_model, _SerialModel): diff --git a/ignite/engine/__init__.py b/ignite/engine/__init__.py index cb367fd3a4ea..299afadba9a2 100644 --- a/ignite/engine/__init__.py +++ b/ignite/engine/__init__.py @@ -524,7 +524,7 @@ def output_transform_fn(x, y, y_pred, loss): .. versionchanged:: 0.4.7 Added Gradient Accumulation argument for all supervised training methods. .. versionchanged:: 0.4.11 - Added `model_transform` to transform model's output + Added ``model_transform`` to transform model's output """ device_type = device.type if isinstance(device, torch.device) else device @@ -593,6 +593,7 @@ def supervised_evaluation_step( device: Optional[Union[str, torch.device]] = None, non_blocking: bool = False, prepare_batch: Callable = _prepare_batch, + model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any], Any] = lambda x, y, y_pred: (y_pred, y), ) -> Callable: """ @@ -606,6 +607,8 @@ def supervised_evaluation_step( with respect to the host. For other cases, this argument has no effect. prepare_batch: function that receives `batch`, `device`, `non_blocking` and outputs tuple of tensors `(batch_x, batch_y)`. + model_transform: function that receives the output from the model and convert it into the predictions: + ``y_pred = model_transform(model(x))``. output_transform: function that receives 'x', 'y', 'y_pred' and returns value to be assigned to engine's state.output after each iteration. Default is returning `(y_pred, y,)` which fits output expected by metrics. If you change it you should use `output_transform` in metrics. @@ -624,13 +627,16 @@ def supervised_evaluation_step( The `model` should be moved by the user before creating an optimizer. .. versionadded:: 0.4.5 + .. 
versionchanged:: 0.4.12 + Added ``model_transform`` to transform model's output """ def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]: model.eval() with torch.no_grad(): x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) - y_pred = model(x) + output = model(x) + y_pred = model_transform(output) return output_transform(x, y, y_pred) return evaluate_step @@ -641,6 +647,7 @@ def supervised_evaluation_step_amp( device: Optional[Union[str, torch.device]] = None, non_blocking: bool = False, prepare_batch: Callable = _prepare_batch, + model_transform: Callable[[Any], Any] = lambda output: output, output_transform: Callable[[Any, Any, Any], Any] = lambda x, y, y_pred: (y_pred, y), ) -> Callable: """ @@ -654,6 +661,8 @@ def supervised_evaluation_step_amp( with respect to the host. For other cases, this argument has no effect. prepare_batch: function that receives `batch`, `device`, `non_blocking` and outputs tuple of tensors `(batch_x, batch_y)`. + model_transform: function that receives the output from the model and convert it into the predictions: + ``y_pred = model_transform(model(x))``. output_transform: function that receives 'x', 'y', 'y_pred' and returns value to be assigned to engine's state.output after each iteration. Default is returning `(y_pred, y,)` which fits output expected by metrics. If you change it you should use `output_transform` in metrics. @@ -672,6 +681,8 @@ def supervised_evaluation_step_amp( The `model` should be moved by the user before creating an optimizer. .. versionadded:: 0.4.5 + .. 
versionchanged:: 0.4.12 + Added ``model_transform`` to transform model's output """ try: from torch.cuda.amp import autocast @@ -683,7 +694,8 @@ def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, T with torch.no_grad(): x, y = prepare_batch(batch, device=device, non_blocking=non_blocking) with autocast(enabled=True): - y_pred = model(x) + output = model(x) + y_pred = model_transform(output) return output_transform(x, y, y_pred) return evaluate_step @@ -711,6 +723,8 @@ def create_supervised_evaluator( with respect to the host. For other cases, this argument has no effect. prepare_batch: function that receives `batch`, `device`, `non_blocking` and outputs tuple of tensors `(batch_x, batch_y)`. + model_transform: function that receives the output from the model and convert it into the predictions: + ``y_pred = model_transform(model(x))``. output_transform: function that receives 'x', 'y', 'y_pred' and returns value to be assigned to engine's state.output after each iteration. Default is returning `(y_pred, y,)` which fits output expected by metrics. If you change it you should use `output_transform` in metrics. @@ -737,7 +751,9 @@ def create_supervised_evaluator( - `PyTorch's Explanation `_ .. versionchanged:: 0.4.5 - - Added ``amp_mode`` argument for automatic mixed precision. + Added ``amp_mode`` argument for automatic mixed precision. + .. 
versionchanged:: 0.4.12 + Added ``model_transform`` to transform model's output """ device_type = device.type if isinstance(device, torch.device) else device on_tpu = "xla" in device_type if device_type is not None else False @@ -745,9 +761,23 @@ def create_supervised_evaluator( metrics = metrics or {} if mode == "amp": - evaluate_step = supervised_evaluation_step_amp(model, device, non_blocking, prepare_batch, output_transform) + evaluate_step = supervised_evaluation_step_amp( + model, + device, + non_blocking=non_blocking, + prepare_batch=prepare_batch, + model_transform=model_transform, + output_transform=output_transform, + ) else: - evaluate_step = supervised_evaluation_step(model, device, non_blocking, prepare_batch, output_transform) + evaluate_step = supervised_evaluation_step( + model, + device, + non_blocking=non_blocking, + prepare_batch=prepare_batch, + model_transform=model_transform, + output_transform=output_transform, + ) evaluator = Engine(evaluate_step) diff --git a/ignite/engine/engine.py b/ignite/engine/engine.py index 5e7e8c798217..89d17b863ee0 100644 --- a/ignite/engine/engine.py +++ b/ignite/engine/engine.py @@ -1,6 +1,5 @@ import functools import logging -import math import time import warnings import weakref @@ -643,7 +642,7 @@ def state_dict_user_keys(self) -> List: return self._state_dict_user_keys def state_dict(self) -> OrderedDict: - """Returns a dictionary containing engine's state: "epoch_length", "max_epochs" and "iteration" and + """Returns a dictionary containing engine's state: "seed", "epoch_length", "max_epochs" and "iteration" and other state values defined by `engine.state_dict_user_keys` .. code-block:: python @@ -676,11 +675,11 @@ def save_engine(_): def load_state_dict(self, state_dict: Mapping) -> None: """Setups engine from `state_dict`. - State dictionary should contain keys: `iteration` or `epoch`, `max_epochs` and `epoch_length`. 
- If `engine.state_dict_user_keys` contains keys, they should be also present in the state dictionary. + State dictionary should contain keys: `iteration` or `epoch` and `max_epochs`, `epoch_length` and + `seed`. If `engine.state_dict_user_keys` contains keys, they should be also present in the state dictionary. Iteration and epoch values are 0-based: the first iteration or epoch is zero. - This method does not remove any custom attributes added by user. + This method does not remove any custom attributes added by user. Args: state_dict: a dict with parameters @@ -725,14 +724,13 @@ def load_state_dict(self, state_dict: Mapping) -> None: @staticmethod def _is_done(state: State) -> bool: - is_done_iters = state.max_iters is not None and state.iteration >= state.max_iters is_done_count = ( state.epoch_length is not None and state.max_epochs is not None and state.iteration >= state.epoch_length * state.max_epochs ) is_done_epochs = state.max_epochs is not None and state.epoch >= state.max_epochs - return is_done_iters or is_done_count or is_done_epochs + return is_done_count or is_done_epochs def set_data(self, data: Union[Iterable, DataLoader]) -> None: """Method to set data. After calling the method the next batch passed to `processing_function` is @@ -774,14 +772,14 @@ def run( self, data: Optional[Iterable] = None, max_epochs: Optional[int] = None, - max_iters: Optional[int] = None, epoch_length: Optional[int] = None, + seed: Optional[int] = None, ) -> State: """Runs the ``process_function`` over the passed data. Engine has a state and the following logic is applied in this function: - - At the first call, new state is defined by `max_epochs`, `max_iters`, `epoch_length`, if provided. + - At the first call, new state is defined by `max_epochs`, `epoch_length`, `seed`, if provided. A timer for total and per-epoch time is initialized when Events.STARTED is handled. 
- If state is already defined such that there are iterations to run until `max_epochs` and no input arguments provided, state is kept and used in the function. @@ -799,8 +797,7 @@ def run( `len(data)`. If `data` is an iterator and `epoch_length` is not set, then it will be automatically determined as the iteration on which data iterator raises `StopIteration`. This argument should not change if run is resuming from a state. - max_iters: Number of iterations to run for. - `max_iters` and `max_epochs` are mutually exclusive; only one of the two arguments should be provided. + seed: Deprecated argument. Please, use `torch.manual_seed` or :meth:`~ignite.utils.manual_seed`. Returns: State: output state. @@ -829,6 +826,12 @@ def switch_batch(engine): trainer.run(train_loader, max_epochs=2) """ + if seed is not None: + warnings.warn( + "Argument seed is deprecated. It will be removed in 0.5.0. " + "Please, use torch.manual_seed or ignite.utils.manual_seed" + ) + if data is not None and not isinstance(data, Iterable): raise TypeError("Argument data should be iterable") @@ -852,6 +855,8 @@ def switch_batch(engine): if self.state.max_epochs is None or (self._is_done(self.state) and self._internal_run_generator is None): # Create new state + if max_epochs is None: + max_epochs = 1 if epoch_length is None: if data is None: raise ValueError("epoch_length should be provided if data is None") @@ -860,22 +865,9 @@ def switch_batch(engine): if epoch_length is not None and epoch_length < 1: raise ValueError("Input data has zero size. Please provide non-empty data") - if max_iters is None: - if max_epochs is None: - max_epochs = 1 - else: - if max_epochs is not None: - raise ValueError( - "Arguments max_iters and max_epochs are mutually exclusive." - "Please provide only max_epochs or max_iters." 
- ) - if epoch_length is not None: - max_epochs = math.ceil(max_iters / epoch_length) - self.state.iteration = 0 self.state.epoch = 0 self.state.max_epochs = max_epochs - self.state.max_iters = max_iters self.state.epoch_length = epoch_length # Reset generator if previously used self._internal_run_generator = None @@ -1048,19 +1040,12 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]: if self.state.epoch_length is None: # Define epoch length and stop the epoch self.state.epoch_length = iter_counter - if self.state.max_iters is not None: - self.state.max_epochs = math.ceil(self.state.max_iters / self.state.epoch_length) break # Should exit while loop if we can not iterate if should_exit: - if not self._is_done(self.state): - total_iters = ( - self.state.epoch_length * self.state.max_epochs - if self.state.max_epochs is not None - else self.state.max_iters - ) - + if not self._is_done(self.state) and self.state.max_epochs is not None: + total_iters = self.state.epoch_length * self.state.max_epochs warnings.warn( "Data iterator can not provide data anymore but required total number of " "iterations to run is not reached. 
" @@ -1087,10 +1072,6 @@ def _run_once_on_dataset_as_gen(self) -> Generator[State, None, float]: if self.state.epoch_length is not None and iter_counter == self.state.epoch_length: break - if self.state.max_iters is not None and self.state.iteration == self.state.max_iters: - self.should_terminate = True - raise _EngineTerminateException() - except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) self.should_terminate_single_epoch = False @@ -1206,19 +1187,12 @@ def _run_once_on_dataset_legacy(self) -> float: if self.state.epoch_length is None: # Define epoch length and stop the epoch self.state.epoch_length = iter_counter - if self.state.max_iters is not None: - self.state.max_epochs = math.ceil(self.state.max_iters / self.state.epoch_length) break # Should exit while loop if we can not iterate if should_exit: - if not self._is_done(self.state): - total_iters = ( - self.state.epoch_length * self.state.max_epochs - if self.state.max_epochs is not None - else self.state.max_iters - ) - + if not self._is_done(self.state) and self.state.max_epochs is not None: + total_iters = self.state.epoch_length * self.state.max_epochs warnings.warn( "Data iterator can not provide data anymore but required total number of " "iterations to run is not reached. 
" @@ -1245,10 +1219,6 @@ def _run_once_on_dataset_legacy(self) -> float: if self.state.epoch_length is not None and iter_counter == self.state.epoch_length: break - if self.state.max_iters is not None and self.state.iteration == self.state.max_iters: - self.should_terminate = True - raise _EngineTerminateException() - except _EngineTerminateSingleEpochException: self._fire_event(Events.TERMINATE_SINGLE_EPOCH, iter_counter=iter_counter) self.should_terminate_single_epoch = False diff --git a/ignite/engine/events.py b/ignite/engine/events.py index a80277c525d3..217a0ddc0392 100644 --- a/ignite/engine/events.py +++ b/ignite/engine/events.py @@ -203,6 +203,17 @@ def __or__(self, other: Any) -> "EventsList": return EventsList() | self | other +class CallableEvents(CallableEventWithFilter): + # For backward compatibility + def __init__(self, *args: Any, **kwargs: Any) -> None: + super(CallableEvents, self).__init__(*args, **kwargs) + warnings.warn( + "Class ignite.engine.events.CallableEvents is deprecated. It will be removed in 0.5.0. " + "Please, use ignite.engine.EventEnum instead", + DeprecationWarning, + ) + + class EventEnum(CallableEventWithFilter, Enum): """Base class for all :class:`~ignite.engine.events.Events`. User defined custom events should also inherit this class. @@ -237,7 +248,10 @@ def function_before_backprop(engine): # ... 
""" - pass + def __new__(cls, value: str) -> "EventEnum": + obj = CallableEventWithFilter.__new__(cls) + obj._value_ = value + return obj class Events(EventEnum): @@ -440,7 +454,6 @@ class State: state.dataloader # data passed to engine state.epoch_length # optional length of an epoch state.max_epochs # number of epochs to run - state.max_iters # number of iterations to run state.batch # batch passed to `process_function` state.output # output of `process_function` after a single iteration state.metrics # dictionary with defined metrics if any @@ -467,7 +480,6 @@ def __init__(self, **kwargs: Any) -> None: self.epoch = 0 self.epoch_length: Optional[int] = None self.max_epochs: Optional[int] = None - self.max_iters: Optional[int] = None self.output: Optional[int] = None self.batch: Optional[int] = None self.metrics: Dict[str, Any] = {} diff --git a/ignite/handlers/checkpoint.py b/ignite/handlers/checkpoint.py index f508f0170220..16989b032c10 100644 --- a/ignite/handlers/checkpoint.py +++ b/ignite/handlers/checkpoint.py @@ -102,6 +102,7 @@ class Checkpoint(Serializable): Input of the function is ``(engine, event_name)``. Output of function should be an integer. Default is None, global_step based on attached engine. If provided, uses function output as global_step. To setup global step from another engine, please use :meth:`~ignite.handlers.global_step_from_engine`. + archived: Deprecated argument as models saved by ``torch.save`` are already compressed. filename_pattern: If ``filename_pattern`` is provided, this pattern will be used to render checkpoint filenames. If the pattern is not defined, the default pattern would be used. See Note for details. 
@@ -288,6 +289,7 @@ def __init__( score_name: Optional[str] = None, n_saved: Union[int, None] = 1, global_step_transform: Optional[Callable] = None, + archived: bool = False, filename_pattern: Optional[str] = None, include_self: bool = False, greater_or_equal: bool = False, @@ -320,6 +322,8 @@ def __init__( if global_step_transform is not None and not callable(global_step_transform): raise TypeError(f"global_step_transform should be a function, got {type(global_step_transform)} instead.") + if archived: + warnings.warn("Argument archived is deprecated and will be removed in 0.5.0") self.to_save = to_save self.filename_prefix = filename_prefix @@ -878,6 +882,11 @@ class ModelCheckpoint(Checkpoint): Behaviour of this class has been changed since v0.3.0. + Argument ``save_as_state_dict`` is deprecated and should not be used. It is considered as True. + + Argument ``save_interval`` is deprecated and should not be used. Please, use events filtering instead, e.g. + ``Events.ITERATION_STARTED(every=1000)``. + There is no more internal counter that has been used to indicate the number of save actions. User could see its value `step_number` in the filename, e.g. `{filename_prefix}_{name}_{step_number}.pt`. Actually, `step_number` is replaced by current engine's epoch if `score_function` is specified and current iteration @@ -906,6 +915,7 @@ class ModelCheckpoint(Checkpoint): Input of the function is `(engine, event_name)`. Output of function should be an integer. Default is None, global_step based on attached engine. If provided, uses function output as global_step. To setup global step from another engine, please use :meth:`~ignite.handlers.global_step_from_engine`. + archived: Deprecated argument as models saved by `torch.save` are already compressed. filename_pattern: If ``filename_pattern`` is provided, this pattern will be used to render checkpoint filenames. If the pattern is not defined, the default pattern would be used. 
See :class:`~ignite.handlers.checkpoint.Checkpoint` for details. @@ -952,19 +962,38 @@ def __init__( self, dirname: Union[str, Path], filename_prefix: str = "", + save_interval: Optional[int] = None, score_function: Optional[Callable] = None, score_name: Optional[str] = None, n_saved: Union[int, None] = 1, atomic: bool = True, require_empty: bool = True, create_dir: bool = True, + save_as_state_dict: bool = True, global_step_transform: Optional[Callable] = None, + archived: bool = False, filename_pattern: Optional[str] = None, include_self: bool = False, greater_or_equal: bool = False, save_on_rank: int = 0, **kwargs: Any, ): + if not save_as_state_dict: + raise ValueError( + "Argument save_as_state_dict is deprecated and should be True." + "This argument will be removed in 0.5.0." + ) + if save_interval is not None: + msg = ( + "Argument save_interval is deprecated and should be None. This argument will be removed in 0.5.0." + "Please, use events filtering instead, e.g. Events.ITERATION_STARTED(every=1000)" + ) + if save_interval == 1: + # Do not break for old version who used `save_interval=1` + warnings.warn(msg) + else: + # No choice + raise ValueError(msg) disk_saver = DiskSaver( dirname, @@ -984,6 +1013,7 @@ def __init__( n_saved=n_saved, global_step_transform=global_step_transform, filename_pattern=filename_pattern, + archived=archived, include_self=include_self, greater_or_equal=greater_or_equal, save_on_rank=save_on_rank, diff --git a/ignite/handlers/lr_finder.py b/ignite/handlers/lr_finder.py index 69c176e93da3..98bfeff0afb1 100644 --- a/ignite/handlers/lr_finder.py +++ b/ignite/handlers/lr_finder.py @@ -106,7 +106,6 @@ def _run( max_iter = trainer.state.epoch_length * trainer.state.max_epochs # type: ignore[operator] if max_iter < num_iter: max_iter = num_iter - trainer.state.max_iters = num_iter trainer.state.max_epochs = ceil(num_iter / trainer.state.epoch_length) # type: ignore[operator] if not trainer.has_event_handler(self._reached_num_iterations): 
diff --git a/ignite/handlers/param_scheduler.py b/ignite/handlers/param_scheduler.py index c4a989704e05..9ece11f106d5 100644 --- a/ignite/handlers/param_scheduler.py +++ b/ignite/handlers/param_scheduler.py @@ -1436,7 +1436,7 @@ def get_param(self) -> List[Union[float, List[float]]]: """ Method to get current `schedulers`' parameter values - .. versionadded:: 0.5.0 + .. versionadded:: 0.4.11 """ return [scheduler.get_param() for scheduler in self.schedulers] diff --git a/ignite/handlers/timing.py b/ignite/handlers/timing.py index 16c65dc05705..0e2e45c7257c 100644 --- a/ignite/handlers/timing.py +++ b/ignite/handlers/timing.py @@ -74,13 +74,16 @@ class Timer: from ignite.engine import Engine, Events from ignite.handlers import Timer + trainer = Engine(training_update_function) timer = Timer(average=True) - timer.attach(trainer, - start=Events.STARTED, - resume=Events.ITERATION_STARTED, - pause=Events.ITERATION_COMPLETED, - step=Events.ITERATION_COMPLETED) + timer.attach( + trainer, + start=Events.STARTED, + resume=Events.ITERATION_STARTED, + pause=Events.ITERATION_COMPLETED, + step=Events.ITERATION_COMPLETED + ) """ def __init__(self, average: bool = False): diff --git a/ignite/metrics/ssim.py b/ignite/metrics/ssim.py index 103dcb4e58f1..805024c0fd23 100644 --- a/ignite/metrics/ssim.py +++ b/ignite/metrics/ssim.py @@ -159,10 +159,10 @@ def update(self, output: Sequence[torch.Tensor]) -> None: y_pred = F.pad(y_pred, [self.pad_w, self.pad_w, self.pad_h, self.pad_h], mode="reflect") y = F.pad(y, [self.pad_w, self.pad_w, self.pad_h, self.pad_h], mode="reflect") - input_list = torch.cat([y_pred, y, y_pred * y_pred, y * y, y_pred * y]) - outputs = F.conv2d(input_list, self._kernel, groups=channel) - - output_list = [outputs[x * y_pred.size(0) : (x + 1) * y_pred.size(0)] for x in range(len(outputs))] + input_list = [y_pred, y, y_pred * y_pred, y * y, y_pred * y] + outputs = F.conv2d(torch.cat(input_list), self._kernel, groups=channel) + batch_size = y_pred.size(0) + 
output_list = [outputs[x * batch_size : (x + 1) * batch_size] for x in range(len(input_list))] mu_pred_sq = output_list[0].pow(2) mu_target_sq = output_list[1].pow(2) diff --git a/mypy.ini b/mypy.ini index 489b3a3fd28c..bf91c5787738 100644 --- a/mypy.ini +++ b/mypy.ini @@ -77,3 +77,6 @@ ignore_missing_imports = True [mypy-torchvision.*] ignore_missing_imports = True + +[mypy-ignite.contrib.handlers.custom_events] +ignore_errors = True diff --git a/requirements-dev.txt b/requirements-dev.txt index 6c57963f41a2..182a4057bc17 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,10 +1,9 @@ # Tests numpy pytest -codecov pytest-cov pytest-xdist -dill==0.3.4 +dill setuptools # Test contrib dependencies scipy @@ -21,7 +20,7 @@ polyaxon polyaxon-client wandb mlflow -neptune-client==0.16.9 +neptune-client>=0.16.17 tensorboard torchvision pynvml diff --git a/tests/ignite/contrib/engines/test_common.py b/tests/ignite/contrib/engines/test_common.py index c008d7b28ad7..d25cb33caceb 100644 --- a/tests/ignite/contrib/engines/test_common.py +++ b/tests/ignite/contrib/engines/test_common.py @@ -48,7 +48,6 @@ def _test_setup_common_training_handlers( save_handler=None, output_transform=lambda loss: loss, ): - lr = 0.01 step_size = 100 gamma = 0.5 @@ -165,6 +164,9 @@ def test_asserts_setup_common_training_handlers(): ) trainer.run([1]) + with pytest.warns(UserWarning, match=r"Argument device is unused and deprecated"): + setup_common_training_handlers(trainer, device="cpu") + def test_no_warning_with_train_sampler(recwarn): from torch.utils.data import RandomSampler @@ -218,7 +220,6 @@ def test_setup_common_training_handlers(dirname, capsys): def test_setup_common_training_handlers_using_save_handler(dirname, capsys): - save_handler = DiskSaver(dirname=dirname, require_empty=False) _test_setup_common_training_handlers(dirname=None, device="cpu", save_handler=save_handler) @@ -231,20 +232,25 @@ def test_setup_common_training_handlers_using_save_handler(dirname, capsys): 
def test_save_best_model_by_val_score(dirname): + acc_scores = [0.1, 0.2, 0.3, 0.4, 0.3, 0.5, 0.6, 0.61, 0.7, 0.5] - trainer = Engine(lambda e, b: None) - evaluator = Engine(lambda e, b: None) - model = DummyModel() + def setup_trainer(): + trainer = Engine(lambda e, b: None) + evaluator = Engine(lambda e, b: None) + model = DummyModel() - acc_scores = [0.1, 0.2, 0.3, 0.4, 0.3, 0.5, 0.6, 0.61, 0.7, 0.5] + @trainer.on(Events.EPOCH_COMPLETED) + def validate(engine): + evaluator.run([0, 1]) - @trainer.on(Events.EPOCH_COMPLETED) - def validate(engine): - evaluator.run([0, 1]) + @evaluator.on(Events.EPOCH_COMPLETED) + def set_eval_metric(engine): + acc = acc_scores[trainer.state.epoch - 1] + engine.state.metrics = {"acc": acc, "loss": 1 - acc} - @evaluator.on(Events.EPOCH_COMPLETED) - def set_eval_metric(engine): - engine.state.metrics = {"acc": acc_scores[trainer.state.epoch - 1]} + return trainer, evaluator, model + + trainer, evaluator, model = setup_trainer() save_best_model_by_val_score(dirname, evaluator, model, metric_name="acc", n_saved=2, trainer=trainer) @@ -252,22 +258,42 @@ def set_eval_metric(engine): assert set(os.listdir(dirname)) == {"best_model_8_val_acc=0.6100.pt", "best_model_9_val_acc=0.7000.pt"} + for fname in os.listdir(dirname): + os.unlink(f"{dirname}/{fname}") -def test_gen_save_best_models_by_val_score(): + trainer, evaluator, model = setup_trainer() - trainer = Engine(lambda e, b: None) - evaluator = Engine(lambda e, b: None) - model = DummyModel() + save_best_model_by_val_score( + dirname, evaluator, model, metric_name="loss", n_saved=2, trainer=trainer, score_sign=-1.0 + ) + + trainer.run([0, 1], max_epochs=len(acc_scores)) + + assert set(os.listdir(dirname)) == {"best_model_8_val_loss=-0.3900.pt", "best_model_9_val_loss=-0.3000.pt"} + +def test_gen_save_best_models_by_val_score(): acc_scores = [0.1, 0.2, 0.3, 0.4, 0.3, 0.5, 0.6, 0.61, 0.7, 0.5] + loss_scores = [0.9, 0.8, 0.7, 0.6, 0.7, 0.5, 0.4, 0.39, 0.3, 0.5] + + def setup_trainer(): + 
trainer = Engine(lambda e, b: None) + evaluator = Engine(lambda e, b: None) + model = DummyModel() + + @trainer.on(Events.EPOCH_COMPLETED) + def validate(engine): + evaluator.run([0, 1]) + + @evaluator.on(Events.EPOCH_COMPLETED) + def set_eval_metric(engine): + acc = acc_scores[trainer.state.epoch - 1] + loss = loss_scores[trainer.state.epoch - 1] + engine.state.metrics = {"acc": acc, "loss": loss} - @trainer.on(Events.EPOCH_COMPLETED) - def validate(engine): - evaluator.run([0, 1]) + return trainer, evaluator, model - @evaluator.on(Events.EPOCH_COMPLETED) - def set_eval_metric(engine): - engine.state.metrics = {"acc": acc_scores[trainer.state.epoch - 1]} + trainer, evaluator, model = setup_trainer() save_handler = MagicMock() @@ -291,20 +317,56 @@ def set_eval_metric(engine): any_order=True, ) + trainer, evaluator, model = setup_trainer() -def test_add_early_stopping_by_val_score(): - trainer = Engine(lambda e, b: None) - evaluator = Engine(lambda e, b: None) + save_handler = MagicMock() + + gen_save_best_models_by_val_score( + save_handler, + evaluator, + {"a": model, "b": model}, + metric_name="loss", + n_saved=2, + trainer=trainer, + score_sign=-1.0, + ) + + trainer.run([0, 1], max_epochs=len(acc_scores)) + assert save_handler.call_count == len(acc_scores) - 2 # 2 score values (-0.7 and -0.5) are not the best + obj_to_save = {"a": model.state_dict(), "b": model.state_dict()} + save_handler.assert_has_calls( + [ + call( + obj_to_save, + f"best_checkpoint_{e}_val_loss={p:.4f}.pt", + dict([("basename", "best_checkpoint"), ("score_name", "val_loss"), ("priority", p)]), + ) + for e, p in zip([1, 2, 3, 4, 6, 7, 8, 9], [-0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.39, -0.3]) + ], + any_order=True, + ) + + +def test_add_early_stopping_by_val_score(): acc_scores = [0.1, 0.2, 0.3, 0.4, 0.3, 0.3, 0.2, 0.1, 0.1, 0.0] - @trainer.on(Events.EPOCH_COMPLETED) - def validate(engine): - evaluator.run([0, 1]) + def setup_trainer(): + trainer = Engine(lambda e, b: None) + evaluator = 
Engine(lambda e, b: None) + + @trainer.on(Events.EPOCH_COMPLETED) + def validate(engine): + evaluator.run([0, 1]) - @evaluator.on(Events.EPOCH_COMPLETED) - def set_eval_metric(engine): - engine.state.metrics = {"acc": acc_scores[trainer.state.epoch - 1]} + @evaluator.on(Events.EPOCH_COMPLETED) + def set_eval_metric(engine): + acc = acc_scores[trainer.state.epoch - 1] + engine.state.metrics = {"acc": acc, "loss": 1 - acc} + + return trainer, evaluator + + trainer, evaluator = setup_trainer() add_early_stopping_by_val_score(patience=3, evaluator=evaluator, trainer=trainer, metric_name="acc") @@ -312,15 +374,23 @@ def set_eval_metric(engine): assert state.epoch == 7 + trainer, evaluator = setup_trainer() -def test_deprecated_setup_any_logging(): + add_early_stopping_by_val_score( + patience=3, evaluator=evaluator, trainer=trainer, metric_name="loss", score_sign=-1.0 + ) + + state = trainer.run([0, 1], max_epochs=len(acc_scores)) + + assert state.epoch == 7 + +def test_deprecated_setup_any_logging(): with pytest.raises(DeprecationWarning, match=r"deprecated since version 0.4.0"): setup_any_logging(None, None, None, None, None, None) def test__setup_logging_wrong_args(): - with pytest.raises(TypeError, match=r"Argument optimizers should be either a single optimizer or"): _setup_logging(MagicMock(), MagicMock(), "abc", MagicMock(), 1) @@ -406,7 +476,6 @@ def set_eval_metric(engine): def test_setup_tb_logging(dirname): - tb_logger = _test_setup_logging( setup_logging_fn=setup_tb_logging, kwargs_dict={"output_path": dirname / "t1"}, @@ -462,7 +531,6 @@ def test_setup_visdom_logging(visdom_offline_logfile): def test_setup_plx_logging(): - os.environ["POLYAXON_NO_OP"] = "1" _test_setup_logging( @@ -506,7 +574,6 @@ def test_setup_mlflow_logging(dirname): def test_setup_wandb_logging(dirname): - from unittest.mock import patch with patch("ignite.contrib.engines.common.WandBLogger") as _: @@ -514,7 +581,6 @@ def test_setup_wandb_logging(dirname): def 
test_setup_clearml_logging(): - handlers.clearml_logger.ClearMLLogger.set_bypass_mode(True) with pytest.warns(UserWarning, match=r"running in bypass mode"): @@ -561,7 +627,7 @@ def test_setup_clearml_logging(): def test_setup_neptune_logging(dirname): npt_logger = _test_setup_logging( setup_logging_fn=setup_neptune_logging, - kwargs_dict={"offline_mode": True}, + kwargs_dict={"mode": "offline"}, output_handler_cls=handlers.neptune_logger.OutputHandler, opt_params_handler_cls=handlers.neptune_logger.OptimizerParamsHandler, with_eval=False, @@ -570,7 +636,7 @@ def test_setup_neptune_logging(dirname): npt_logger.close() npt_logger = _test_setup_logging( setup_logging_fn=setup_neptune_logging, - kwargs_dict={"offline_mode": True}, + kwargs_dict={"mode": "offline"}, output_handler_cls=handlers.neptune_logger.OutputHandler, opt_params_handler_cls=handlers.neptune_logger.OptimizerParamsHandler, with_eval=True, @@ -583,7 +649,6 @@ def test_setup_neptune_logging(dirname): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_distrib_nccl_gpu(dirname, distributed_context_single_node_nccl): - local_rank = distributed_context_single_node_nccl["local_rank"] device = idist.device() _test_setup_common_training_handlers(dirname, device, rank=local_rank, local_rank=local_rank, distributed=True) @@ -593,7 +658,6 @@ def test_distrib_nccl_gpu(dirname, distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") def test_distrib_gloo_cpu_or_gpu(dirname, distributed_context_single_node_gloo): - device = idist.device() local_rank = distributed_context_single_node_gloo["local_rank"] _test_setup_common_training_handlers(dirname, device, rank=local_rank, local_rank=local_rank, distributed=True) @@ -610,7 +674,6 @@ def test_distrib_gloo_cpu_or_gpu(dirname, 
distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gloo_cpu_or_gpu(dirname, distributed_context_multi_node_gloo): - device = idist.device() rank = distributed_context_multi_node_gloo["rank"] _test_setup_common_training_handlers(dirname, device, rank=rank) @@ -621,7 +684,6 @@ def test_multinode_distrib_gloo_cpu_or_gpu(dirname, distributed_context_multi_no @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_nccl_gpu(dirname, distributed_context_multi_node_nccl): - local_rank = distributed_context_multi_node_nccl["local_rank"] rank = distributed_context_multi_node_nccl["rank"] device = idist.device() diff --git a/tests/ignite/contrib/handlers/test_base_logger.py b/tests/ignite/contrib/handlers/test_base_logger.py index 8ec6b832f26b..15ad0a003af9 100644 --- a/tests/ignite/contrib/handlers/test_base_logger.py +++ b/tests/ignite/contrib/handlers/test_base_logger.py @@ -1,9 +1,11 @@ +import math from typing import Any, Union from unittest.mock import call, MagicMock import pytest import torch +from ignite.contrib.handlers import CustomPeriodicEvent from ignite.contrib.handlers.base_logger import ( BaseLogger, BaseOptimizerParamsHandler, @@ -259,6 +261,33 @@ def update_fn(engine, batch): mock_log_handler.assert_called_with(trainer, logger, event) assert mock_log_handler.call_count == n_calls + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + n_iterations = 10 + cpe1 = CustomPeriodicEvent(n_iterations=n_iterations) + n = len(data) * n_epochs / n_iterations + nf = math.floor(n) + ns = nf + 1 if nf < n else nf + _test(cpe1.Events.ITERATIONS_10_STARTED, 
ns, cpe1) + _test(cpe1.Events.ITERATIONS_10_COMPLETED, nf, cpe1) + + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + n_iterations = 15 + cpe2 = CustomPeriodicEvent(n_iterations=n_iterations) + n = len(data) * n_epochs / n_iterations + nf = math.floor(n) + ns = nf + 1 if nf < n else nf + _test(cpe2.Events.ITERATIONS_15_STARTED, ns, cpe2) + _test(cpe2.Events.ITERATIONS_15_COMPLETED, nf, cpe2) + + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + n_custom_epochs = 2 + cpe3 = CustomPeriodicEvent(n_epochs=n_custom_epochs) + n = n_epochs / n_custom_epochs + nf = math.floor(n) + ns = nf + 1 if nf < n else nf + _test(cpe3.Events.EPOCHS_2_STARTED, ns, cpe3) + _test(cpe3.Events.EPOCHS_2_COMPLETED, nf, cpe3) + @pytest.mark.parametrize( "event, n_calls", diff --git a/tests/ignite/contrib/handlers/test_clearml_logger.py b/tests/ignite/contrib/handlers/test_clearml_logger.py index b0e3ab65d079..2e4968f40fa5 100644 --- a/tests/ignite/contrib/handlers/test_clearml_logger.py +++ b/tests/ignite/contrib/handlers/test_clearml_logger.py @@ -1,7 +1,7 @@ import math import os from collections import defaultdict -from unittest.mock import ANY, call, MagicMock, Mock, patch +from unittest.mock import ANY, call, MagicMock, patch import clearml import pytest @@ -718,7 +718,8 @@ def update_fn(engine, batch): def dummy_handler(engine, logger, event_name): global_step = engine.state.get_event_attrib_value(event_name) - logger.clearml_logger.report_scalar(title="", series="", value="test_value", iteration=global_step) + test_value = 0.3 # example + logger.clearml_logger.report_scalar(title="", series="", value=test_value, iteration=global_step) logger.attach(trainer, log_handler=dummy_handler, event_name=Events.EPOCH_COMPLETED) @@ -745,19 +746,54 @@ def update_fn(engine, batch): def dummy_handler(engine, logger, event_name): global_step = engine.state.get_event_attrib_value(event_name) - 
logger.clearml_logger.report_scalar(title="", series="", value="test_value", iteration=global_step) + test_value = 0.3 # example + logger.clearml_logger.report_scalar(title="", series="", value=test_value, iteration=global_step) clearml_logger.attach(trainer, log_handler=dummy_handler, event_name=Events.EPOCH_COMPLETED) trainer.run(data, max_epochs=n_epochs) +def test_clearml_logger_getattr_method(dirname): + + with pytest.warns(UserWarning, match="ClearMLSaver: running in bypass mode"): + ClearMLLogger.set_bypass_mode(True) + + logger = ClearMLLogger(output_uri=dirname) + + # Create a mock clearml.Logger() object + mock_logger = MagicMock() + logger.clearml_logger = mock_logger + + # Test a method called by __getattr__ calls the corresponding method of the mock project. + logger.report_single_value("accuracy", 0.72) + mock_logger.report_single_value.assert_called_once_with("accuracy", 0.72) + + # Test a method called by __getattr__ calls the corresponding classmethod of the mock project's class. 
+ logger.current_logger() + mock_logger.current_logger.assert_called_once() + + logger.close() + + +def test_clearml_logger_get_task_bypass(dirname): + + with pytest.warns(UserWarning, match="ClearMLSaver: running in bypass mode"): + ClearMLLogger.set_bypass_mode(True) + + with ClearMLLogger(output_uri=dirname) as clearml_logger: + task = clearml_logger.get_task() + assert isinstance(task, clearml.Task) + assert task == clearml.Task.current_task() + task.close() + + def test_clearml_disk_saver_integration(): model = torch.nn.Module() to_save_serializable = {"model": model} with pytest.warns(UserWarning, match="ClearMLSaver created a temporary checkpoints directory"): mock_logger = MagicMock(spec=ClearMLLogger) - clearml.Task.current_task = Mock(return_value=object()) + clearml.Task.current_task = MagicMock(spec=clearml.Task) clearml_saver = ClearMLSaver(mock_logger) clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock() @@ -781,7 +817,7 @@ def test_clearml_disk_saver_integration_no_logger(): to_save_serializable = {"model": model} with pytest.warns(UserWarning, match="ClearMLSaver created a temporary checkpoints directory"): - clearml.Task.current_task = Mock(return_value=object()) + clearml.Task.current_task = MagicMock(spec=clearml.Task) clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock() clearml_saver = ClearMLSaver() checkpoint = Checkpoint(to_save=to_save_serializable, save_handler=clearml_saver, n_saved=1) @@ -893,7 +929,7 @@ def forward(self, x): def _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=False): if idist.get_rank() == 0: - clearml.Task.current_task = Mock(return_value=object()) + clearml.Task.current_task = MagicMock(spec=clearml.Task) clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock() torch.manual_seed(23) diff --git a/tests/ignite/contrib/handlers/test_custom_events.py b/tests/ignite/contrib/handlers/test_custom_events.py new file mode 
100644 index 000000000000..9686c707141d --- /dev/null +++ b/tests/ignite/contrib/handlers/test_custom_events.py @@ -0,0 +1,133 @@ +import math + +import pytest + +from ignite.contrib.handlers.custom_events import CustomPeriodicEvent +from ignite.engine import Engine + + +def test_bad_input(): + + with pytest.warns(DeprecationWarning, match=r"CustomPeriodicEvent is deprecated"): + with pytest.raises(TypeError, match="Argument n_iterations should be an integer"): + CustomPeriodicEvent(n_iterations="a") + with pytest.raises(ValueError, match="Argument n_iterations should be positive"): + CustomPeriodicEvent(n_iterations=0) + with pytest.raises(TypeError, match="Argument n_iterations should be an integer"): + CustomPeriodicEvent(n_iterations=10.0) + with pytest.raises(TypeError, match="Argument n_epochs should be an integer"): + CustomPeriodicEvent(n_epochs="a") + with pytest.raises(ValueError, match="Argument n_epochs should be positive"): + CustomPeriodicEvent(n_epochs=0) + with pytest.raises(TypeError, match="Argument n_epochs should be an integer"): + CustomPeriodicEvent(n_epochs=10.0) + with pytest.raises(ValueError, match="Either n_iterations or n_epochs should be defined"): + CustomPeriodicEvent() + with pytest.raises(ValueError, match="Either n_iterations or n_epochs should be defined"): + CustomPeriodicEvent(n_iterations=1, n_epochs=2) + + +def test_new_events(): + def update(*args, **kwargs): + pass + + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + engine = Engine(update) + cpe = CustomPeriodicEvent(n_iterations=5) + cpe.attach(engine) + + assert hasattr(cpe, "Events") + assert hasattr(cpe.Events, "ITERATIONS_5_STARTED") + assert hasattr(cpe.Events, "ITERATIONS_5_COMPLETED") + + assert engine._allowed_events[-2] == getattr(cpe.Events, "ITERATIONS_5_STARTED") + assert engine._allowed_events[-1] == getattr(cpe.Events, "ITERATIONS_5_COMPLETED") + + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is 
deprecated"): + cpe = CustomPeriodicEvent(n_epochs=5) + cpe.attach(engine) + + assert hasattr(cpe, "Events") + assert hasattr(cpe.Events, "EPOCHS_5_STARTED") + assert hasattr(cpe.Events, "EPOCHS_5_COMPLETED") + + assert engine._allowed_events[-2] == getattr(cpe.Events, "EPOCHS_5_STARTED") + assert engine._allowed_events[-1] == getattr(cpe.Events, "EPOCHS_5_COMPLETED") + + +def test_integration_iterations(): + def _test(n_iterations, max_epochs, n_iters_per_epoch): + def update(*args, **kwargs): + pass + + engine = Engine(update) + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + cpe = CustomPeriodicEvent(n_iterations=n_iterations) + cpe.attach(engine) + data = list(range(n_iters_per_epoch)) + + custom_period = [0] + n_calls_iter_started = [0] + n_calls_iter_completed = [0] + + event_started = getattr(cpe.Events, "ITERATIONS_{}_STARTED".format(n_iterations)) + + @engine.on(event_started) + def on_my_event_started(engine): + assert (engine.state.iteration - 1) % n_iterations == 0 + custom_period[0] += 1 + custom_iter = getattr(engine.state, "iterations_{}".format(n_iterations)) + assert custom_iter == custom_period[0] + n_calls_iter_started[0] += 1 + + event_completed = getattr(cpe.Events, "ITERATIONS_{}_COMPLETED".format(n_iterations)) + + @engine.on(event_completed) + def on_my_event_ended(engine): + assert engine.state.iteration % n_iterations == 0 + custom_iter = getattr(engine.state, "iterations_{}".format(n_iterations)) + assert custom_iter == custom_period[0] + n_calls_iter_completed[0] += 1 + + engine.run(data, max_epochs=max_epochs) + + n = len(data) * max_epochs / n_iterations + nf = math.floor(n) + assert custom_period[0] == n_calls_iter_started[0] + assert n_calls_iter_started[0] == nf + 1 if nf < n else nf + assert n_calls_iter_completed[0] == nf + + _test(3, 5, 16) + _test(4, 5, 16) + _test(5, 5, 16) + _test(300, 50, 1000) + + +def test_integration_epochs(): + def update(*args, **kwargs): + pass + + engine = 
Engine(update) + + n_epochs = 3 + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + cpe = CustomPeriodicEvent(n_epochs=n_epochs) + cpe.attach(engine) + data = list(range(16)) + + custom_period = [1] + + @engine.on(cpe.Events.EPOCHS_3_STARTED) + def on_my_epoch_started(engine): + assert (engine.state.epoch - 1) % n_epochs == 0 + assert engine.state.epochs_3 == custom_period[0] + + @engine.on(cpe.Events.EPOCHS_3_COMPLETED) + def on_my_epoch_ended(engine): + assert engine.state.epoch % n_epochs == 0 + assert engine.state.epochs_3 == custom_period[0] + custom_period[0] += 1 + + engine.run(data, max_epochs=10) + + assert custom_period[0] == 4 diff --git a/tests/ignite/contrib/handlers/test_neptune_logger.py b/tests/ignite/contrib/handlers/test_neptune_logger.py index bba1b60ca0d5..4a428b14eff0 100644 --- a/tests/ignite/contrib/handlers/test_neptune_logger.py +++ b/tests/ignite/contrib/handlers/test_neptune_logger.py @@ -1,11 +1,10 @@ import math import warnings -from unittest.mock import ANY, call, MagicMock +from unittest.mock import MagicMock import pytest import torch -import ignite.distributed as idist from ignite.contrib.handlers.neptune_logger import ( global_step_from_engine, GradsScalarHandler, @@ -16,7 +15,16 @@ WeightsScalarHandler, ) from ignite.engine import Engine, Events, State -from ignite.handlers.checkpoint import Checkpoint + + +def assert_logger_called_once_with(logger, key, value): + result = logger[key].fetch_values() + assert len(result.value) == 1 + + if isinstance(result.value[0], float): + assert math.isclose(result.value[0], value, abs_tol=0.01) + else: + assert result.value[0] == value def test_optimizer_params_handler_wrong_setup(): @@ -35,21 +43,28 @@ def test_optimizer_params_handler_wrong_setup(): def test_optimizer_params(): optimizer = torch.optim.SGD([torch.tensor(0.0)], lr=0.01) wrapper = OptimizerParamsHandler(optimizer=optimizer, param_name="lr") - mock_logger = MagicMock(spec=NeptuneLogger) - 
mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) + mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.iteration = 123 - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) - mock_logger.log_metric.assert_called_once_with("lr/group_0", y=0.01, x=123) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) + assert_logger_called_once_with(logger, "lr/group_0", 0.01) + logger.stop() wrapper = OptimizerParamsHandler(optimizer, param_name="lr", tag="generator") - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) - mock_logger.log_metric.assert_called_once_with("generator/lr/group_0", y=0.01, x=123) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) + assert_logger_called_once_with(logger, "generator/lr/group_0", 0.01) + logger.stop() def test_output_handler_with_wrong_logger_type(): @@ -63,120 +78,128 @@ def test_output_handler_with_wrong_logger_type(): def test_output_handler_output_transform(): wrapper = OutputHandler("tag", output_transform=lambda x: x) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.output = 12345 mock_engine.state.iteration = 123 - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) - mock_logger.log_metric.assert_called_once_with("tag/output", y=12345, x=123) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) + assert_logger_called_once_with(logger, "tag/output", 12345) + logger.stop() wrapper = OutputHandler("another_tag", output_transform=lambda x: {"loss": x}) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + 
project="tests/dry-run", + mode="debug", + ) - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) - mock_logger.log_metric.assert_called_once_with("another_tag/loss", y=12345, x=123) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) + assert_logger_called_once_with(logger, "another_tag/loss", 12345) + logger.stop() def test_output_handler_metric_names(): wrapper = OutputHandler("tag", metric_names=["a", "b"]) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State(metrics={"a": 12.23, "b": 23.45}) mock_engine.state.iteration = 5 - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) - assert mock_logger.log_metric.call_count == 2 - mock_logger.log_metric.assert_has_calls([call("tag/a", y=12.23, x=5), call("tag/b", y=23.45, x=5)], any_order=True) + assert_logger_called_once_with(logger, "tag/a", 12.23) + assert_logger_called_once_with(logger, "tag/b", 23.45) + logger.stop() wrapper = OutputHandler("tag", metric_names=["a"]) mock_engine = MagicMock() - mock_logger.log_metric = MagicMock() mock_engine.state = State(metrics={"a": torch.tensor([0.0, 1.0, 2.0, 3.0])}) mock_engine.state.iteration = 5 - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) - - assert mock_logger.log_metric.call_count == 4 - mock_logger.log_metric.assert_has_calls( - [ - call("tag/a/0", y=0.0, x=5), - call("tag/a/1", y=1.0, x=5), - call("tag/a/2", y=2.0, x=5), - call("tag/a/3", y=3.0, x=5), - ], - any_order=True, + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", ) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) + + for key, val in [("tag/a/0", 0.0), ("tag/a/1", 1.0), ("tag/a/2", 2.0), ("tag/a/3", 3.0)]: + assert_logger_called_once_with(logger, 
key, val) + logger.stop() wrapper = OutputHandler("tag", metric_names=["a", "c"]) mock_engine = MagicMock() - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine.state = State(metrics={"a": 55.56, "c": "Some text"}) mock_engine.state.iteration = 7 - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() - with pytest.warns(UserWarning): - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) - assert mock_logger.log_metric.call_count == 1 - mock_logger.log_metric.assert_has_calls([call("tag/a", y=55.56, x=7)], any_order=True) + assert_logger_called_once_with(logger, "tag/a", 55.56) + logger.stop() # all metrics wrapper = OutputHandler("tag", metric_names="all") - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State(metrics={"a": 12.23, "b": 23.45}) mock_engine.state.iteration = 5 - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) - assert mock_logger.log_metric.call_count == 2 - mock_logger.log_metric.assert_has_calls([call("tag/a", y=12.23, x=5), call("tag/b", y=23.45, x=5)], any_order=True) + assert_logger_called_once_with(logger, "tag/a", 12.23) + assert_logger_called_once_with(logger, "tag/b", 23.45) + logger.stop() # log a torch tensor (ndimension = 0) wrapper = OutputHandler("tag", metric_names="all") - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State(metrics={"a": torch.tensor(12.23), "b": torch.tensor(23.45)}) mock_engine.state.iteration = 5 - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) + wrapper(mock_engine, 
logger, Events.ITERATION_STARTED) - assert mock_logger.log_metric.call_count == 2 - mock_logger.log_metric.assert_has_calls( - [call("tag/a", y=torch.tensor(12.23).item(), x=5), call("tag/b", y=torch.tensor(23.45).item(), x=5)], - any_order=True, - ) + assert_logger_called_once_with(logger, "tag/a", 12.23) + assert_logger_called_once_with(logger, "tag/b", 23.45) + logger.stop() def test_output_handler_both(): wrapper = OutputHandler("tag", metric_names=["a", "b"], output_transform=lambda x: {"loss": x}) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State(metrics={"a": 12.23, "b": 23.45}) mock_engine.state.epoch = 5 mock_engine.state.output = 12345 - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) - assert mock_logger.log_metric.call_count == 3 - mock_logger.log_metric.assert_has_calls( - [call("tag/a", y=12.23, x=5), call("tag/b", y=23.45, x=5), call("tag/loss", y=12345, x=5)], any_order=True - ) + assert_logger_called_once_with(logger, "tag/a", 12.23) + assert_logger_called_once_with(logger, "tag/b", 23.45) + assert_logger_called_once_with(logger, "tag/loss", 12345) + logger.stop() def test_output_handler_with_wrong_global_step_transform_output(): @@ -184,14 +207,19 @@ def global_step_transform(*args, **kwargs): return "a" wrapper = OutputHandler("tag", output_transform=lambda x: {"loss": x}, global_step_transform=global_step_transform) - mock_logger = MagicMock(spec=NeptuneLogger) + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.epoch = 5 mock_engine.state.output = 12345 with pytest.raises(TypeError, match="global_step must be int"): - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) + + 
logger.stop() def test_output_handler_with_global_step_from_engine(): @@ -206,27 +234,28 @@ def test_output_handler_with_global_step_from_engine(): global_step_transform=global_step_from_engine(mock_another_engine), ) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.epoch = 1 mock_engine.state.output = 0.123 - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) - assert mock_logger.log_metric.call_count == 1 - mock_logger.log_metric.assert_has_calls( - [call("tag/loss", y=mock_engine.state.output, x=mock_another_engine.state.epoch)] - ) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) + assert_logger_called_once_with(logger, "tag/loss", mock_engine.state.output) mock_another_engine.state.epoch = 11 mock_engine.state.output = 1.123 - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) - assert mock_logger.log_metric.call_count == 2 - mock_logger.log_metric.assert_has_calls( - [call("tag/loss", y=mock_engine.state.output, x=mock_another_engine.state.epoch)] - ) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) + + result = logger["tag/loss"].fetch_values() + assert len(result.value) == 2 + assert result.value[1] == mock_engine.state.output + + logger.stop() def test_output_handler_with_global_step_transform(): @@ -234,22 +263,27 @@ def global_step_transform(*args, **kwargs): return 10 wrapper = OutputHandler("tag", output_transform=lambda x: {"loss": x}, global_step_transform=global_step_transform) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.epoch = 5 mock_engine.state.output = 12345 - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) - assert mock_logger.log_metric.call_count == 1 - 
mock_logger.log_metric.assert_has_calls([call("tag/loss", y=12345, x=10)]) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) + assert_logger_called_once_with(logger, "tag/loss", 12345) + + logger.stop() def test_output_handler_state_attrs(): wrapper = OutputHandler("tag", state_attributes=["alpha", "beta", "gamma"]) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() @@ -258,18 +292,14 @@ def test_output_handler_state_attrs(): mock_engine.state.beta = torch.tensor(12.23) mock_engine.state.gamma = torch.tensor([21.0, 6.0]) - wrapper(mock_engine, mock_logger, Events.ITERATION_STARTED) - - assert mock_logger.log_metric.call_count == 4 - mock_logger.log_metric.assert_has_calls( - [ - call("tag/alpha", y=3.899, x=5), - call("tag/beta", y=torch.tensor(12.23).item(), x=5), - call("tag/gamma/0", y=21.0, x=5), - call("tag/gamma/1", y=6.0, x=5), - ], - any_order=True, - ) + wrapper(mock_engine, logger, Events.ITERATION_STARTED) + + assert_logger_called_once_with(logger, "tag/alpha", 3.899) + assert_logger_called_once_with(logger, "tag/beta", 12.23) + assert_logger_called_once_with(logger, "tag/gamma/0", 21.0) + assert_logger_called_once_with(logger, "tag/gamma/1", 6.0) + + logger.stop() def test_weights_scalar_handler_wrong_setup(): @@ -296,26 +326,24 @@ def test_weights_scalar_handler(dummy_model_factory): # define test wrapper to test with and without optional tag def _test(tag=None): wrapper = WeightsScalarHandler(model, tag=tag) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.epoch = 5 - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) tag_prefix = f"{tag}/" if tag else "" 
- assert mock_logger.log_metric.call_count == 4 - mock_logger.log_metric.assert_has_calls( - [ - call(tag_prefix + "weights_norm/fc1/weight", y=0.0, x=5), - call(tag_prefix + "weights_norm/fc1/bias", y=0.0, x=5), - call(tag_prefix + "weights_norm/fc2/weight", y=12.0, x=5), - call(tag_prefix + "weights_norm/fc2/bias", y=math.sqrt(12.0), x=5), - ], - any_order=True, - ) + assert_logger_called_once_with(logger, tag_prefix + "weights_norm/fc1/weight", 0.0) + assert_logger_called_once_with(logger, tag_prefix + "weights_norm/fc1/bias", 0.0) + assert_logger_called_once_with(logger, tag_prefix + "weights_norm/fc2/weight", 12.0) + assert_logger_called_once_with(logger, tag_prefix + "weights_norm/fc2/bias", math.sqrt(12.0)) + + logger.stop() _test() _test(tag="tag") @@ -325,26 +353,23 @@ def test_weights_scalar_handler_frozen_layers(dummy_model_factory): model = dummy_model_factory(with_grads=True, with_frozen_layer=True) wrapper = WeightsScalarHandler(model) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.epoch = 5 - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) - mock_logger.log_metric.assert_has_calls( - [call("weights_norm/fc2/weight", y=12.0, x=5), call("weights_norm/fc2/bias", y=math.sqrt(12.0), x=5)], - any_order=True, - ) + assert_logger_called_once_with(logger, "weights_norm/fc2/weight", 12.0) + assert_logger_called_once_with(logger, "weights_norm/fc2/bias", math.sqrt(12.0)) - with pytest.raises(AssertionError): - mock_logger.log_metric.assert_has_calls( - [call("weights_norm/fc1/weight", y=12.0, x=5), call("weights_norm/fc1/bias", y=math.sqrt(12.0), x=5)], - any_order=True, - ) + assert not logger.exists("weights_norm/fc1/weight") + assert not logger.exists("weights_norm/fc1/bias") - assert mock_logger.log_metric.call_count 
== 2 + logger.stop() def test_grads_scalar_handler_wrong_setup(): @@ -368,28 +393,25 @@ def test_grads_scalar_handler(dummy_model_factory, norm_mock): # define test wrapper to test with and without optional tag def _test(tag=None): wrapper = GradsScalarHandler(model, reduction=norm_mock, tag=tag) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.epoch = 5 norm_mock.reset_mock() - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) tag_prefix = f"{tag}/" if tag else "" - mock_logger.log_metric.assert_has_calls( - [ - call(tag_prefix + "grads_norm/fc1/weight", y=ANY, x=5), - call(tag_prefix + "grads_norm/fc1/bias", y=ANY, x=5), - call(tag_prefix + "grads_norm/fc2/weight", y=ANY, x=5), - call(tag_prefix + "grads_norm/fc2/bias", y=ANY, x=5), - ], - any_order=True, - ) - assert mock_logger.log_metric.call_count == 4 - assert norm_mock.call_count == 4 + assert logger.exists(tag_prefix + "grads_norm/fc1/weight") + assert logger.exists(tag_prefix + "grads_norm/fc1/bias") + assert logger.exists(tag_prefix + "grads_norm/fc2/weight") + assert logger.exists(tag_prefix + "grads_norm/fc2/bias") + + logger.stop() _test() _test(tag="tag") @@ -399,25 +421,24 @@ def test_grads_scalar_handler_frozen_layers(dummy_model_factory, norm_mock): model = dummy_model_factory(with_grads=True, with_frozen_layer=True) wrapper = GradsScalarHandler(model, reduction=norm_mock) - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_metric = MagicMock() + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) mock_engine = MagicMock() mock_engine.state = State() mock_engine.state.epoch = 5 norm_mock.reset_mock() - wrapper(mock_engine, mock_logger, Events.EPOCH_STARTED) + wrapper(mock_engine, logger, Events.EPOCH_STARTED) - 
mock_logger.log_metric.assert_has_calls( - [call("grads_norm/fc2/weight", y=ANY, x=5), call("grads_norm/fc2/bias", y=ANY, x=5)], any_order=True - ) + assert logger.exists("grads_norm/fc2/weight") + assert logger.exists("grads_norm/fc2/bias") - with pytest.raises(AssertionError): - mock_logger.log_metric.assert_has_calls( - [call("grads_norm/fc1/weight", y=ANY, x=5), call("grads_norm/fc1/bias", y=ANY, x=5)], any_order=True - ) - assert mock_logger.log_metric.call_count == 2 - assert norm_mock.call_count == 2 + assert not logger.exists("grads_norm/fc1/weight") + assert not logger.exists("grads_norm/fc1/bias") + + logger.stop() def test_integration(): @@ -432,11 +453,11 @@ def update_fn(engine, batch): trainer = Engine(update_fn) - npt_logger = NeptuneLogger(offline_mode=True) + npt_logger = NeptuneLogger(mode="offline") def dummy_handler(engine, logger, event_name): global_step = engine.state.get_event_attrib_value(event_name) - logger.log_metric("test_value", global_step, global_step) + logger["test_value"].append(global_step, step=global_step) npt_logger.attach(trainer, log_handler=dummy_handler, event_name=Events.EPOCH_COMPLETED) @@ -454,12 +475,12 @@ def test_integration_as_context_manager(): def update_fn(engine, batch): return next(losses_iter) - with NeptuneLogger(offline_mode=True) as npt_logger: + with NeptuneLogger(mode="offline") as npt_logger: trainer = Engine(update_fn) def dummy_handler(engine, logger, event_name): global_step = engine.state.get_event_attrib_value(event_name) - logger.log_metric("test_value", global_step, global_step) + logger["test_value"].append(global_step, step=global_step) npt_logger.attach(trainer, log_handler=dummy_handler, event_name=Events.EPOCH_COMPLETED) @@ -469,7 +490,7 @@ def dummy_handler(engine, logger, event_name): def test_neptune_saver_serializable(dirname): mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_artifact = MagicMock() + mock_logger.upload = MagicMock() model = torch.nn.Module() 
to_save_serializable = {"model": model} @@ -477,44 +498,16 @@ def test_neptune_saver_serializable(dirname): fname = dirname / "test.pt" saver(to_save_serializable, fname) - assert mock_logger.log_artifact.call_count == 1 - - -def _test_neptune_saver_integration(device): - - model = torch.nn.Module().to(device) - to_save_serializable = {"model": model} - - mock_logger = None - if idist.get_rank() == 0: - mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_artifact = MagicMock() - mock_logger.delete_artifacts = MagicMock() - - saver = NeptuneSaver(mock_logger) - - checkpoint = Checkpoint(to_save=to_save_serializable, save_handler=saver, n_saved=1) - - trainer = Engine(lambda e, b: None) - trainer.state = State(epoch=0, iteration=0) - checkpoint(trainer) - trainer.state.iteration = 1 - checkpoint(trainer) - if idist.get_rank() == 0: - assert mock_logger.log_artifact.call_count == 2 - assert mock_logger.delete_artifacts.call_count == 1 + assert mock_logger[dirname / "test.pt"].upload.call_count == 1 -def test_neptune_saver_integration(): - _test_neptune_saver_integration("cpu") - - -def test_neptune_saver_non_serializable(): +@pytest.mark.parametrize("model, serializable", [(lambda x: x, False), (torch.nn.Module().to("cpu"), True)]) +def test_neptune_saver(model, serializable): mock_logger = MagicMock(spec=NeptuneLogger) - mock_logger.log_artifact = MagicMock() + mock_logger.upload = MagicMock() - to_save_non_serializable = {"model": lambda x: x} + to_save_non_serializable = {"model": model} saver = NeptuneSaver(mock_logger) fname = "test.pt" @@ -527,28 +520,15 @@ def test_neptune_saver_non_serializable(): except Exception: pass - assert mock_logger.log_artifact.call_count == 0 - - -@pytest.mark.parametrize("no_site_packages", ["neptune"], indirect=True) -def test_no_neptune_client(no_site_packages): - - with pytest.raises(ModuleNotFoundError, match=r"This contrib module requires neptune-client to be installed."): - NeptuneLogger() + assert 
mock_logger["model"].upload.call_count == int(serializable) -@pytest.mark.distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): +def test_logs_version(): + from ignite import __version__ + from ignite.contrib.handlers.neptune_logger import _INTEGRATION_VERSION_KEY - device = idist.device() - _test_neptune_saver_integration(device) - - -@pytest.mark.distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - - device = idist.device() - _test_neptune_saver_integration(device) + logger = NeptuneLogger( + project="tests/dry-run", + mode="debug", + ) + assert logger[_INTEGRATION_VERSION_KEY].fetch() == __version__ diff --git a/tests/ignite/contrib/handlers/test_tqdm_logger.py b/tests/ignite/contrib/handlers/test_tqdm_logger.py index 23068e85b3b1..4530640ab19e 100644 --- a/tests/ignite/contrib/handlers/test_tqdm_logger.py +++ b/tests/ignite/contrib/handlers/test_tqdm_logger.py @@ -9,7 +9,7 @@ import torch from packaging.version import Version -from ignite.contrib.handlers import ProgressBar +from ignite.contrib.handlers import CustomPeriodicEvent, ProgressBar from ignite.engine import Engine, Events from ignite.handlers import TerminateOnNan from ignite.metrics import RunningAverage @@ -475,6 +475,17 @@ def test_pbar_wrong_events_order(): pbar.attach(engine, event_name=Events.ITERATION_STARTED, closing_event_name=Events.EPOCH_COMPLETED(every=10)) +def test_pbar_on_custom_events(capsys): + + engine = Engine(update_fn) + pbar = ProgressBar() + with pytest.warns(DeprecationWarning, match="CustomPeriodicEvent is deprecated"): + cpe = CustomPeriodicEvent(n_iterations=15) + + with pytest.raises(ValueError, match=r"not in allowed events for this engine"): + 
pbar.attach(engine, event_name=cpe.Events.ITERATIONS_15_COMPLETED, closing_event_name=Events.EPOCH_COMPLETED) + + def test_pbar_with_nan_input(): def update(engine, batch): x = batch diff --git a/tests/ignite/distributed/utils/__init__.py b/tests/ignite/distributed/utils/__init__.py index 90126223c86d..65498f0afe59 100644 --- a/tests/ignite/distributed/utils/__init__.py +++ b/tests/ignite/distributed/utils/__init__.py @@ -350,7 +350,7 @@ def _test(barrier): @idist.one_rank_only(rank=rank, with_barrier=barrier) def initialize(): - value.data = torch.tensor(100).to(device) + value.add_(torch.tensor(100).to(device)) initialize() diff --git a/tests/ignite/engine/test_create_supervised.py b/tests/ignite/engine/test_create_supervised.py index 9b7cf4067f93..fa9681df81e0 100644 --- a/tests/ignite/engine/test_create_supervised.py +++ b/tests/ignite/engine/test_create_supervised.py @@ -220,8 +220,18 @@ def _default_create_supervised_evaluator( evaluator_device: Optional[str] = None, trace: bool = False, amp_mode: str = None, + with_model_transform: bool = False, ): - model = DummyModel() + if with_model_transform: + + def get_first_element(output): + return output[0] + + model = DummyModel(output_as_list=True) + model_transform = get_first_element + else: + model = DummyModel() + model_transform = None if model_device: model.to(model_device) @@ -232,7 +242,12 @@ def _default_create_supervised_evaluator( example_input = torch.randn(1, 1) model = torch.jit.trace(model, example_input) - evaluator = create_supervised_evaluator(model, device=evaluator_device, amp_mode=amp_mode) + evaluator = create_supervised_evaluator( + model, + device=evaluator_device, + amp_mode=amp_mode, + model_transform=model_transform if model_transform is not None else lambda x: x, + ) assert model.fc.weight.data[0, 0].item() == approx(0.0) @@ -244,9 +259,14 @@ def _test_create_supervised_evaluator( evaluator_device: Optional[str] = None, trace: bool = False, amp_mode: str = None, + with_model_transform: 
bool = False, ): model, evaluator = _default_create_supervised_evaluator( - model_device=model_device, evaluator_device=evaluator_device, trace=trace, amp_mode=amp_mode + model_device=model_device, + evaluator_device=evaluator_device, + trace=trace, + amp_mode=amp_mode, + with_model_transform=with_model_transform, ) x = torch.tensor([[1.0], [2.0]]) y = torch.tensor([[3.0], [5.0]]) diff --git a/tests/ignite/engine/test_custom_events.py b/tests/ignite/engine/test_custom_events.py index 3a19904a45f9..ef8be48842d0 100644 --- a/tests/ignite/engine/test_custom_events.py +++ b/tests/ignite/engine/test_custom_events.py @@ -6,7 +6,24 @@ import ignite.distributed as idist from ignite.engine import Engine, Events -from ignite.engine.events import CallableEventWithFilter, EventEnum, EventsList +from ignite.engine.events import CallableEvents, CallableEventWithFilter, EventEnum, EventsList + + +def test_deprecated_callable_events_class(): + engine = Engine(lambda engine, batch: 0) + + with pytest.warns(DeprecationWarning, match=r"Class ignite\.engine\.events\.CallableEvents is deprecated"): + + class CustomEvents(CallableEvents, Enum): + TEST_EVENT = "test_event" + + def __new__(cls, value: str) -> "CallableEvents": + obj = CallableEvents.__new__(cls) + obj._value_ = value + return obj + + with pytest.raises(TypeError, match=r"Value at \d of event_names should be a str or EventEnum"): + engine.register_events(*CustomEvents) def test_custom_events(): diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index 994eb49f72bb..c37aa95ada6d 100644 --- a/tests/ignite/engine/test_engine.py +++ b/tests/ignite/engine/test_engine.py @@ -520,6 +520,9 @@ def test_run_asserts(self): with pytest.raises(ValueError, match=r"Input data has zero size. 
Please provide non-empty data"): engine.run([]) + with pytest.warns(UserWarning, match="Argument seed is deprecated"): + engine.run([0, 1, 2, 3, 4], seed=1234) + def test_state_get_event_attrib_value(self): state = State() state.iteration = 10 @@ -1026,47 +1029,6 @@ def switch_dataloader(): trainer.run(data1, max_epochs=10) - def test_run_with_max_iters(self): - max_iters = 8 - engine = Engine(lambda e, b: 1) - engine.run([0] * 20, max_iters=max_iters) - assert engine.state.iteration == max_iters - assert engine.state.max_iters == max_iters - - def test_run_with_max_iters_greater_than_epoch_length(self): - max_iters = 73 - engine = Engine(lambda e, b: 1) - engine.run([0] * 20, max_iters=max_iters) - assert engine.state.iteration == max_iters - - def test_run_with_invalid_max_iters_and_max_epoch(self): - max_iters = 12 - max_epochs = 2 - engine = Engine(lambda e, b: 1) - with pytest.raises( - ValueError, - match=r"Arguments max_iters and max_epochs are mutually exclusive." - "Please provide only max_epochs or max_iters.", - ): - engine.run([0] * 20, max_iters=max_iters, max_epochs=max_epochs) - - def test_epoch_events_fired_max_iters(self): - max_iters = 32 - engine = Engine(lambda e, b: 1) - - @engine.on(Events.EPOCH_COMPLETED) - def fired_event(engine): - assert engine.state.iteration % engine.state.epoch_length == 0 - - engine.run([0] * 10, max_iters=max_iters) - - def test_is_done_with_max_iters(self): - state = State(iteration=100, epoch=1, max_epochs=3, epoch_length=100, max_iters=250) - assert not Engine._is_done(state) - - state = State(iteration=250, epoch=1, max_epochs=3, epoch_length=100, max_iters=250) - assert Engine._is_done(state) - @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_batch_is_released_before_new_one_is_loaded_on_cuda(self): torch.cuda.empty_cache() diff --git a/tests/ignite/handlers/test_checkpoint.py b/tests/ignite/handlers/test_checkpoint.py index bf48e07399d2..297062d1aed3 100644 --- 
a/tests/ignite/handlers/test_checkpoint.py +++ b/tests/ignite/handlers/test_checkpoint.py @@ -63,6 +63,9 @@ def test_checkpoint_wrong_input(): with pytest.raises(TypeError, match=r"global_step_transform should be a function."): Checkpoint(to_save, lambda x: x, score_function=lambda e: 123, score_name="acc", global_step_transform=123) + with pytest.warns(UserWarning, match=r"Argument archived is deprecated"): + Checkpoint(to_save, lambda x: x, score_function=lambda e: 123, score_name="acc", archived=True) + with pytest.raises(ValueError, match=r"Cannot have key 'checkpointer' if `include_self` is True"): Checkpoint({"checkpointer": model}, lambda x: x, include_self=True) @@ -550,12 +553,21 @@ def test_model_checkpoint_args_validation(dirname): with pytest.raises(ValueError, match=r"with extension '.pt' are already present "): ModelCheckpoint(nonempty, _PREFIX) + with pytest.raises(ValueError, match=r"Argument save_interval is deprecated and should be None"): + ModelCheckpoint(existing, _PREFIX, save_interval=42) + with pytest.raises(ValueError, match=r"Directory path '\S+' is not found"): ModelCheckpoint(dirname / "non_existing_dir", _PREFIX, create_dir=False) + with pytest.raises(ValueError, match=r"Argument save_as_state_dict is deprecated and should be True"): + ModelCheckpoint(existing, _PREFIX, create_dir=False, save_as_state_dict=False) + with pytest.raises(TypeError, match=r"global_step_transform should be a function"): ModelCheckpoint(existing, _PREFIX, create_dir=False, global_step_transform=1234) + with pytest.warns(UserWarning, match=r"Argument archived is deprecated"): + ModelCheckpoint(existing, _PREFIX, create_dir=False, archived=True) + h = ModelCheckpoint(dirname, _PREFIX, create_dir=False) assert h.last_checkpoint is None with pytest.raises(RuntimeError, match=r"No objects to checkpoint found."): diff --git a/tests/ignite/handlers/test_lr_finder.py b/tests/ignite/handlers/test_lr_finder.py index c966c8c3f1dd..f2f488acfe82 100644 --- 
a/tests/ignite/handlers/test_lr_finder.py +++ b/tests/ignite/handlers/test_lr_finder.py @@ -348,7 +348,7 @@ def test_num_iter_is_not_enough(lr_finder, to_save, dummy_engine, dataloader): trainer_with_finder.run(dataloader) assert_output_sizes(lr_finder, dummy_engine) assert dummy_engine.state.iteration != len(dataloader) - assert dummy_engine.state.iteration == 150 + assert dummy_engine.state.iteration == 150 + 1 def test_detach_terminates(lr_finder, to_save, dummy_engine, dataloader):